In [6]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [37]:

# Standard library
import bz2
import os
import re

# Third-party
# NOTE: `plt` must be bound to the pyplot submodule; the bare `matplotlib`
# package does not expose plotting functions such as `plt.plot`/`plt.subplots`.
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from tqdm import tqdm
from xgboost import XGBClassifier

In [29]:
# Inspect the raw-data folder, then the nested folder that holds the archive.
for folder in ("../raw_data", "../raw_data/test.ft.txt.bz2"):
    print(os.listdir(folder))

['test.ft.txt.bz2']
['test.ft.txt.bz2', 'test.ft.txt.bz2:Zone.Identifier']


In [9]:
file_path = "../raw_data/test.ft.txt.bz2/test.ft.txt.bz2"

# Read the compressed file line by line. Each usable line is split once on the
# first space into a label field and the review text; the "__label__" prefix
# is stripped from the label field.
data = []
with bz2.open(file_path, mode="rt", encoding="utf-8") as handle:
    for raw_line in handle:
        fields = raw_line.strip().split(" ", 1)
        if len(fields) != 2:
            # No separator on this line, so there is nothing to pair up; skip it.
            continue
        tag, body = fields
        data.append((tag.replace("__label__", ""), body))

# Build the frame in one go and store the label as an integer column.
df = pd.DataFrame(data, columns=["label", "text"]).astype({"label": int})
df.head()

Unnamed: 0,label,text
0,2,Great CD: My lovely Pat has one of the GREAT v...
1,2,One of the best game music soundtracks - for a...
2,1,Batteries died within a year ...: I bought thi...
3,2,"works fine, but Maha Energy is better: Check o..."
4,2,Great for the non-audiophile: Reviewed quite a...


In [10]:
# Summarize column dtypes, non-null counts, and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   label   400000 non-null  int64 
 1   text    400000 non-null  object
dtypes: int64(1), object(1)
memory usage: 6.1+ MB


In [11]:
# Class balance: number of rows per label value.
df["label"].value_counts()

label
2    200000
1    200000
Name: count, dtype: int64

In [12]:
# Summary statistics of the review length, measured in characters.
df["text"].str.len().describe()

count    400000.000000
mean        431.429630
std         237.435383
min          99.000000
25%         231.000000
50%         383.000000
75%         595.000000
max        1015.000000
Name: text, dtype: float64

In [13]:
# Count missing values in each column.
df.isnull().sum()

label    0
text     0
dtype: int64

## Cleaning

In [30]:
# Fetch the NLTK resources this notebook relies on (tokenizer models, the
# stopword lists, and the WordNet data used for lemmatization). Packages that
# are already present are reported as up-to-date.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/vin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/vin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
def clean_text(text):
    """Normalize a review string before tokenization.

    Lowercases the input, deletes every character that is not ``a-z``,
    ``0-9`` or whitespace, then collapses each run of whitespace into a single
    space and trims both ends.

    Note:
        Punctuation is deleted, not replaced by a space, so words separated
        only by punctuation are fused together, e.g. ``"non-audiophile"``
        becomes ``"nonaudiophile"`` and ``"Firstly,I"`` becomes ``"firstlyi"``.

    Args:
        text: Raw review text.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    alnum_only = re.sub(r"[^a-z0-9\s]", "", lowered)
    return re.sub(r"\s+", " ", alnum_only).strip()

# Keep the raw text untouched and store the normalized version in its own column.
df['clean_text'] = df["text"].apply(clean_text)

In [15]:
# Show raw and cleaned text side by side for the first 10 rows as a sanity check.
df[["text", "clean_text"]].head(10)

Unnamed: 0,text,clean_text
0,Great CD: My lovely Pat has one of the GREAT v...,great cd my lovely pat has one of the great vo...
1,One of the best game music soundtracks - for a...,one of the best game music soundtracks for a g...
2,Batteries died within a year ...: I bought thi...,batteries died within a year i bought this cha...
3,"works fine, but Maha Energy is better: Check o...",works fine but maha energy is better check out...
4,Great for the non-audiophile: Reviewed quite a...,great for the nonaudiophile reviewed quite a b...
5,DVD Player crapped out after one year: I also ...,dvd player crapped out after one year i also b...
6,"Incorrect Disc: I love the style of this, but ...",incorrect disc i love the style of this but af...
7,DVD menu select problems: I cannot scroll thro...,dvd menu select problems i cannot scroll throu...
8,Unique Weird Orientalia from the 1930's: Exoti...,unique weird orientalia from the 1930s exotic ...
9,"Not an ""ultimate guide"": Firstly,I enjoyed the...",not an ultimate guide firstlyi enjoyed the for...


## Tokenization, stopword removal, and lemmatization

In [32]:
# English stopwords held in a set for fast membership tests, plus a single
# lemmatizer instance; both are module-level objects read by
# tokenize_and_lemmatize.
stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [35]:
def tokenize_and_lemmatize(text):
    """Convert a cleaned review string into a list of lemmatized tokens.

    The text is tokenized with ``nltk.word_tokenize``. Tokens found in the
    module-level ``stop`` set, or shorter than three characters, are dropped.
    Each remaining token is passed through the module-level ``lemmatizer``.

    Args:
        text: Cleaned review text.

    Returns:
        A list of lemmatized token strings, in their original order.
    """
    return [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(text)
        if token not in stop and len(token) > 2
    ]

In [39]:
# Register tqdm's pandas integration so the slow row-by-row tokenization below
# reports progress via progress_apply.
tqdm.pandas()
df['tokens'] = df['clean_text'].progress_apply(tokenize_and_lemmatize)

100%|██████████| 400000/400000 [03:41<00:00, 1803.57it/s]


## Split the data


In [42]:
# Features are the token lists; the target is the integer label column.
X, y = df['tokens'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [47]:
# Report the number of rows in each split (these are lengths, not full shapes).
for caption, part in (
    ("X_train shape: ", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(caption, len(part))

X_train shape:  320000
X_test shape: 80000
y_train shape: 320000
y_test shape: 80000


In [48]:
# Peek at the first few training rows to confirm each entry is a token list.
print(X_train.head())

242245    [mediocre, fake, velvet, touch, enhances, chea...
288918    [downton, abbey, love, love, love, would, reco...
105103    [rose, red, disappointed, one, stretch, get, e...
63504     [awful, experience, microsoft, mouse, stopped,...
239180    [must, seen, different, movie, one, word, movi...
Name: tokens, dtype: object


## TF-IDF

In [50]:
# Prepare text for TF-IDF: join each token list back into one space-separated string.
X_train_joined = X_train.map(' '.join)
X_test_joined = X_test.map(' '.join)

# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split.
tfidf = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf.fit_transform(X_train_joined)
X_test_tfidf = tfidf.transform(X_test_joined)

# Report the dimensions of both feature matrices.
for caption, matrix in (
    ("TF-IDF shape train:", X_train_tfidf),
    ("TF-IDF shape test:", X_test_tfidf),
):
    print(caption, matrix.shape)

TF-IDF shape train: (320000, 10000)
TF-IDF shape test: (80000, 10000)


In [41]:
# Overall TF-IDF shape across both splits. The vectorizer is fit on the
# training split only, so there is no single full-dataset matrix (the old
# `X_tfidf` name is not defined anywhere in this notebook and would raise
# NameError on a fresh run). Derive the combined shape from the two split
# matrices instead: rows add up, columns are the shared vocabulary.
total_rows = X_train_tfidf.shape[0] + X_test_tfidf.shape[0]
print("TF-IDF shape:", (total_rows, X_train_tfidf.shape[1]))

TF-IDF shape: (400000, 10000)
