In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import bz2
import re

pd.set_option("display.max_columns", None)

In [2]:
# Read file with bz2
y = []
x = []

count = 0
for line in bz2.BZ2File('Data/train.ft.txt.bz2'):
    tmp = line.decode('utf-8')
    y.append(int(tmp[9]) - 1)
    x.append(tmp[11:].strip())

df = pd.DataFrame({'Label': y, 'Review': x})

print(f'{df.shape = }')
display(df.head())

df.shape = (3600000, 2)


Unnamed: 0,Label,Review
0,1,Stuning even for the non-gamer: This sound tra...
1,1,The best soundtrack ever to anything.: I'm rea...
2,1,Amazing!: This soundtrack is my favorite music...
3,1,Excellent Soundtrack: I truly like this soundt...
4,1,"Remember, Pull Your Jaw Off The Floor After He..."


## Preprocessing

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
# Text cleaning:
# Change all text to lowercase
# Removing all non letters (ponctuation, numbers...)
df['Review'] = df['Review'].apply(lambda x: re.compile(r"[^a-z\s]").sub(r" ", x.lower()))

In [5]:
# Splitting dataset into train and test.
X_train, X_test, y_train, y_test = train_test_split(df['Review'], df['Label'], test_size=0.2, random_state=1)

print(f'{X_train.shape = }, {X_test.shape = }')

X_train.shape = (2880000,), X_test.shape = (720000,)


In [6]:
# CountVectorizer with Logistic Regression
vectorizer = CountVectorizer()
X_train_CV = vectorizer.fit_transform(X_train)
X_test_CV = vectorizer.transform(X_test)

print(f'{X_train_CV.shape = }, {X_test_CV.shape = }')

X_train_CV.shape = (2880000, 763487), X_test_CV.shape = (720000, 763487)


In [7]:
model = LogisticRegression(solver='lbfgs', max_iter=100)
model = model.fit(X_train_CV, y_train)

predictions = model.predict(X_test_CV)

print(accuracy_score(y_test, list(predictions)))

0.904675


In [10]:
# HashingVectorizer with Logistic Regression
vectorizer = HashingVectorizer()
X_train_HV = vectorizer.fit_transform(X_train)
X_test_HV = vectorizer.transform(X_test)

print(f'{X_train_HV.shape = }, {X_test_HV.shape = }')

X_train_HV.shape = (2880000, 1048576), X_test_HV.shape = (720000, 1048576)


In [11]:
model = LogisticRegression(solver='lbfgs', max_iter=100)
model = model.fit(X_train_HV, y_train)

predictions = model.predict(X_test_HV)

print(accuracy_score(y_test, list(predictions)))

0.9024541666666667


In [8]:
# Tfidf with Logistic Regression
tfidf = TfidfVectorizer()
X_train_TF = tfidf.fit_transform(X_train)
X_test_TF = tfidf.transform(X_test)

print(f'{X_train_TF.shape = }, {X_test_TF.shape = }')

X_train_TF.shape = (2880000, 763487), X_test_TF.shape = (720000, 763487)


In [9]:
model = LogisticRegression(solver='lbfgs', max_iter=100)
model = model.fit(X_train_CV, y_train)

predictions = model.predict(X_test_CV)

print(accuracy_score(y_test, list(predictions)))

0.904675
