### Data installation

In [1]:
import os
import pandas as pd
from sklearn.utils import shuffle

def add_reviews(directory):
    reviews = []
    with os.scandir(directory) as files:
        for file in files:
            with open(file, 'r', encoding='utf8') as f:
                reviews.append(f.read())
    
    return reviews

train_pos = pd.DataFrame({'review' : add_reviews('./train/pos'), 'target' : 1})
train_neg = pd.DataFrame({'review' : add_reviews('./train/neg'), 'target' : 0})

test_pos = pd.DataFrame({'review' : add_reviews('./test/pos'), 'target' : 1})
test_neg = pd.DataFrame({'review' : add_reviews('./test/neg'), 'target' : 0})

df = shuffle(pd.concat([train_pos, train_neg, test_pos, test_neg]))

df = df.iloc[:10000]

### Preprocessing

In [3]:
import string 
import re
from functools import reduce
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def compose(*functions):
      
    def comp(f, g):
        return lambda x : f(g(x))
              
    return reduce(comp, functions)

def remove_punctuation(text):
    return "".join([" " if char in string.punctuation else char for char in text])

def remove_numbers(text):
    return "".join([" " if char.isdigit() else char for char in text])

def remove_multiple_spaces(text):
    return re.sub(r"\s+", " ", text)

def lower(text):
    return text.lower()

def remove_stopwords(text):
    removewords = stopwords.words("english")
    return " ".join(["" if word in removewords else word for word in text.split()])

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

preprocess_text = compose(lower, lemmatize, remove_stopwords, remove_multiple_spaces, remove_numbers, remove_punctuation)

#### Text before preprocessing

In [236]:
text = df.iloc[0]['review']
print(text)

While some performances were good-Victoria Rowell, Adrienne Barbeau, and the two Italian girlfriends come to mind-the story was lame and derivative, the emphasis on the girlfriend's racial background was handled clumsily at best, and the relatives were mostly portrayed as stereotypes, not as real people. I found myself wincing uncomfortably at many moments that were supposed to be funny. I can hardly comprehend why the local paper here in SF said this was a good movie, and wonder WHO posted the glowing review here on IMDb. Very disappointed in this movie, and mad I actually went to a theatre to see it, based on the faulty connection to Garden State, which is a far funnier, more inventive, and touching movie than this one. I must especially mention the emotional climax in the church, which was so wooden and by-the-numbers that I nearly left, and some in the audience actually DID. THAT was followed by a silly climax at the graveyard, which I saw coming 10 minutes before it happened. I re

#### Text after preprocessing

In [237]:
prep_text = preprocess_text(text)
print(prep_text)

while performance good victoria rowell adrienne barbeau two italian girlfriend come mind story lame derivative emphasis girlfriend racial background handled clumsily best relative mostly portrayed stereotype real people i found wincing uncomfortably many moment supposed funny i hardly comprehend local paper sf said good movie wonder who posted glowing review imdb very disappointed movie mad i actually went theatre see based faulty connection garden state far funnier inventive touching movie one i must especially mention emotional climax church wooden number i nearly left audience actually did that followed silly climax graveyard i saw coming minute happened i really like misled spend money uselessly


In [4]:
df['preprocessed_review'] = df['review'].apply(preprocess_text)

In [5]:
from sklearn.model_selection import train_test_split

df_train, df_test, target_train, target_test = train_test_split(df['preprocessed_review'], df['target'], test_size=0.2)

In [6]:
train_w2v = df_train.apply(lambda x : x.split())
test_w2v = df_test.apply(lambda x : x.split())

### Vectorization

#### Word2vec


#### Transformation of words to vectors

In [7]:
from gensim.models import Word2Vec

#CBOW
word2vec = Word2Vec(vector_size=100, window=5, min_count=1, sg=0)
word2vec.build_vocab(train_w2v)
word2vec.train(train_w2v, total_examples=word2vec.corpus_count, epochs=10)

(9643222, 10628170)

#### Generation of aggregated sentence vectors

In [242]:
import numpy as np

voc = set(word2vec.wv.index_to_key)
#changing each word in sentence to vector and applying elementwise mean to get same length 
X_train_w2v = pd.DataFrame(train_w2v.apply(lambda x : np.array([word2vec.wv[i] for i in x if i in voc]).mean(axis=0)).to_list())
X_test_w2v = pd.DataFrame(test_w2v.apply(lambda x : np.array([word2vec.wv[i] for i in x if i in voc]).mean(axis=0)).to_list())

#### TF-IDF

In [243]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_tf_idf = vectorizer.fit_transform(df_train)
X_test_tf_idf = vectorizer.transform(df_test)

### Classification

#### Word2Vec

#### SVM

In [244]:
from sklearn import svm
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

baseline_svm_w2v = svm.SVC()
baseline_svm_w2v.fit(X_train_w2v, target_train)

SVC()

In [259]:
def show_metrics(y_pred, y_true=target_test):
    print(f"""
    Accuracy = {round(accuracy_score(y_true, y_pred), 2)} 
    ROC-AUC = {round(roc_auc_score(y_true, y_pred), 2)} 
    f1_score = {round(f1_score(y_true, y_pred), 2)} 
    """)
    

In [246]:
y_pred_svm_w2v = baseline_svm_w2v.predict(X_test_w2v)

show_metrics(y_pred_svm_w2v)


    Accuracy = 0.83 
    ROC-AUC = 0.83 
    f1_score = 0.83 
    


#### Hyperparameters selection for SVM

In [270]:
from sklearn.model_selection import GridSearchCV

params_svm = {
    'C' : [0.5, 0.15, 1.0],
    'kernel' : ('poly', 'rbf', 'linear'),
}

svm_with_search_w2v = GridSearchCV(svm.SVC(), params_svm)
svm_with_search_w2v.fit(X_train_w2v, target_train)
svm_with_search_w2v.best_params_

{'C': 1.0, 'kernel': 'rbf'}

In [271]:
best_svm_w2v = svm.SVC(**svm_with_search_w2v.best_params_)
best_svm_w2v.fit(X_train_w2v, target_train)
y_pred_svm_best_w2v = best_svm_w2v.predict(X_test_w2v)

show_metrics(y_pred_svm_best_w2v)


    Accuracy = 0.83 
    ROC-AUC = 0.83 
    f1_score = 0.83 
    


#### Random forest

In [251]:
from sklearn.ensemble import RandomForestClassifier

baseline_rf_w2v = RandomForestClassifier()
baseline_rf_w2v.fit(X_train_w2v, target_train)
y_pred_rf_w2v = baseline_rf_w2v.predict(X_test_w2v)

In [253]:
show_metrics(y_pred_rf_w2v)


    Accuracy = 0.81 
    ROC-AUC = 0.81 
    f1_score = 0.81 
    


#### Hyperparameters selection for Random forest

In [254]:
params_rf = {
    'n_estimators' : [2, 15, 50, 100],
    'max_depth': [None, 2, 8],
}

rf_with_search_w2v = GridSearchCV(RandomForestClassifier(random_state=42), params_rf)
rf_with_search_w2v.fit(X_train_w2v, target_train)
rf_with_search_w2v.best_params_

{'max_depth': None, 'n_estimators': 100}

In [256]:
best_rf_w2v = RandomForestClassifier(**rf_with_search_w2v.best_params_, random_state=42)
best_rf_w2v.fit(X_train_w2v, target_train)
y_pred_rf_best_w2v = best_rf_w2v.predict(X_test_w2v)

In [260]:
show_metrics(y_pred_rf_best_w2v)


    Accuracy = 0.81 
    ROC-AUC = 0.81 
    f1_score = 0.81 
    


#### TF-IDF

#### SVM

In [261]:
baseline_svm_tf_idf = svm.SVC()
baseline_svm_tf_idf.fit(X_train_tf_idf, target_train)

y_pred_svm_tf_idf = baseline_svm_tf_idf.predict(X_test_tf_idf)

show_metrics(y_pred_svm_tf_idf)


    Accuracy = 0.88 
    ROC-AUC = 0.88 
    f1_score = 0.88 
    


#### Hyperparameters selection for SVM

In [272]:
svm_with_search_tf_idf = GridSearchCV(svm.SVC(), params_svm)
svm_with_search_tf_idf.fit(X_train_tf_idf, target_train)
svm_with_search_tf_idf.best_params_

{'C': 1.0, 'kernel': 'rbf'}

In [273]:
best_svm_tf_idf = svm.SVC(**svm_with_search_tf_idf.best_params_)
best_svm_tf_idf.fit(X_train_tf_idf, target_train)
y_pred_svm_best_tf_idf = best_svm_tf_idf.predict(X_test_tf_idf)

show_metrics(y_pred_svm_best_tf_idf)


    Accuracy = 0.88 
    ROC-AUC = 0.88 
    f1_score = 0.88 
    


#### Random forest

In [265]:
baseline_rf_tf_idf = RandomForestClassifier()
baseline_rf_tf_idf.fit(X_train_tf_idf, target_train)
y_pred_rf_tf_idf = baseline_rf_tf_idf.predict(X_test_tf_idf)

In [266]:
show_metrics(y_pred_rf_tf_idf)


    Accuracy = 0.84 
    ROC-AUC = 0.84 
    f1_score = 0.84 
    


#### Hyperparameters selection for Random forest

In [267]:
rf_with_search_tf_idf = GridSearchCV(RandomForestClassifier(random_state=42), params_rf)
rf_with_search_tf_idf.fit(X_train_tf_idf, target_train)
rf_with_search_tf_idf.best_params_

{'max_depth': None, 'n_estimators': 100}

In [268]:
best_rf_tf_idf = RandomForestClassifier(**rf_with_search_tf_idf.best_params_, random_state=42)
best_rf_tf_idf.fit(X_train_tf_idf, target_train)
y_pred_rf_best_tf_idf = best_rf_tf_idf.predict(X_test_tf_idf)

In [269]:
show_metrics(y_pred_rf_best_tf_idf)


    Accuracy = 0.84 
    ROC-AUC = 0.84 
    f1_score = 0.84 
    
