# Import

In [43]:
import os
import csv
import re
import gensim
import nltk
import pandas as pd
import numpy as np

from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from IPython.display import HTML

## Configuration

In [44]:
class Configurator:
    def __init__(self) -> None:
        self._params = {
            'path.dataset.original': './IMDB Dataset.csv',
            'path.dataset.prepared': './prepared IMDB Dataset.csv',
            'path.dataset.w2v-embedding': './word2vec-google-news-300.model',

            'train-test-split.test-size': 0.2,
            'train-test-split.random-stage': 42,

            'vec.tfidf.max-features': 300,
            'vec.tfidf.norm': None,
            'vec.tfidf.max-df': 0.95,
            'vec.tfidf.min-df': 5,
            'vec.tfidf.stop-words': 'english',

            'hyper-params.on': False,
            'hyper-params.cv': 3,
            'hyper-params.scoring': 'accuracy',
            'hyper-params.verbose': 3,
            'hyper-params.jobs': -1,
            'hyper-params.param-grid': {
                'max_depth': [1, 3, None],
                'n_estimators': [100, 200, 300]    
            }
#             'hyper-params.param-grid': {
#                 'max_depth': [1],
#                 'n_estimators': [100],
#             }
        }
    
    def __call__(self, *args, **kwargs):
        if len(args) == 0 or args[0] not in self._params:
            return None
        return self._params[args[0]]

conf = Configurator()

## Utils

In [45]:
def sentiment_to_num(sentiment: str) -> int:
    s = sentiment.lower()
    if s == 'positive':
        return 1
    elif s == 'negative':
        return 0
    return -1

def num_to_sentiment(num: int) -> int:
    return 'positive' if num == 1 else 'negative'

## Files checking

In [46]:
if not os.path.isfile(conf('path.dataset.original')):
    print('Dataset is absent: ' + conf('path.dataset.original'))
    
embedding_path = conf('path.dataset.w2v-embedding')
if not os.path.isfile(embedding_path):
    print('Word2vec embedding is absent: ' + embedding_path + '; embedding source: https://huggingface.co/fse/word2vec-google-news-300/')

## Reset dataset

In [47]:
dataset = None
raw_dataset = None

## Load preapared dataset if it exists

In [48]:
try:
    with open(conf('path.dataset.prepared'), encoding='utf-8') as file:
        reader = csv.DictReader(file)
        dataset = [{'review': item['review'], 'sentiment': int(item['sentiment'])} for item in reader]
        print('Prepared dataset loaded')
except Exception as e:
    print(e)



Prepared dataset loaded


## Load and prepare dataset if prepared does not exist

In [49]:
if dataset is None:
    
    tag_filter = re.compile(r'</?[a-z][\w=" -]*/?>')
    punctuation_filter = re.compile(r'[.,!?*_)(]+')
    space_filter = re.compile(r'\s+')
    
    def filter_text(text: str) -> str:
        text = text.lower()
        text = tag_filter.sub(' ', text)
        text = punctuation_filter.sub(' ', text)
        text = space_filter.sub(' ', text)
        
        return text
    
    try:
        with open(conf('path.dataset.original'), encoding='utf-8') as file:
            reader = csv.DictReader(file)
            raw_dataset = [item for item in reader]
        dataset = []
        print('Dataset loaded')

        for datum in raw_dataset:
            num_sentiment = sentiment_to_num(datum['sentiment'])
            if num_sentiment != -1:
                dataset.append({
                    'review': filter_text(datum['review']),
                    'sentiment': num_sentiment
                })

        with open(conf('path.dataset.prepared'), 'w', encoding='utf-8') as file:
            fieldnames = ['review', 'sentiment']
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()
            [writer.writerow(datum) for datum in dataset]
        print('Dataset prepared')
        print('Prepared dataset saved')
    except Exception as e:
        print(e)
else:
    print('Prepared dataset exists')

Prepared dataset exists


## Split data

In [50]:
reviews = []
sentiments = []
str_sentiments = []
for datum in dataset:
    reviews.append(datum['review'])
    sentiments.append(datum['sentiment'])
    str_sentiments.append(num_to_sentiment(datum['sentiment']))


train_x, test_x, train_y, test_y = train_test_split(
    reviews,
    sentiments,
    test_size=conf('train-test-split.test-size'),
    random_state=conf('train-test-split.random-stage'),
    stratify=str_sentiments
)

## TF-IDF vectorizing

In [51]:
tf_idf_vectorizer = TfidfVectorizer(
    max_features=conf('vec.tfidf.max-features'),
    norm=conf('vec.tfidf.norm'),
    max_df=conf('vec.tfidf.max-df'),
    min_df=conf('vec.tfidf.min-df'),
    stop_words=conf('vec.tfidf.stop-words')
)
tf_idf_train_X = tf_idf_vectorizer.fit_transform(train_x)
tf_idf_test_X = tf_idf_vectorizer.transform(test_x)
print('Vectorizing is done')

Vectorizing is done


## Find optimal hyperparams with Gridsearch for TF-IDF vectors

In [52]:
tf_idf_clf = RandomForestClassifier()
tf_idf_grid = GridSearchCV(
    tf_idf_clf,
    param_grid=conf('hyper-params.param-grid'),
    cv=conf('hyper-params.cv'),
    scoring=conf('hyper-params.scoring'),
    verbose=conf('hyper-params.verbose'),
    n_jobs=conf('hyper-params.jobs')
)
tf_idf_model_grid = tf_idf_grid.fit(tf_idf_train_X, train_y)
print('Best hyperparameters are ' + str(tf_idf_model_grid.best_params_))
print('Best score is ' + str(tf_idf_model_grid.best_score_))

Fitting 3 folds for each of 9 candidates, totalling 27 fits
Best hyperparameters are {'max_depth': None, 'n_estimators': 300}
Best score is 0.7996250341236782


## Prediction for TF-IDF vectors

In [53]:
tf_idf_clf.fit(tf_idf_train_X, train_y)
prediction = tf_idf_clf.predict(tf_idf_test_X)

tf_idf_accuracy_socre = accuracy_score(test_y, prediction)
tf_idf_f1 = f1_score(test_y, prediction, average="macro")
print('Accuracy: ' + str(tf_idf_accuracy_socre))
print('F1: ' +  str(tf_idf_f1))

Accuracy: 0.7992
F1: 0.7991391873803713


## word2Vec + TF-IDF vectorizing

In [54]:
keyed_vectors = gensim.models.KeyedVectors.load(conf('path.dataset.w2v-embedding'))

class Vectors:
    def __init__(self, wv):
        self._wv = wv

    def get(self, token: str):
        return self._wv.vectors[self._wv.key_to_index[token]] if token in self._wv.key_to_index else None
    
vectors = Vectors(keyed_vectors)

def vectorize(sentence: str):
    weighs_data = tf_idf_vectorizer.transform([sentence]).tocoo()
    vocab = tf_idf_vectorizer.get_feature_names_out()

    sentence_vector = []
    for row, col, weight in zip(weighs_data.row, weighs_data.col, weighs_data.data):
        token = vectors.get(vocab[col])
        if token is not None:
            sentence_vector.append(weight * token)
    
    if len(sentence_vector) == 0:
        return None
    return np.mean(sentence_vector, axis=0)

def texts_to_vectors(texts: list[str], y: list[int]) -> tuple:
    X = []
    Y = []
    for index, text in enumerate(texts):
        v = vectorize(text)
        if v is not None:
            X.append(v)
            Y.append(y[index])
    return X, Y, 

w2v_train_X, w2v_train_y = texts_to_vectors(train_x, train_y)
w2v_test_X, w2v_test_y = texts_to_vectors(test_x, test_y)

print('Vectorizing is done')

Vectorizing is done


## Find optimal hyperparams with Gridsearch for TF-IDF + Word2vec vectors

In [55]:
w2v_clf = RandomForestClassifier()
w2v_model_grid = GridSearchCV(
    w2v_clf,
    param_grid=conf('hyper-params.param-grid'),
    cv=conf('hyper-params.cv'),
    scoring=conf('hyper-params.scoring'),
    verbose=conf('hyper-params.verbose'),
    n_jobs=conf('hyper-params.jobs')
)
w2v_model_grid = w2v_model_grid.fit(w2v_train_X, w2v_train_y)
print('Best hyperparameters are ' + str(w2v_model_grid.best_params_))
print('Best score is ' + str(w2v_model_grid.best_score_))

Fitting 3 folds for each of 9 candidates, totalling 27 fits
Best hyperparameters are {'max_depth': None, 'n_estimators': 200}
Best score is 0.7724579309352221


## Prediction for TF-IDF + Word2vec vectors

In [56]:
w2v_clf.fit(w2v_train_X, w2v_train_y)
prediction = w2v_clf.predict(w2v_test_X)

w2v_accuracy_socre = accuracy_score(w2v_test_y, prediction)
w2v_f1 = f1_score(w2v_test_y, prediction, average="macro")
print('Accuracy: ' + str(accuracy_score(w2v_test_y, prediction)))
print('F1: ' +  str(f1_score(w2v_test_y, prediction, average="macro")))

Accuracy: 0.7684
F1: 0.7683955161371925


## Summary

In [57]:
data = [['TF-IDF', tf_idf_accuracy_socre, w2v_accuracy_socre], ['TF-IDF+Word2vec', tf_idf_f1, w2v_f1]]
df = pd.DataFrame(data, columns=['Type', 'Accuracy', 'F1'])

HTML(df.to_html(index=False))

Type,Accuracy,F1
TF-IDF,0.7992,0.7684
TF-IDF+Word2vec,0.799139,0.768396
