# Import

In [16]:
import os
import csv
import re
import gensim
import nltk
import pandas as pd
import numpy as np

from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *

## Configuration

In [2]:
class Configurator:
    def __init__(self) -> None:
        self._params = {
            'path.dataset.original': './IMDB Dataset.csv',
            'path.dataset.prepared': './prepared IMDB Dataset.csv',
            'path.dataset.w2v-embedding': './word2vec-google-news-300.model',

            'train-test-split.test-size': 0.2,
            'train-test-split.random-stage': 42,

            'vec.tfidf.max-features': 300,
            'vec.tfidf.norm': None,
            'vec.tfidf.max-df': 0.95,
            'vec.tfidf.min-df': 5,
            'vec.tfidf.stop-words': 'english',

            'hyper-params.on': False,
            'hyper-params.cv': 3,
            'hyper-params.scoring': 'accuracy',
            'hyper-params.verbose': 3,
            'hyper-params.jobs': -1,
            'hyper-params.param-grid': {
                'max_depth': [3, None],
                'n_estimators': [10, 100, 200],
            }
        }
    
    def __call__(self, *args, **kwargs):
        if len(args) == 0 or args[0] not in self._params:
            return None
        return self._params[args[0]]

conf = Configurator()

## Utils

In [3]:
def sentiment_to_num(sentiment: str) -> int:
    s = sentiment.lower()
    if s == 'positive':
        return 1
    elif s == 'negative':
        return 0
    return -1

def num_to_sentiment(num: int) -> int:
    return 'positive' if num == 1 else 'negative'

## Files checking

In [23]:
if not os.path.isfile(conf('path.dataset.original')):
    print('Dataset is absent: ' + conf('path.dataset.original'))
    
embedding_path = conf('path.dataset.w2v-embedding')
if not os.path.isfile(embedding_path):
    print('Word2vec embedding is absent: ' + embedding_path + '; embedding source: https://huggingface.co/fse/word2vec-google-news-300/')

## Reset dataset

In [5]:
dataset = None
raw_dataset = None

## Load preapared dataset if it exists

In [6]:
try:
    with open(conf('path.dataset.prepared'), encoding='utf-8') as file:
        reader = csv.DictReader(file)
        dataset = [{'review': item['review'], 'sentiment': int(item['sentiment'])} for item in reader]
        print('Prepared dataset loaded')
except Exception as e:
    print(e)



Prepared dataset loaded


## Load and prepare dataset if prepared does not exist

In [7]:
if dataset is None:
    
    tag_filter = re.compile(r'</?[a-z][\w=" -]*/?>')
    punctuation_filter = re.compile(r'[.,!?*_)(]+')
    space_filter = re.compile(r'\s+')
    
    def filter_text(text: str) -> str:
        text = text.lower()
        text = tag_filter.sub(' ', text)
        text = punctuation_filter.sub(' ', text)
        text = space_filter.sub(' ', text)
        
        return text
    
    try:
        with open(conf('path.dataset.original'), encoding='utf-8') as file:
            reader = csv.DictReader(file)
            raw_dataset = [item for item in reader]
        dataset = []
        print('Dataset loaded')

        for datum in raw_dataset:
            num_sentiment = sentiment_to_num(datum['sentiment'])
            if num_sentiment != -1:
                dataset.append({
                    'review': filter_text(datum['review']),
                    'sentiment': num_sentiment
                })

        with open(conf('path.dataset.prepared'), 'w', encoding='utf-8') as file:
            fieldnames = ['review', 'sentiment']
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()
            [writer.writerow(datum) for datum in dataset]
        print('Dataset prepared')
        print('Prepared dataset saved')
    except Exception as e:
        print(e)
else:
    print('Prepared dataset exists')

Prepared dataset exists


## Split data

In [8]:
reviews = []
sentiments = []
str_sentiments = []
for datum in dataset:
    reviews.append(datum['review'])
    sentiments.append(datum['sentiment'])
    str_sentiments.append(num_to_sentiment(datum['sentiment']))


train_x, test_x, train_y, test_y = train_test_split(
    reviews,
    sentiments,
    test_size=conf('train-test-split.test-size'),
    random_state=conf('train-test-split.random-stage'),
    stratify=str_sentiments
)

## TF-IDF vectorizing

In [9]:
tf_idf_vectorizer = TfidfVectorizer(
    max_features=conf('vec.tfidf.max-features'),
    norm=conf('vec.tfidf.norm'),
    max_df=conf('vec.tfidf.max-df'),
    min_df=conf('vec.tfidf.min-df'),
    stop_words=conf('vec.tfidf.stop-words')
)
tf_idf_train_X = tf_idf_vectorizer.fit_transform(train_x)
tf_idf_test_X = tf_idf_vectorizer.transform(test_x)

## Find optimal hyperparams with Gridsearch for TF-IDF vectors

In [10]:
tf_idf_clf = RandomForestClassifier()
tf_idf_grid = GridSearchCV(
    tf_idf_clf,
    param_grid=conf('hyper-params.param-grid'),
    cv=conf('hyper-params.cv'),
    scoring=conf('hyper-params.scoring'),
    verbose=conf('hyper-params.verbose'),
    n_jobs=conf('hyper-params.jobs')
)
tf_idf_model_grid = tf_idf_grid.fit(tf_idf_train_X, train_y)
print('Best hyperparameters are ' + str(tf_idf_model_grid.best_params_))
print('Best score is ' + str(tf_idf_model_grid.best_score_))

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best hyperparameters are {'max_depth': None, 'n_estimators': 200}
Best score is 0.7978500084971002


## Prediction for TF-IDF vectors

In [14]:
tf_idf_clf.fit(tf_idf_train_X, train_y)
prediction = tf_idf_clf.predict(tf_idf_test_X)

print('Accuracy: ' + str(accuracy_score(test_y, prediction)))
print('F1: ' +  str(f1_score(test_y, prediction, average="macro")))

Accuracy: 0.7981
F1: 0.7980527261626673


## word2Vec + TF-IDF vectorizing

In [38]:
keyed_vectors = gensim.models.KeyedVectors.load(conf('path.dataset.w2v-embedding'))
# x = gensim.models.Word2Vec.load(conf('path.dataset.w2v-embedding'))
# print(type(keyed_vectors))

class Vectors:
    def __init__(self, wv):
        self._wv = wv

    def get(self, token: str):
        return self._wv.vectors[self._wv.key_to_index[token]] if token in self._wv.key_to_index else None
    
vectors = Vectors(keyed_vectors)

def vectorize(sentence: str):
    weighs_data = tf_idf_vectorizer.transform([sentence]).tocoo()
    vocab = tf_idf_vectorizer.get_feature_names_out()

    sentence_vector = []
    for row, col, weight in zip(weighs_data.row, weighs_data.col, weighs_data.data):
        token = vectors.get(vocab[col])
        if token is not None:
            sentence_vector.append(weight * token)
    
    if len(sentence_vector) == 0:
        return None
    return np.mean(sentence_vector, axis=0)

w2v_train_X = []
for text in train_x:
    v = vectorize(text)
    if v is not None:
        w2v_train_X.append(v)

w2v_test_X = []
for text in test_x:
    v = vectorize(text)
    if v is not None:
        w2v_test_X.append(v)


print(w2v_train_X[0])
print(w2v_test_X[0])
print(len(w2v_train_X))
print(len(w2v_test_X))

# x = """one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked they are right as this is exactly what happened with me the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the word it is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to many aryans muslims gangstas latinos christians italians irish and more so scuffles death stares dodgy dealings and shady agreements are never far away i would say the main appeal of the show is due to the fact that it goes where other shows wouldn't dare forget pretty pictures painted for mainstream audiences forget charm forget romance oz doesn't mess around the first episode i ever saw struck me as so nasty it was surreal i couldn't say i was ready for it but as i watched more i developed a taste for oz and got accustomed to the high levels of graphic violence not just violence but injustice crooked guards who'll be sold out for a nickel inmates who'll kill on order and get away with it well mannered middle class inmates being turned into prison bitches due to their lack of street skills or prison experience watching oz you may become comfortable with what is uncomfortable viewing thats if you can get in touch with your darker side"""
# v = vectorize(x)


# tf_idf_train_X = tf_idf_vectorizer.fit_transform(train_x)
# tf_idf_test_X = tf_idf_vectorizer.transform(test_x)


# def tfidf_embedding(vectors: Vectors, sentence: str, vectorizer):
#     weighs_data1 = vectorizer.transform([sentence])
#     weighs_data = vectorizer.transform([sentence]).tocoo()

#     vocab = vectorizer.get_feature_names_out()

#     sentence_vector = []
#     for row, col, weight in zip(weighs_data.row, weighs_data.col, weighs_data.data):
#         print(row, col, weight)
#         print(vocab[col])
#         token = vectors.get(vocab[col])
#         if token is not None:
#             sentence_vector.append(weight * token)
#     v = np.mean(sentence_vector, axis=0)
#     print(v.shape)
#     print(v)

#     pass



[ 0.34937796  0.17211932  0.14341433  0.41276607 -0.08646991 -0.13028307
  0.21137387 -0.25993258  0.32833505  0.24431635 -0.1404701  -0.39253506
 -0.2141691   0.08864664 -0.42881262  0.4092807   0.32810846  0.75480205
 -0.03262123 -0.29178265  0.0275266   0.4046886   0.22331345  0.1317039
  0.36825624 -0.4561386  -0.2252761   0.40394044  0.27249417 -0.05284693
 -0.5433183   0.09425654 -0.24595608  0.22773342  0.39467254 -0.14111935
  0.34730637  0.26775655  0.10978246  0.3793744   0.36419073 -0.39584658
  0.53749305  0.1331056  -0.02636757 -0.00756086 -0.2753962  -0.19376205
  0.17768179  0.19817871 -0.3671048   0.09754681 -0.118329    0.09711701
  0.16718929  0.00847303  0.11380235 -0.2665592   0.17597872 -0.2612956
 -0.15350886  0.31791896 -0.30344462 -0.17439282 -0.16186476  0.13511072
 -0.26034293  0.22596313 -0.02519921  0.22305809  0.08243472  0.09376258
  0.24954689  0.02308584 -0.6920633  -0.02220535  0.10747418  0.35778365
  0.3853655   0.38646713  0.05309062 -0.01224812  0.1