In [None]:
import os
import re
import string

import numpy as np
import pandas as pd
import sklearn as sk

from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import strip_non_alphanum, strip_numeric
from gensim.test.utils import get_tmpfile
import gensim.downloader as api

from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, Normalizer, StandardScaler, MaxAbsScaler
from sklearn.naive_bayes import GaussianNB

import matplotlib.pyplot as plt

# Preprocessing

In [None]:
# Read the review dataset; the cells below use its 'review' and 'sentiment' columns.
df = pd.read_csv("imdb.csv")

# Turn the sentiment column into integer class labels for the classifiers.
sentiment_encoder = LabelEncoder()
labels = sentiment_encoder.fit_transform(df['sentiment'])

# Preview the raw review text.
df['review']

In [None]:
from gensim.parsing.preprocessing import remove_stopwords

def preprocessor(text):
    '''
    Normalise one raw review into lowercase, whitespace-separated words.

    Steps, in order: lowercase; drop text in square brackets; replace HTML
    tags and links with a space; turn every remaining non-word character
    into a space; drop underscores; drop words containing digits; drop a
    stray standalone "br" token; drop single-character words.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Cleaned text. Runs of spaces are not collapsed; downstream
        tokenisation is expected to split on whitespace.
    '''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # Tags and links must go BEFORE the generic \W substitution: once '<', '>',
    # ':' and '/' have become spaces these patterns can no longer match.
    # They are replaced by a space so that "great<br />movie" stays two words.
    text = re.sub(r'<.*?>+', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'\W', ' ', text)  # remove special chars (includes newlines)
    # After the step above only '_' is left from string.punctuation.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Only a standalone "br" token is removed; an unanchored 'br' pattern would
    # also cut the letters out of words such as "brilliant" or "celebrate".
    text = re.sub(r'\bbr\b', '', text)
    text = re.sub(r'\b\w\b', '', text)
    return text


# Alternative that also strips stop words (currently disabled):
#documents = df['review'].apply(preprocessor).apply(remove_stopwords)
# Clean every review with preprocessor(); the result is a Series of strings
# that keeps df's index, so it stays aligned with `labels`.
documents = df['review'].apply(preprocessor)
documents

In [None]:
from nltk.tokenize import word_tokenize
# Split each cleaned review into a list of word tokens (Series of lists of strings).
# NOTE(review): word_tokenize presumably needs the NLTK tokenizer data installed — confirm.
tokens = documents.apply(word_tokenize)
tokens

# EDA

In [None]:
# Tokenise the untouched reviews to measure the vocabulary before cleaning.
raw_tokens = df['review'].apply(word_tokenize)

# Accumulate every distinct token seen across all reviews.
unique_words_raw = set()
for doc_tokens in raw_tokens:
    unique_words_raw.update(doc_tokens)

len(unique_words_raw)

In [None]:
# Vocabulary size after preprocessing, for comparison with the raw count.
unique_words = set()
for doc_tokens in tokens:
    unique_words.update(doc_tokens)

len(unique_words)

# FastText pretrained

In [None]:
# Load pretrained word vectors stored in word2vec *text* format (binary=False).
# The file name suggests 300-dimensional wiki-news vectors; embed()/embed_norm()
# hard-code 300 dimensions accordingly — TODO confirm against the file.
wv = KeyedVectors.load_word2vec_format("gensim-data/wiki-news-300d-1M.vec", binary=False)
wv

In [None]:
# Sanity check: nearest neighbours of "cat" in the loaded vectors.
wv.most_similar("cat")

In [None]:
# r_vecs = np.array([wv[word] for word in r if word in wv.vocab])

# List of strings -> ndarray
def embed(doc):
    """Average the raw (un-normalised) word vectors of a tokenised document.

    Parameters
    ----------
    doc : list of str
        Tokens of one document.

    Returns
    -------
    numpy.ndarray
        Vector of length 300: the sum of the vectors of all in-vocabulary
        tokens divided by the total number of tokens, so out-of-vocabulary
        tokens count as zero vectors. An empty document yields the zero
        vector instead of NaNs from a division by zero.
    """
    total = np.zeros(300)
    # Guard the division below: NaN features would break the downstream models.
    if len(doc) == 0:
        return total
    for word in doc:
        if word in wv.vocab:
            total += wv.word_vec(word, use_norm=False)
    return total / len(doc)

In [None]:
# List of strings -> ndarray
def embed_norm(doc):
    """Average the normalised word vectors (use_norm=True) of a tokenised document.

    Parameters
    ----------
    doc : list of str
        Tokens of one document.

    Returns
    -------
    numpy.ndarray
        Vector of length 300: the sum of the normalised vectors of all
        in-vocabulary tokens divided by the total number of tokens, so
        out-of-vocabulary tokens count as zero vectors. An empty document
        yields the zero vector instead of NaNs from a division by zero.
    """
    total = np.zeros(300)
    # Guard the division below: NaN features would break the downstream models.
    if len(doc) == 0:
        return total
    for word in doc:
        if word in wv.vocab:
            total += wv.word_vec(word, use_norm=True)
    return total / len(doc)

In [None]:
# Embed every tokenised review: Series of token lists -> Series of ndarrays.
doc_vectors = tokens.apply(embed)

# Spread each per-document array over one column per embedding dimension,
# keeping the original row index (Series of ndarrays -> DataFrame).
features = pd.DataFrame(doc_vectors.values.tolist(), index=doc_vectors.index)

# Normalised variant, currently disabled:
# features_norm = tokens.apply(embed_norm)
# features_norm = pd.DataFrame(features_norm.values.tolist(), index=features_norm.index)

In [None]:
# Baseline: logistic regression on the averaged word vectors, 50/50 split.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.5, random_state=0
)

lr = LogisticRegression(solver='newton-cg').fit(X_train, y_train)
lr.score(X_test, y_test)

In [None]:
# Same 50/50 split, but with standardised features.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.5, random_state=0
)

# Learn the scaling statistics on the training half only and reuse them on the test half.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr = LogisticRegression(solver='newton-cg').fit(X_train_scaled, y_train)
lr.score(X_test_scaled, y_test)

In [None]:
# Standardised features again, this time holding out 30% for testing.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=0
)

# Scaling statistics come from the training portion only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr = LogisticRegression(solver='newton-cg').fit(X_train_scaled, y_train)
lr.score(X_test_scaled, y_test)

In [None]:
# Logistic regression on per-sample normalised document vectors, 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=0, test_size=0.3)

normalizer = Normalizer()
X_train = normalizer.fit_transform(X_train)
# Only transform the held-out data: a preprocessing step must never be re-fitted
# on the test split (same fit/transform discipline as the scaler cells).
X_test = normalizer.transform(X_test)

lr = LogisticRegression(solver='newton-cg')
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

# Doc2Vec

In [None]:
# Tag every tokenised review with its positional index so each one gets its own document vector.
tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokens.tolist())]

# `workers` must be a positive thread count. gensim does not interpret -1 as
# "all cores": no worker threads are started and train() finishes without
# updating the randomly initialised vectors.
model = Doc2Vec(vector_size=300, window=3, min_count=3, workers=os.cpu_count() or 1)
model.build_vocab(tagged_documents)
model.train(tagged_documents, total_examples=model.corpus_count, epochs=40)

# Keep only the learned word and document vectors and release the full model.
words = model.wv
docvecs = model.docvecs
del model

print(f"{len(words.vocab)=}")

In [None]:
# Sanity check: nearest neighbours of "cat" among the Doc2Vec word vectors.
words.similar_by_word("cat")

In [None]:
# Score the Doc2Vec word vectors against the word-pair similarity file.
# NOTE(review): presumably reports correlation with the human ratings — confirm in the gensim docs.
words.evaluate_word_pairs("gensim-data/wordsim_similarity_goldstandard.txt")

In [None]:
# Wrap the learned document vectors in a DataFrame for the classifiers below.
# Assumes vectors_docs is a 2-D array whose row order follows the integer tags
# assigned when the documents were tagged, so rows align with `labels` — TODO confirm.
df_docvecs = pd.DataFrame(docvecs.vectors_docs)

In [None]:
# Logistic regression on the raw Doc2Vec document vectors, 50/50 split.
X_train, X_test, y_train, y_test = train_test_split(
    df_docvecs, labels, test_size=0.5, random_state=0
)

lr = LogisticRegression(solver='newton-cg').fit(X_train, y_train)
lr.score(X_test, y_test)

In [None]:
# Same experiment with standardised Doc2Vec vectors.
X_train, X_test, y_train, y_test = train_test_split(
    df_docvecs, labels, test_size=0.5, random_state=0
)

# Fit the scaler on the training half, then overwrite both halves with their scaled versions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

lr = LogisticRegression(solver='newton-cg').fit(X_train, y_train)
lr.score(X_test, y_test)

# Bag-of-words

In [None]:
# Unigram bag-of-words (vocabulary capped at 100k terms) + logistic regression.
X_train, X_test, y_train, y_test = train_test_split(
    documents, labels, test_size=0.5, random_state=0
)

pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(max_features=100000, ngram_range=(1, 1))),
    ('classifier', LogisticRegression(solver='newton-cg')),
]).fit(X_train, y_train)

pipeline.score(X_test, y_test)

In [None]:
# Unigrams + bigrams (vocabulary capped at 100k terms) + logistic regression.
X_train, X_test, y_train, y_test = train_test_split(
    documents, labels, test_size=0.5, random_state=0
)

pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(max_features=100000, ngram_range=(1, 2))),
    ('classifier', LogisticRegression(solver='newton-cg')),
]).fit(X_train, y_train)

pipeline.score(X_test, y_test)

In [None]:
# Unigrams + bigrams with max-abs scaling of the counts before the classifier.
X_train, X_test, y_train, y_test = train_test_split(
    documents, labels, test_size=0.5, random_state=0
)

pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(max_features=100000, ngram_range=(1, 2))),
    ('scaler', MaxAbsScaler()),
    ('classifier', LogisticRegression(solver='newton-cg')),
]).fit(X_train, y_train)

pipeline.score(X_test, y_test)

In [None]:
# Unigrams + bigrams with no cap on the vocabulary; report how large it gets.
X_train, X_test, y_train, y_test = train_test_split(
    documents, labels, test_size=0.5, random_state=0
)

pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('scaler', MaxAbsScaler()),
    ('classifier', LogisticRegression(solver='newton-cg')),
]).fit(X_train, y_train)

fitted_vectorizer = pipeline.named_steps['vectorizer']
print(len(fitted_vectorizer.vocabulary_))
pipeline.score(X_test, y_test)

In [None]:
# Bigrams + trigrams only (no unigrams), vocabulary capped at 100k terms.
X_train, X_test, y_train, y_test = train_test_split(
    documents, labels, test_size=0.5, random_state=0
)

pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(max_features=100000, ngram_range=(2, 3))),
    ('scaler', MaxAbsScaler()),
    ('classifier', LogisticRegression(solver='newton-cg')),
]).fit(X_train, y_train)

fitted_vectorizer = pipeline.named_steps['vectorizer']
print(len(fitted_vectorizer.vocabulary_))
pipeline.score(X_test, y_test)

In [None]:
# Hashing trick instead of an explicit vocabulary: 2**22 hash buckets.
X_train, X_test, y_train, y_test = train_test_split(
    documents, labels, test_size=0.5, random_state=0
)

pipeline_hashed = Pipeline(steps=[
    ('vectorizer', HashingVectorizer(n_features=2**22)),
    ('classifier', LogisticRegression(solver='newton-cg')),
]).fit(X_train, y_train)

pipeline_hashed.score(X_test, y_test)

In [None]:
def auc(model, X, y):
    """Return the ROC AUC of a fitted probabilistic classifier on (X, y).

    Parameters
    ----------
    model : estimator exposing ``predict_proba``
    X : features accepted by ``model.predict_proba``
    y : true labels

    Returns
    -------
    float
        Value returned by ``roc_auc_score``.
    """
    y_score = model.predict_proba(X)
    # For a binary problem predict_proba yields one column per class, but
    # roc_auc_score expects a 1-D score for the positive class only; handing
    # it the full two-column matrix raises an error.
    if y_score.ndim == 2 and y_score.shape[1] == 2:
        y_score = y_score[:, 1]
    return roc_auc_score(y, y_score)

In [None]:
# Accuracy as a function of the number of hash buckets (2**1 ... 2**16).
X_train, X_test, y_train, y_test = train_test_split(
    documents, labels, test_size=0.5, random_state=0
)

sizes = [2 ** exponent for exponent in range(1, 17)]
scores = []

for v_size in sizes:
    pipeline_hashed = Pipeline(steps=[
        ('vectorizer', HashingVectorizer(n_features=v_size, ngram_range=(1, 2))),
        ('classifier', LogisticRegression(solver='newton-cg')),
    ]).fit(X_train, y_train)
    scores.append(pipeline_hashed.score(X_test, y_test))

# Test accuracy indexed by the number of hashed features.
sn = pd.Series(scores, index=sizes)
sn.plot.bar(xlabel="Vocabulary size", ylabel="Accuracy")

In [None]:
# Redraw the hashing-size accuracy chart on a larger canvas and export it as PDF.
ax = sn.plot.bar(xlabel="Vocabulary size", ylabel="Accuracy")
fig = ax.get_figure()
fig.set_size_inches(18.5, 10.5)

# savefig does not create missing folders, so make sure the target exists.
os.makedirs("img", exist_ok=True)
# Save through the figure handle rather than pyplot's implicit "current figure".
fig.savefig("img/figure.pdf")

In [None]:
# Accuracy of the uni+bigram count model for several vocabulary caps.
# NOTE(review): 1730358 is presumably the uncapped vocabulary size — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    documents, labels, test_size=0.5, random_state=0
)

sizes = [6000, 8000, 10000, 20000, 1730358]
scores = []

for vocab_cap in sizes:
    pipeline = Pipeline(steps=[
        ('vectorizer', CountVectorizer(max_features=vocab_cap, ngram_range=(1, 2))),
        ('classifier', LogisticRegression(solver='newton-cg')),
    ]).fit(X_train, y_train)
    scores.append(pipeline.score(X_test, y_test))

# Test accuracy indexed by vocabulary cap.
sc = pd.Series(scores, index=sizes)

In [None]:
# Reference run before dimensionality reduction: uni+bigram counts (100k cap),
# max-abs scaling, logistic regression.
X_train, X_test, y_train, y_test = train_test_split(
    documents, labels, test_size=0.5, random_state=0
)

pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(max_features=100000, ngram_range=(1, 2))),
    ('scaler', MaxAbsScaler()),
    ('classifier', LogisticRegression(solver='newton-cg')),
]).fit(X_train, y_train)

pipeline.score(X_test, y_test)

In [None]:
from sklearn.decomposition import TruncatedSVD

# Reduce the scaled count features to 1000 components before classifying.
# The step is called 'pca' but is a TruncatedSVD.
X_train, X_test, y_train, y_test = train_test_split(
    documents, labels, test_size=0.5, random_state=0
)

pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(max_features=60000, ngram_range=(1, 2))),
    ('scaler', MaxAbsScaler()),
    ('pca', TruncatedSVD(n_components=1000, n_iter=5, random_state=0)),
    ('classifier', LogisticRegression(solver='newton-cg')),
]).fit(X_train, y_train)

pipeline.score(X_test, y_test)

In [None]:
from sklearn.neural_network import MLPClassifier

# Small neural network on max-abs-scaled uni+bigram counts (10k vocabulary cap).
X_train, X_test, y_train, y_test = train_test_split(documents, labels, random_state=0, test_size=0.5)

pipeline = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2), max_features=10000)),
    ('scaler', MaxAbsScaler()),
    ('classifier', MLPClassifier(solver='lbfgs',
                                 alpha=1e-5,
                                 # One hidden layer of 100 units. The trailing comma
                                 # matters: (100) is just the int 100, not a tuple.
                                 hidden_layer_sizes=(100,),
                                 random_state=0)
        )
    ])

pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)