In [42]:
import re
import string

import numpy as np
import pandas as pd
import sklearn as sk
# from nltk.corpus import stopwords

from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import strip_non_alphanum, strip_numeric
from gensim.test.utils import get_tmpfile
import gensim.downloader as api

from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

In [26]:
df = pd.read_csv("imdb.csv")
labels = LabelEncoder().fit_transform(df['sentiment'])
df['review']

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [34]:
from nltk.tokenize import word_tokenize


def preprocessor(text):
    '''
    Make text lowercase, remove text in square brackets,remove links,remove special characters
    and remove words containing numbers.
    '''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) # remove special chars
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    
    return text


documents = df['review'].apply(preprocessor)
documents

0        one of the other reviewers has mentioned that ...
1        a wonderful little production   br    br   the...
2        i thought this was a wonderful way to spend ti...
3        basically there s a family where a little boy ...
4        petter mattei s  love in the time of money  is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot  bad dialogue  bad acting  idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i m going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [28]:
tokens = documents.apply(word_tokenize)
tokens

0        [one, of, the, other, reviewers, has, mentione...
1        [a, wonderful, little, production, br, br, the...
2        [i, thought, this, was, a, wonderful, way, to,...
3        [basically, there, s, a, family, where, a, lit...
4        [petter, mattei, s, love, in, the, time, of, m...
                               ...                        
49995    [i, thought, this, movie, did, a, down, right,...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, am, a, catholic, taught, in, parochial, el...
49998    [i, m, going, to, have, to, disagree, with, th...
49999    [no, one, expects, the, star, trek, movies, to...
Name: review, Length: 50000, dtype: object

# FastText pretrained

In [4]:
wv = KeyedVectors.load_word2vec_format("gensim-data/wiki-news-300d-1M.vec", binary=False)
wv

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f3db8114a60>

In [12]:
# r_vecs = np.array([wv[word] for word in r if word in wv.vocab])

# List of strings -> ndarray
def embed(doc):
    sum = np.zeros(300)
    for word in doc:
        if word in wv.vocab:
            sum += wv.word_vec(word, use_norm=False)
    return sum/len(doc)

In [13]:
# List of strings -> ndarray
def embed_norm(doc):
    sum = np.zeros(300)
    for word in doc:
        if word in wv.vocab:
            sum += wv.word_vec(word, use_norm=True)
    return sum/len(doc)

In [29]:
# Series of lists of strings ->  Series of ndarrays
features = tokens.apply(embed)
# features_norm = tokens.apply(embed_norm)

# Series of ndarrays -> DataFrame
features = pd.DataFrame(features.values.tolist(), index=features.index)
# features_norm = pd.DataFrame(features_norm.values.tolist(), index=features_norm.index)

In [54]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=0, test_size=0.5)
# X_train, X_test, y_train, y_test = train_test_split(features_norm, labels, random_state=0, test_size=0.5)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)
X_train.shape, y_train.shape

((25000, 300), (25000,))

In [55]:
# y_pred = lr.predict(X_test)
# accuracy_score(y_test, y_pred)

In [56]:
lr = LogisticRegression(solver='newton-cg')
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.83716

In [57]:
lr = LogisticRegression(solver='newton-cg')
lr.fit(X_train_scaled, y_train)
lr.score(X_test_scaled, y_test)

0.861

In [52]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=0, test_size=0.2)
# X_train, X_test, y_train, y_test = train_test_split(features_norm, labels, random_state=0, test_size=0.5)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)
X_train.shape, y_train.shape

((40000, 300), (40000,))

In [53]:
lr = LogisticRegression(solver='newton-cg')
lr.fit(X_train_scaled, y_train)
lr.score(X_test_scaled, y_test)

0.8581

# Doc2Vec

In [None]:
tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokens.tolist())]

model = Doc2Vec(vector_size=300, window=5, min_count=2, workers=-1)
model.build_vocab(tagged_documents)
model.train(tagged_documents, total_examples=model.corpus_count, epochs=20)

words = model.wv
docvecs = model.docvecs

print(f"{len(words.vocab)=}")

In [None]:
df_docvecs = pd.DataFrame(docvecs.vectors_docs)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_docvecs, labels, random_state=0, test_size=0.5)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)
X_train.shape, y_train.shape

In [None]:
lr = LogisticRegression(solver='newton-cg')
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

In [None]:
lr = LogisticRegression(solver='newton-cg')
lr.fit(X_train_scaled, y_train)
lr.score(X_test_scaled, y_test)

# Bag-of-words

In [36]:
X_train, X_test, y_train, y_test = train_test_split(documents, labels, random_state=0, test_size=0.5)
X_train.shape, y_train.shape

((25000,), (25000,))

In [48]:
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2), max_features=100000)),
    ('classifier', LogisticRegression(solver='newton-cg'))
    ])

pipeline.fit(X_train, y_train)
print(len(pipeline['vectorizer'].vocabulary_))
pipeline.score(X_test, y_test)

AttributeError: 'HashingVectorizer' object has no attribute 'vocabulary_'

In [40]:
pipeline_scaled = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2), max_features=100000)),
    # ('scaler', StandardScaler()), # not working
    ('classifier', LogisticRegression(solver='newton-cg'))
    ])

pipeline_scaled.fit(X_train, y_train)
print(len(pipeline_scaled['vectorizer'].vocabulary_))
pipeline_scaled.score(X_test, y_test)

ValueError: Cannot center sparse matrices: pass `with_mean=False` instead. See docstring for motivation and alternatives.

In [None]:
pipeline_hashed = Pipeline([
    ('vectorizer', HashingVectorizer(n_features=2**20)),
    ('classifier', LogisticRegression(solver='newton-cg'))
    ])

pipeline_hashed.fit(X_train, y_train)
pipeline_hashed.score(X_test, y_test)

In [49]:
X_train, X_test, y_train, y_test = train_test_split(documents, labels, random_state=0, test_size=0.2)
X_train.shape, y_train.shape

((40000,), (40000,))

In [51]:
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2), max_features=100000)),
    ('classifier', LogisticRegression(solver='newton-cg'))
    ])

pipeline.fit(X_train, y_train)
print(len(pipeline['vectorizer'].vocabulary_))
pipeline.score(X_test, y_test)

100000


0.9016