In [2]:
import gensim.downloader as api
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader API.
# NOTE(review): presumably this fetches the model on first use and reads a local cache afterwards — confirm.
wv = api.load('word2vec-google-news-300')

In [3]:
# Similarity score between the embeddings stored under the keys 'Master' and 'Anakin'.
wv.similarity('Master', 'Anakin')

0.18861441

In [4]:
# Analogy-style query: 'Darth_Vader' and 'Jedi' contribute positively, 'Sith' negatively;
# the result lists the closest vocabulary entries with their scores.
wv.most_similar(positive=['Darth_Vader','Jedi'], negative=['Sith'])

[('Yoda', 0.553029477596283),
 ('Darth', 0.5522363781929016),
 ('Obi_Wan_Kenobi', 0.5342901945114136),
 ('Sith_Lord', 0.5264482498168945),
 ('Jedi_Master', 0.5176643133163452),
 ('Han_Solo', 0.517648458480835),
 ('Luke_Skywalker', 0.5108938813209534),
 ('Hans_Solo', 0.5073797106742859),
 ('Darth_Vadar', 0.5068924427032471),
 ('Obi_Wan', 0.5052575469017029)]

# Movie Sentiment Analysis using Gensim for word embeddings

In [5]:
import pandas as pd
import spacy

In [6]:
# Read the pre-cleaned IMDB review dataset (path is relative to the working directory) and preview it.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns in the output look like index columns
# saved by earlier to_csv calls — confirm, and consider dropping them.
df_imdb = pd.read_csv('../data/imdb/clean_imdb_dataset.csv')
df_imdb.head()

Unnamed: 0.1,Unnamed: 0,review,sentiment,sentiment_,clean_rev
0,0,One of the other reviewers has mentioned that ...,positive,1,reviewer mention watch 1 oz episode hook right...
1,1,A wonderful little production. <br /><br />The...,positive,1,wonderful little production realism come home ...
2,2,I thought this was a wonderful way to spend ti...,positive,1,think wonderful way spend time hot summer week...
3,3,Basically there's a family where a little boy ...,negative,0,basically family little boy jake think zombie ...
4,4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,petter mattei love time money visually stunnin...


In [7]:
# Remove rows containing missing values; inplace=True mutates df_imdb itself rather than returning a new frame.
df_imdb.dropna(inplace=True)

In [8]:
# Show the cleaned text of the review stored under index 0 as a sanity check of the preprocessing.
df_imdb.clean_rev[0]

'reviewer mention watch 1 oz episode hook right exactly happen me.i main appeal fact go show dare forget pretty picture paint mainstream audience forget charm forget romance oz mess episode see strike nasty surreal ready watch develop taste oz get accustom high level graphic violence violence injustice crooked guard sell nickel inmate kill order away mannered middle class inmate turn prison bitch lack street skill prison experience watch oz comfortable uncomfortable view that touch dark'

In [9]:
# Load the spaCy "en_core_web_lg" pipeline; gensim_vectorize below uses it to tokenize and lemmatize text.
nlp = spacy.load("en_core_web_lg")

def gensim_vectorize(text):
    """Embed a text as the mean of its lemmas' word2vec vectors.

    The text is run through the spaCy pipeline ``nlp``, every token is
    replaced by its lemma, and the lemmas are averaged with
    ``wv.get_mean_vector``.

    Parameters
    ----------
    text : str
        Text to embed (here: one cleaned review).

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``wv.vector_size``. An all-zero vector is
        returned when the text produces no tokens.
    """
    import numpy as np  # local import: the notebook only imports numpy in a later cell

    doc = nlp(text)
    text_pre = [token.lemma_ for token in doc]
    if not text_pre:
        # get_mean_vector raises ValueError for an empty key list; returning zeros keeps a
        # single empty review from aborting the whole DataFrame.apply run.
        return np.zeros(wv.vector_size, dtype=wv.vectors.dtype)
    return wv.get_mean_vector(text_pre)
# Compute one embedding per cleaned review and store it in a new 'vector' column, then display the frame.
# NOTE(review): gensim_vectorize calls the spaCy pipeline once per row, so this is likely slow — confirm runtime.
df_imdb['vector'] = df_imdb['clean_rev'].apply(gensim_vectorize)
df_imdb

Unnamed: 0.1,Unnamed: 0,review,sentiment,sentiment_,clean_rev,vector
0,0,One of the other reviewers has mentioned that ...,positive,1,reviewer mention watch 1 oz episode hook right...,"[0.014170552, 0.0034130015, 0.0020986407, 0.03..."
1,1,A wonderful little production. <br /><br />The...,positive,1,wonderful little production realism come home ...,"[0.03674541, 0.024681434, 0.0028873654, 0.0313..."
2,2,I thought this was a wonderful way to spend ti...,positive,1,think wonderful way spend time hot summer week...,"[0.024172936, 0.019199688, 0.003957791, 0.0389..."
3,3,Basically there's a family where a little boy ...,negative,0,basically family little boy jake think zombie ...,"[0.01918685, -0.007440765, -0.002525709, 0.040..."
4,4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,petter mattei love time money visually stunnin...,"[0.01805561, 0.010634208, 0.0034889076, 0.0351..."
...,...,...,...,...,...,...
49995,49995,I thought this movie did a down right good job...,positive,1,think movie right good job creative original e...,"[0.025032282, 0.015866827, 0.0063926186, 0.072..."
49996,49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0,bad plot bad dialogue bad acting idiotic direc...,"[0.021711292, 0.011525045, 0.012290539, 0.0490..."
49997,49997,I am a Catholic taught in parochial elementary...,negative,0,catholic teach parochial elementary school nun...,"[0.03212715, 0.003841062, 0.030124124, 0.04505..."
49998,49998,I'm going to have to disagree with the previou...,negative,0,go disagree previous comment maltin second rat...,"[0.028778354, 0.020742947, 0.0009850833, 0.028..."


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# MultinomialNB (fitted further down) does not accept negative feature values, so the
# embeddings are rescaled with a MinMaxScaler; a StandardScaler would still yield negatives.
scaler = MinMaxScaler()

# Stratified hold-out split: 33% of the reviews go to the test set, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df_imdb['vector'],
    df_imdb['sentiment_'],
    stratify=df_imdb['sentiment_'],
    test_size=0.33,
    random_state=42,
)

In [20]:
import numpy as np 

# Each split holds one embedding per row; stack them into dense 2-D arrays.
X_train_2d, X_test_2d = (np.stack(split) for split in (X_train, X_test))

# Learn the scaling from the training rows only, then apply it to both splits.
scaler.fit(X_train_2d)
X_train, X_test = scaler.transform(X_train_2d), scaler.transform(X_test_2d)

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


# Fit a multinomial Naive Bayes classifier on the scaled training features.
clf = MultinomialNB().fit(X_train, y_train)

# Score the held-out reviews and print per-class precision / recall / F1.
y_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.76      0.74      0.75      8247
           1       0.75      0.76      0.75      8248

    accuracy                           0.75     16495
   macro avg       0.75      0.75      0.75     16495
weighted avg       0.75      0.75      0.75     16495



In [22]:
from sklearn.svm import SVC
# Fit a support-vector classifier (gamma='auto') on the same scaled training features.
clf = SVC(gamma='auto').fit(X_train, y_train)

# Score the held-out reviews and print per-class precision / recall / F1.
y_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))


              precision    recall  f1-score   support

           0       0.80      0.79      0.79      8247
           1       0.79      0.80      0.80      8248

    accuracy                           0.79     16495
   macro avg       0.79      0.79      0.79     16495
weighted avg       0.79      0.79      0.79     16495

