# Sentiment Analysis with Word2Vec

We are going to work with the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/).

Maas et al. (2011), "Learning Word Vectors for Sentiment Analysis".

This is a collection of user-generated movie reviews, each labelled as POSITIVE or NEGATIVE.

# Download and Prepare Data

In [None]:
from pathlib import Path

import requests

IMDB_URL = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
IMDB_ARCHIVE = Path('imdb.tar.gz')

# Only download when the archive is not already on disk, so that
# "Restart Kernel -> Run All" does not fetch the whole dataset again.
if not IMDB_ARCHIVE.exists():
    # Write to a temporary name first and rename at the end: an interrupted
    # download must never leave a truncated 'imdb.tar.gz' that the cache
    # check above would later mistake for a complete archive.
    partial = IMDB_ARCHIVE.with_name(IMDB_ARCHIVE.name + '.part')

    # stream=True avoids holding the entire archive in memory (as r.content
    # would); the timeout keeps a stalled connection from hanging the kernel.
    with requests.get(IMDB_URL, stream=True, timeout=60) as r:
        # Raises requests.HTTPError with the status and URL on 4xx/5xx,
        # which is more informative than a bare AssertionError.
        r.raise_for_status()
        with open(partial, 'wb') as out:
            for chunk in r.iter_content(chunk_size=1 << 20):
                out.write(chunk)

    partial.replace(IMDB_ARCHIVE)

In [None]:
import tarfile
import re

from tqdm.notebook import tqdm

# Labelled reviews live at aclImdb/<split>/<label>/<id>.txt inside the archive;
# the named groups pull those three path components out of each member name.
filename = re.compile(r'aclImdb/(?P<split>train|test)/(?P<label>neg|pos)/(?P<id>[0-9_]+)\.txt$')

# One dict per matching review file, collected in archive order.
data = []

with tarfile.open('imdb.tar.gz', 'r:gz') as tgz:
    for member in tqdm(tgz.getmembers()):
        # Directories and other non-regular entries carry no review text.
        if not member.isfile():
            continue

        # Anything that does not match the pattern above is skipped.
        match = filename.match(member.name)
        if match is None:
            continue

        raw = tgz.extractfile(member).read()
        data.append({
            'id': match['id'],
            'split': match['split'],
            'text': raw.decode('utf-8'),
            'label': match['label'],
        })

HBox(children=(FloatProgress(value=0.0, max=100019.0), HTML(value='')))




In [None]:
import pandas as pd

# One row per parsed review; the columns come from the keys of the dicts
# collected in `data`: id, split, text, label.
df = pd.DataFrame(data)
# Preview the first rows to sanity-check the parsed fields.
df.head()

Unnamed: 0,id,split,text,label
0,127_3,test,I love sci-fi and am willing to put up with a ...,neg
1,126_4,test,"Worth the entertainment value of a rental, esp...",neg
2,125_3,test,its a totally average film with a few semi-alr...,neg
3,124_2,test,STAR RATING: ***** Saturday Night **** Friday ...,neg
4,123_4,test,"First off let me say, If you haven't enjoyed a...",neg


In [None]:
# Partition the reviews using the split recorded in each file's archive path.
split_col = df['split']

train = df.loc[split_col.eq('train')]
test = df.loc[split_col.eq('test')]

# Held-out inputs (raw review text) and targets ('neg' / 'pos').
X_test, y_test = test['text'], test['label']

# Word2Vec

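We will use the pre-trained Word2Vec vectors provided by Google (300-dimensional, trained on Google News). Each review is represented as the TF-IDF-weighted sum of the vectors of its words, and a logistic regression classifier is trained on these document vectors.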

In [None]:
# Dimensionality of each word embedding. It sets the number of columns of the
# `word_vecs` matrix and must match the vectors of the
# 'word2vec-google-news-300' model that are copied into its rows.
DIMS = 300

In [None]:
import gensim.downloader as api
# Load the pre-trained vectors by name through gensim's downloader API; the
# result is indexed by word (`model[w]`) when the embedding matrix is built.
# NOTE(review): presumably this downloads a large file on first use and caches
# it locally - confirm before running on a constrained machine.
model = api.load('word2vec-google-news-300')



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Training inputs (raw review text) and targets ('neg' / 'pos').
X_train, y_train = train['text'], train['label']

# Vocabulary pruning: drop English stop words, terms seen in fewer than 2
# reviews, and terms seen in more than 90% of reviews, then cap the
# vocabulary at 20000 terms.
tfidf_params = dict(
    stop_words='english',
    min_df=2,
    max_df=0.9,
    max_features=20000,
)
tfidf = TfidfVectorizer(**tfidf_params)

# Fit on the training reviews only, so the vocabulary and IDF weights are
# learned without looking at the test set.
tfidf.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=20000,
                min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [None]:
import numpy as np

# Feature terms ordered by their TF-IDF column index, so that row i of
# `word_vecs` lines up with column i of the matrices produced by
# `tfidf.transform`.
# The ordering is derived from `vocabulary_` (term -> column index) rather
# than `get_feature_names()`, which was deprecated in scikit-learn 1.0 and
# removed in 1.2; `vocabulary_` is available in both old and new releases.
vocab = sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)

# Embedding matrix of shape (vocabulary size, DIMS), one row per term.
word_vecs = np.zeros((len(vocab), DIMS))

for i, w in enumerate(vocab):
    try:
        word_vecs[i, :] = model[w]
    except KeyError:
        # The term has no pre-trained vector: deliberately keep its row
        # all-zero so it contributes nothing to the document vectors.
        pass

In [None]:
# TF-IDF bag-of-words matrices for both splits, built with the vectorizer
# that was fitted on the training reviews.
X_train_bow, X_test_bow = (
    tfidf.transform(texts) for texts in (X_train, X_test)
)

# Project each review into embedding space: every document vector is the
# TF-IDF-weighted sum of the word vectors of its terms.
X_train_vecs, X_test_vecs = (
    bow.dot(word_vecs) for bow in (X_train_bow, X_test_bow)
)

In [None]:
from sklearn.linear_model import LogisticRegression

# max_iter must be an integer: the float literal 1e4 is rejected by the
# parameter validation in recent scikit-learn releases, and is echoed back
# as 10000.0 by older ones. The generous cap gives the solver room to
# converge on the dense document vectors.
clf = LogisticRegression(max_iter=10000)
clf.fit(X_train_vecs, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000.0,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
from sklearn.metrics import classification_report

# Score the held-out reviews and summarise per-class precision, recall and F1.
y_pred = clf.predict(X_test_vecs)
report = classification_report(y_true=y_test, y_pred=y_pred)
# The report is a pre-formatted multi-line string, so print it rather than
# relying on the cell's repr (which would show escaped newlines).
print(report)

              precision    recall  f1-score   support

         neg       0.84      0.84      0.84     12500
         pos       0.84      0.84      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000

