# Sentiment Analysis of Movie Reviews - Simple Bag of Words

In [9]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
from dask.multiprocessing import get
import utils

### Read Data

In [8]:
df = pd.read_csv('labeledTrainData.tsv', sep='\t')
df.info()
print(df.head())
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
sentiment    25000 non-null int64
review       25000 non-null object
dtypes: int64(1), object(2)
memory usage: 586.0+ KB
       id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
2  7759_3          0  The film starts with a manager (Nicholas Bell)...
3  3630_4          0  It must be assumed that those who praised this...
4  9495_8          1  Superbly trashy and wondrously unpretentious 8...


Unnamed: 0,sentiment
count,25000.0
mean,0.5
std,0.50001
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


## Parallel Preprocessing with DASK

In [13]:
def series_preprocess(series):
    return series.apply(utils.text_preprocess)

In [14]:
ddata = dd.from_pandas(df.review, npartitions=20)
ddata

Dask Series Structure:
npartitions=20
0        object
1250        ...
          ...  
23750       ...
24999       ...
Name: review, dtype: object
Dask Name: from_pandas, 20 tasks

In [15]:
%%time
res = ddata.map_partitions(series_preprocess).compute(get=get)

CPU times: user 9.64 s, sys: 259 ms, total: 9.9 s
Wall time: 2min 55s


In [22]:
df['review_preprocessed_tokenized'] = res

In [24]:
df['review_preprocessed'] = df['review_preprocessed_tokenized'].apply(lambda x: ' '.join(x) )

In [25]:
df.head()

Unnamed: 0,id,sentiment,review,review_preprocessed,review_preprocessed_tokenized
0,5814_8,1,With all this stuff going down at the moment w...,stuff go moment mj start listen music watch od...,"[stuff, go, moment, mj, start, listen, music, ..."
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",classic war world timothi hine veri entertain ...,"[classic, war, world, timothi, hine, veri, ent..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,film start manag nichola bell give welcom inve...,"[film, start, manag, nichola, bell, give, welc..."
3,3630_4,0,It must be assumed that those who praised this...,must assum prais film greatest film opera ever...,"[must, assum, prais, film, greatest, film, ope..."
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,superbl trashi wondrous unpretenti 80 exploit ...,"[superbl, trashi, wondrous, unpretenti, 80, ex..."


In [26]:
df.to_pickle('reviews.pkl')

In [27]:
df = pd.read_pickle('reviews.pkl')

In [28]:
df.head()

Unnamed: 0,id,sentiment,review,review_preprocessed,review_preprocessed_tokenized
0,5814_8,1,With all this stuff going down at the moment w...,stuff go moment mj start listen music watch od...,"[stuff, go, moment, mj, start, listen, music, ..."
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",classic war world timothi hine veri entertain ...,"[classic, war, world, timothi, hine, veri, ent..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,film start manag nichola bell give welcom inve...,"[film, start, manag, nichola, bell, give, welc..."
3,3630_4,0,It must be assumed that those who praised this...,must assum prais film greatest film opera ever...,"[must, assum, prais, film, greatest, film, ope..."
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,superbl trashi wondrous unpretenti 80 exploit ...,"[superbl, trashi, wondrous, unpretenti, 80, ex..."


## Bag of Words

In [70]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [75]:
X_train, X_test, y_train, y_test = train_test_split(df['review_preprocessed'], df['sentiment'], test_size=0.2, random_state=42) 

In [49]:
count_vectorizer = CountVectorizer()

In [50]:
sparse_matrix = count_vectorizer.fit_transform(X_train)

In [51]:
sparse_matrix.get_shape()

(20000, 46690)

In [52]:
from scipy.sparse import csr_matrix, csc_matrix

In [53]:
csr_matrix.todense(sparse_matrix)

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [76]:
len(count_vectorizer.vocabulary_)

46690

In [60]:
X_train_split = X_train.apply(lambda x: x.split())

In [64]:
words, counts = utils.wordcount_corpus(X_train_split)

In [67]:
word_counts_df = pd.DataFrame({'word': words, 'count': counts}).sort_values(by='count', ascending=False)
word_counts_df.head()

Unnamed: 0,count,word
27726,41381,movi
14840,38650,film
29664,22221,one
24132,18247,like
41600,12817,time


In [139]:
simple_pipe = Pipeline([
    ('count_vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

In [140]:
simple_pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('count_vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [141]:
simple_pipe.score(X_test, y_test)

0.8542

### TF-IDF (Term Frequency - Inverse Document Frequency)

Words appearing very often in the whole corpus do not carry a lot of information, e.g. the word Movie or Film in our corpus.

- numerical statistic
- should reflect how important a word in a document is by considering the whole corpus
- can be seen as a weighting factor
- increases by increasing wordcount of the word in the document
- offset by frequency of the word in the whole corpus
- most used term-weighting scheme


Product of two statistics:
\begin{align}
tf\text{-}idf(t,d) = tf(t,d) \times idf(t)
\end{align}

where tf(t,d) is the frequency of the term/word t in document d and

\begin{align}
idf(t) = \ln\left(\frac{n_d}{df(d,t)}\right)
\end{align}

is the logarithmically scaled inverse fraction of the documents that contain the word, obtained by dividing the total number of documents $n_d$ by the number of documents containing the term $df$, and then taking the logarithm of that quotient. Most often it is smoothed by adding a one in the denominator and add a final one, leading to:

\begin{align}
idf(t) = \ln\left(\frac{n_d}{df(d,t)}\right) + 1
\end{align}

Finally, the resulting vector of a weighted wordcount vector in normalized using L2 norm (normalized to length 1).

For more information see: http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

### Example

In [103]:
corpus = [
    "The brown brown brown fox",
    "The lazy dog",
    "The fox is brown",
    "black is a color",
]

corpus

['The brown brown brown fox',
 'The lazy dog',
 'The fox is brown',
 'black is a color']

In [104]:
sparse_matrix = count_vectorizer.fit_transform(corpus)

In [105]:
corpus_counts = csr_matrix.todense(sparse_matrix)
corpus_counts

matrix([[0, 3, 0, 0, 1, 0, 0, 1],
        [0, 0, 0, 1, 0, 0, 1, 1],
        [0, 1, 0, 0, 1, 1, 0, 1],
        [1, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)

In [106]:
count_vectorizer.get_feature_names()

['black', 'brown', 'color', 'dog', 'fox', 'is', 'lazy', 'the']

In [125]:
tfidf_vectorizer = TfidfVectorizer(smooth_idf=False)

In [126]:
sparse_matrix = tfidf_vectorizer.fit_transform(corpus)

In [127]:
corpus_weighted_counts = csr_matrix.todense(sparse_matrix)
corpus_weighted_counts

matrix([[0.        , 0.92238296, 0.        , 0.        , 0.30746099,
         0.        , 0.        , 0.23383201],
        [0.        , 0.        , 0.        , 0.66064766, 0.        ,
         0.        , 0.66064766, 0.3564959 ],
        [0.        , 0.52863461, 0.        , 0.        , 0.52863461,
         0.52863461, 0.        , 0.40204024],
        [0.63202178, 0.        , 0.63202178, 0.        , 0.        ,
         0.44843834, 0.        , 0.        ]])

In [128]:
tfidf_vectorizer.get_feature_names()

['black', 'brown', 'color', 'dog', 'fox', 'is', 'lazy', 'the']

In [129]:
# brown
import math
tf = 3
nd = 4
df = 2 

idf = math.log(nd/df) + 1
tfidf_brown = tf * idf
tfidf_brown

5.079441541679836

In [130]:
# The
import math
tf = 1
nd = 4
df = 3 

idf = math.log(nd/df) + 1
tfidf_the = tf * idf
tfidf_the

1.2876820724517808

In [131]:
# fox
import math
tf = 1
nd = 4
df = 2 

idf = math.log(nd/df) + 1
tfidf_fox = tf * idf
tfidf_fox

1.6931471805599454

In [132]:
vec = np.array([0,tfidf_brown,0,0,0,tfidf_fox, 0, 0, tfidf_the])
vec

array([0.        , 5.07944154, 0.        , 0.        , 0.        ,
       1.69314718, 0.        , 0.        , 1.28768207])

In [133]:
from numpy.linalg import norm
norm(vec)
vec/np.sqrt(np.dot(vec,vec))

array([0.        , 0.92238296, 0.        , 0.        , 0.        ,
       0.30746099, 0.        , 0.        , 0.23383201])

In [138]:
tfidf_pipe = Pipeline([
    ('tfidf_vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])

In [142]:
tfidf_pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf_vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_...      vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [143]:
tfidf_pipe.score(X_test, y_test)

0.8564

In [144]:
from sklearn.cross_validation import cross_val_score

In [151]:
simple_scores = cross_val_score(simple_pipe, X_train, y_train, cv=10)

In [152]:
tfidf_scores = cross_val_score(tfidf_pipe, X_train, y_train, cv=10)

In [153]:
np.mean(simple_scores), np.std(simple_scores)

(0.8554493931248481, 0.00875385021170826)

In [154]:
np.mean(tfidf_scores), np.std(tfidf_scores)

(0.8610494185123546, 0.009206620423056443)

In [155]:
from sklearn.metrics import classification_report

In [158]:
print(classification_report(y_test, simple_pipe.predict(X_test)))

             precision    recall  f1-score   support

          0       0.84      0.87      0.86      2481
          1       0.87      0.84      0.85      2519

avg / total       0.85      0.85      0.85      5000



In [159]:
print(classification_report(y_test, tfidf_pipe.predict(X_test)))

             precision    recall  f1-score   support

          0       0.85      0.86      0.86      2481
          1       0.86      0.85      0.86      2519

avg / total       0.86      0.86      0.86      5000



Only slightly better since we have already removed lots of stop words :). However, the dimension of the feature space is of the size of the vocabulary and hence very large. There is another way to get a representation in a smaller vector space. This method is called word2vec.