In [19]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, RegexpTokenizer 

from sklearn.metrics import roc_auc_score

In [2]:
def extract_reviews(path):
    filenames = os.listdir(path)
    reviews = []
    for filename in filenames:
        with open(path+filename) as f:
            reviews.append(f.read())
    df = pd.DataFrame((reviews),columns=['reviews'])
    return df

In [3]:
train_pos_df = extract_reviews('train/pos/')
train_neg_df = extract_reviews('train/neg/')

In [4]:
train_pos_df['sentiment'] = np.ones(len(train_pos_df), dtype=np.int8)
train_pos_df.head()           

Unnamed: 0,reviews,sentiment
0,For a movie that gets no respect there sure ar...,1
1,Bizarre horror movie filled with famous faces ...,1
2,"A solid, if unremarkable film. Matthau, as Ein...",1
3,It's a strange feeling to sit alone in a theat...,1
4,"You probably all already know this by now, but...",1


In [5]:
train_neg_df['sentiment'] = np.zeros(len(train_neg_df), dtype=np.int8)
train_neg_df.head()

Unnamed: 0,reviews,sentiment
0,Working with one of the best Shakespeare sourc...,0
1,"Well...tremors I, the original started off in ...",0
2,Ouch! This one was a bit painful to sit throug...,0
3,"I've seen some crappy movies in my life, but t...",0
4,"""Carriers"" follows the exploits of two guys an...",0


In [6]:
train_df = pd.concat([train_pos_df, train_neg_df],axis=0, ignore_index=True)
train_df.shape

(25000, 2)

- Shuffle the rows of the dataframe so that there is a random mix of postive and negative reviews. This step is necessary for performing cross-validation later.

In [11]:
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [14]:
train_df.head()

Unnamed: 0,reviews,sentiment
0,Great little thriller. I was expecting some ty...,1
1,"Nothing could have saved this movie, not even ...",0
2,This was a good movie. It wasn't your typical ...,1
3,From the pen of Richard Condon (The Manchurian...,0
4,I suppose that today this film has relevance b...,0


In [143]:
# similarly load the test dataset

test_pos_df = extract_reviews('test/pos/')
test_neg_df = extract_reviews('test/neg/')

test_pos_df['sentiment'] = np.ones(len(test_pos_df), dtype=np.int8)
test_neg_df['sentiment'] = np.zeros(len(test_neg_df), dtype=np.int8)

test_df = pd.concat([test_pos_df, test_neg_df],axis=0, ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)


In [145]:
y_test = test_df.sentiment
test_df = test_df.drop('sentiment',axis=1)
test_df.head()

Unnamed: 0,reviews
0,Yul Brynner was a symbol of villein in the tin...
1,This show has been performed live around the c...
2,To sum this story up in a few sentences: A tee...
3,This is absolutely beyond question the worst m...
4,A box with a button provides a couple with the...


In [146]:
train_reviews = train_df.reviews
test_reviews = test_df.reviews

In [24]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'([a-zA-Z]+)') 



def get_wordnet_pos(word):
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def preprocess_text(review):
    '''
    This function preprocesses the comments and sets them up for vectorization.
    Input: comment string
    Returns: A string after converting the words to lowercase, removing punctuations, and lemmatizing each word
    '''
    words = [word for word in tokenizer.tokenize(review.lower()) if not word in stop_words]  # convert to lowercase and remove stopwords
    clean_words = [word for word in words if len(word)>2]
    lemmatized_review = ' '.join([lemmatizer.lemmatize(word,pos= get_wordnet_pos(word)) for word in clean_words]) ## lemmatization
    return lemmatized_review  

In [25]:
train_reviews = train_reviews.apply(lambda review: preprocess_text(review))
test_reviews = test_reviews.apply(lambda review: preprocess_text(review))

In [131]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [175]:
tfidf_vectorizer = TfidfVectorizer(
    strip_accents= 'unicode', analyzer='word', max_df = 0.5, ngram_range=(1, 2), sublinear_tf=True,                                  
     max_features=30000)

In [260]:
train_features = tfidf_vectorizer.fit_transform(train_reviews)
test_features = tfidf_vectorizer.transform(test_reviews)

In [177]:
train_features.shape

(25000, 30000)

In [261]:
feature_names = tfidf_vectorizer.get_feature_names()

In [262]:
tfidf_scores = np.sum(train_features.A, axis=0,keepdims=False)
p=[]
for tag, tfidf_score in zip(feature_names, tfidf_scores):
    p.append((tag, tfidf_score))
    
tfidf_scores_df = pd.DataFrame(p,columns=['token', 'tf-idf score']).sort_values(by = 'tf-idf score', ascending=False)
tfidf_scores_df.head() ## top 5 tokens in the corpus by tf-idf score

Unnamed: 0,token,tf-idf score
10948,he,396.839791
11454,his,395.151154
25656,they,369.771231
25960,this movie,368.299053
22172,so,363.731333


In [263]:
y_train = train_df['sentiment']
#X_train, X_test, y_train, y_test = train_test_split(train_features, y, test_size=0.3, random_state=42)

In [183]:
def grid_search(clf, parameters, X, y, n_jobs=-1, n_folds=5, score_func=None,verbose=0):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func,verbose =verbose)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, cv=5, verbose =verbose)
    gs.fit(X, y)
    print ("Best parameter values: {} and best score = {}".format(gs.best_params_ , gs.best_score_))
    best = gs.best_estimator_
    return best

In [713]:
clf_mulNB = MultinomialNB()
parameters = {'alpha': np.arange(0.001, 0.01,0.001)}
mulNB_model = grid_search(clf_mulNB, parameters, train_features, y_train, n_folds=5, score_func='accuracy')

Best parameter values: {'alpha': 0.004} and best score = 0.8804


In [714]:
clf_mulNB_pred = mulNB_model.predict(test_features)

In [708]:
def show_metrics(y_test,y_pred):
    print(classification_report(y_test, y_pred ))
    display(pd.DataFrame(confusion_matrix(y_test, y_pred), 
                         columns= ['Predicted -ve', 'Predicted +ve'], index = ['Actual -ve', 'Actual +ve']))
    print('The AUC (under ROC curve) score is {}'.format(roc_auc_score(y_test, y_pred)))

In [715]:
show_metrics(y_test, clf_mulNB_pred)

              precision    recall  f1-score   support

           0       0.83      0.84      0.84     12500
           1       0.84      0.83      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000



Unnamed: 0,Predicted -ve,Predicted +ve
Actual -ve,10468,2032
Actual +ve,2077,10423


The AUC (under ROC curve) score is 0.8356399999999999


### Logistic regression

In [731]:
clf_logreg = LogisticRegression(solver='sag',random_state=42)
parameters = {'C': np.arange(1,12,1)}
logreg_model = grid_search(clf_logreg, parameters, train_features, y_train, n_folds=10, score_func='accuracy')

Best parameter values: {'C': 5} and best score = 0.90564


In [732]:
logreg_pred = logreg_model.predict(test_features)
show_metrics(y_test, logreg_pred)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, logreg_pred))

              precision    recall  f1-score   support

           0       0.88      0.86      0.87     12500
           1       0.87      0.88      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000



Unnamed: 0,Predicted -ve,Predicted +ve
Actual -ve,10805,1695
Actual +ve,1519,10981


The AUC (under ROC curve) score is 0.8714400000000001
0.87144


### Random forest

In [134]:
clf_rf = RandomForestClassifier(random_state=42,n_jobs=-1, min_samples_split=5) 
parameters = {'n_estimators': [700,1000], 'max_depth': [7,10]}
rfmodel = grid_search(clf_rf, parameters, train_features, y_train, n_folds=3)

BEST {'max_depth': 10, 'n_estimators': 1000} 0.84116


In [736]:
clf_rf = RandomForestClassifier(n_estimators=500, max_depth=10, random_state=42,n_jobs=-1, min_samples_split=5) 
clf_rf.fit(train_features,y_train)
rf_pred = clf_rf.predict(test_features)
show_metrics(y_test, rf_pred)

              precision    recall  f1-score   support

           0       0.93      0.57      0.71     12500
           1       0.69      0.96      0.80     12500

    accuracy                           0.77     25000
   macro avg       0.81      0.77      0.76     25000
weighted avg       0.81      0.77      0.76     25000



Unnamed: 0,Predicted -ve,Predicted +ve
Actual -ve,7152,5348
Actual +ve,520,11980


The AUC (under ROC curve) score is 0.7652800000000001


## Using word embeddings (word2vec)

The results from the different classifiers are comparable and acceptable. I think it might be possible to crank up the accuracy and other metrics (precision, recall) by tweaking hyperparameters a bit more, or by using other classification algorithms. But now it's time to experiment with Word2vec, a word embedding technique which provided a fresh impetus to the NLP community since the original paper by Mikolov et al. in 2013.

The following section is inspired by this [Kaggle tutorial](https://www.kaggle.com/c/word2vec-nlp-tutorial/overview/part-3-more-fun-with-word-vectors) on Word2vec.

In [529]:
from nltk.tokenize import sent_tokenize
s = 'Mr. Bean came home yesterday. Whoo! are you coming?'
sent_tokenize(s)

['Mr. Bean came home yesterday.', 'Whoo!', 'are you coming?']

In [590]:
def text_to_words( review, remove_stopwords=False ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    regex_tokenizer = RegexpTokenizer(r'([a-zA-Z]+)') 
    words = regex_tokenizer.tokenize(review.lower())
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    return words

In [591]:
from nltk.tokenize import sent_tokenize

def review_to_sentences(review, remove_stopwords=False ):
    '''
    Function to split a review into parsed sentences. 
    Returns a list of sentences, where each sentence is a list of words.
    '''
    sentences = sent_tokenize(review.strip())  # splits the paragraph in to sentences
    sentences_list = []
    for sentence in sentences:
        # If a sentence is empty, skip it
        if len(sentence) > 0:
        # Otherwise, call sentence_to_words to get a list of words
            sentences_list.append(text_to_words(sentence,remove_stopwords)) # split the sentence into words
    return sentences_list

In [549]:
sentences = []  # Initialize an empty list of sentences
for review in train_df["reviews"]:
    sentences += review_to_sentences(review)

for review in test_df["reviews"]:
    sentences += review_to_sentences(review)

In [653]:
# Import the built-in logging module and configure it so that Word2Vec 
import logging
import time
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)
    
# Set parameter values for word2vec model
num_features = 200    # Word vector dimensionality                      
min_word_count = 30   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
start = time.time()
model = word2vec.Word2Vec(sentences, workers=num_workers, 
            size=num_features, min_count = min_word_count, 
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)
end = time.time()
print('Took {} mins.'.format((end - start)/60))
# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

2019-08-25 17:56:31,167 : INFO : collecting all words and their counts
2019-08-25 17:56:31,178 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-08-25 17:56:31,351 : INFO : PROGRESS: at sentence #10000, processed 225235 words, keeping 17080 word types
2019-08-25 17:56:31,508 : INFO : PROGRESS: at sentence #20000, processed 447525 words, keeping 24202 word types
2019-08-25 17:56:31,662 : INFO : PROGRESS: at sentence #30000, processed 669440 words, keeping 28984 word types
2019-08-25 17:56:31,819 : INFO : PROGRESS: at sentence #40000, processed 893873 words, keeping 32874 word types
2019-08-25 17:56:31,960 : INFO : PROGRESS: at sentence #50000, processed 1115940 words, keeping 36470 word types
2019-08-25 17:56:32,121 : INFO : PROGRESS: at sentence #60000, processed 1341511 words, keeping 39534 word types
2019-08-25 17:56:32,254 : INFO : PROGRESS: at sentence #70000, processed 1569482 words, keeping 42470 word types
2019-08-25 17:56:32,392 : INFO : PROGRESS: 

2019-08-25 17:56:49,363 : INFO : EPOCH 1 - PROGRESS: at 94.89% examples, 796205 words/s, in_qsize 7, out_qsize 0
2019-08-25 17:56:49,880 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-08-25 17:56:49,893 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-08-25 17:56:49,901 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-08-25 17:56:49,915 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-25 17:56:49,916 : INFO : EPOCH - 1 : training on 11911303 raw words (8438083 effective words) took 10.6s, 794945 effective words/s
2019-08-25 17:56:50,924 : INFO : EPOCH 2 - PROGRESS: at 9.42% examples, 798631 words/s, in_qsize 7, out_qsize 0
2019-08-25 17:56:51,926 : INFO : EPOCH 2 - PROGRESS: at 18.90% examples, 801824 words/s, in_qsize 7, out_qsize 0
2019-08-25 17:56:52,933 : INFO : EPOCH 2 - PROGRESS: at 28.46% examples, 798991 words/s, in_qsize 7, out_qsize 0
2019-08-25 17:56:53,936 : INFO : EPOCH 2 - PRO

Took 1.0581315636634827 mins.


2019-08-25 17:57:35,000 : INFO : saved 300features_40minwords_10context


In [654]:
# Cosine similarity between any two words in the vocabulary
model.wv.similarity('movie', 'thriller')

0.44209871284575264

In [655]:
# Top 10 similar words
model.wv.most_similar('thriller')

[('drama', 0.7082782983779907),
 ('giallo', 0.6919577717781067),
 ('melodrama', 0.6373547911643982),
 ('farce', 0.6364995241165161),
 ('flick', 0.6363845467567444),
 ('yarn', 0.6272774934768677),
 ('suspense', 0.6271798610687256),
 ('mystery', 0.5976229906082153),
 ('thrillers', 0.5886460542678833),
 ('chiller', 0.5811585187911987)]

In [656]:
## The vectors are normalized
np.sum(model.wv['thriller']**2)

0.99999994

In [657]:
# No. of words in the vocabulary
len(model.wv.vocab)

15366

In [658]:
# Each row is the word vector for that word
model.wv.vectors.shape

(15366, 200)

**Now we would represent each piece of review with a vector as follows:**
1. Loop over each word in the review, if the word is in the vocabulary, get its word vector.
2. Sum all the word vectors found in this way and divide by the number of vectors.

In [659]:
def text_to_feature_vector(model,words):
    '''Returns the average of all the word vectors corresponding to a particular review'''
    vector_sum = np.zeros(num_features,)
    c=0
    for word in words:
        if word in model.wv.vocab:
            vector_sum  = vector_sum + model.wv[word]
            c = c+1
    vector_sum /= c
    return vector_sum

In [660]:
train_wordvec_features = np.empty((train_df.shape[0], num_features))
test_wordvec_features = np.empty((test_df.shape[0], num_features))

for row in range(train_df.shape[0]):  
    train_wordvec_features[row,:] = text_to_feature_vector(model, text_to_words(train_df['reviews'][row],remove_stopwords=True))

for row in range(test_df.shape[0]):
    test_wordvec_features[row,:] = text_to_feature_vector(model, text_to_words(test_df['reviews'][row], remove_stopwords=True))

print(train_wordvec_features.shape)
print(test_wordvec_features.shape)

(25000, 200)
(25000, 200)


**Logistic regression**

In [734]:
clf_logreg_word2vec = LogisticRegression(solver='sag',random_state=42)
parameters = {'C': np.arange(20,40,2)}
logreg_model_word2vec = grid_search(clf_logreg, parameters, train_wordvec_features, y_train, n_folds=10, score_func='accuracy')

Best parameter values: {'C': 38} and best score = 0.86688


In [735]:
logreg_pred_word2vec_tr = logreg_model_word2vec.predict(test_wordvec_features)
logreg_pred_word2vec_tr = LogisticRegression(C=100,solver='sag',random_state=42).fit(train_wordvec_features,y_train).predict(test_wordvec_features)
show_metrics(y_test,logreg_pred_word2vec_tr)
print(accuracy_score(y_test,logreg_pred_word2vec_tr))

              precision    recall  f1-score   support

           0       0.86      0.87      0.87     12500
           1       0.87      0.86      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000



Unnamed: 0,Predicted -ve,Predicted +ve
Actual -ve,10898,1602
Actual +ve,1723,10777


The AUC (under ROC curve) score is 0.867
0.867


**Misclassification example**

In [696]:
print('Predicted: %i'%logreg_pred_word2vec_tr[2])
print('Actual: %i'%y_test[2])

Predicted: 0
Actual: 1


In [690]:
print(test_df.iloc[2]['reviews'])

To sum this story up in a few sentences: A teenage girl (Amy) uses her hot body and "supposed" virginity to entice a young troubled guy (Matt) with a potential football scholarship to provide her a "Full Ride" out of town. Come to find out she has quite the reputation & has slept with many football players in the past hoping they would offer her the same deal. Both of these kids have come from troubled & dysfunctional homes. Matt's mothers a alcoholic who repeatedly embarrasses him in front of his friends & Amy's mother had a bad reputation herself & got pregnant with Amy at a a young age. Matt falls in love with Amy & tries to straighten out his life for her. Very predictable ending. The actress that plays "Amy" is actually 33 years old trying to play a teenager!


*To be honest, I don't see how the above review is actually classified as positive. I'd think it rather has negative connotations.*

**Random forest**

In [679]:
clf_rf_word2vec = RandomForestClassifier(n_estimators=600, max_depth=15, random_state=42,n_jobs=-1, min_samples_split=3) 
clf_rf_word2vec.fit(train_wordvec_features,y_train)
rf_pred_word2vec = clf_rf_word2vec.predict(test_wordvec_features)

show_metrics(y_test,rf_pred_word2vec)

              precision    recall  f1-score   support

           0       0.83      0.83      0.83     12500
           1       0.83      0.83      0.83     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



Unnamed: 0,Predicted -ve,Predicted +ve
Actual -ve,10407,2093
Actual +ve,2142,10358


The AUC (under ROC curve) score is 0.8306


# Neural nets: Keras