# Sentiment Analysis 

# Preprocessing movie dataset into more convenient format

In [84]:
import pyprind 
import pandas as pd  
import os

In [85]:
basepath = './data/aclImdb' 

/Users/krish.mahajan@ibm.com/Desktop/Documents_Desktop/Github/amazon-movie-reviews

In [44]:
# Read every review file under basepath/{test,train}/{pos,neg} into one DataFrame
# with the columns 'review' (raw text) and 'sentiment' (1 = pos, 0 = neg).
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)  # advanced once per review file read
# Rows are collected in a plain list and the frame is built once at the end:
# DataFrame.append copied the whole frame on every call (quadratic cost) and
# was removed in pandas 2.0.
rows = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            rows.append([txt, labels[l]])
            pbar.update()
df = pd.DataFrame(rows, columns=['review', 'sentiment'])

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:03:06


In [45]:
df.head()

Unnamed: 0,review,sentiment
0,"Based on an actual story, John Boorman shows t...",1
1,This is a gem. As a Film Four production - the...,1
2,"I really like this show. It has drama, romance...",1
3,This is the best 3-D experience Disney has at ...,1
4,"Of the Korean movies I've seen, only three had...",1


In [46]:
df.tail()

Unnamed: 0,review,sentiment
49995,"My comments may be a bit of a spoiler, for wha...",0
49996,"The ""saucy"" misadventures of four au pairs who...",0
49997,"Oh, those Italians! Assuming that movies about...",0
49998,Eight academy nominations? It's beyond belief....,0
49999,"Not that I dislike childrens movies, but this ...",0


In [47]:
df.iloc[1,:]

review       This is a gem. As a Film Four production - the...
sentiment                                                    1
Name: 1, dtype: object

## Shuffling the dataframe with a random permutation and creating the final data CSV

In [48]:
type.mro(type(df.index))

[pandas.core.indexes.range.RangeIndex,
 pandas.core.indexes.numeric.Int64Index,
 pandas.core.indexes.numeric.IntegerIndex,
 pandas.core.indexes.numeric.NumericIndex,
 pandas.core.indexes.base.Index,
 pandas.core.base.IndexOpsMixin,
 pandas.core.base.PandasObject,
 pandas.core.base.StringMixin,
 pandas.core.accessor.DirNamesMixin,
 object]

In [49]:
import numpy as np

# Shuffle the rows with a fixed seed so the order is reproducible, then write
# the shuffled reviews to disk without the index column.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
df.to_csv('./data/movie_data.csv', index=False, encoding='utf-8')

## Reading the final data file 

In [86]:
df = pd.read_csv('./data/movie_data.csv' , encoding = 'utf-8')

In [87]:
df.head(3)

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0


# Text Data --> Word Vectors
## 1. Bag-of-words model (1-gram)

In [88]:
# Toy corpus of three short documents used to illustrate the bag-of-words counts.
from sklearn.feature_extraction.text import CountVectorizer

docs = np.array([
    'The sun is shinning',
    'The weather is sweet',
    'The sun is shinning and weather is sweet',
])
count = CountVectorizer()
# Learn the vocabulary and build the document-term count matrix in one step.
bag = count.fit_transform(docs)

In [37]:
print(count.vocabulary_)

{'the': 5, 'sun': 3, 'is': 1, 'shinning': 2, 'weather': 6, 'sweet': 4, 'and': 0}


In [38]:
print(bag.toarray())

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 1 1]]


 ### 1-gram --> "the" , "sun" , "is" , "shining"   
 ### 2-gram ---> "the sun" , "sun is" , "is shining"

## 2. Word relevancy via term frequency-inverse document frequency 
#### **tf-idf(t,d) = tf(t,d) × idf(t,d)**

In [52]:
from sklearn.feature_extraction.text import TfidfTransformer

# Two decimals are enough to read the weights of the toy corpus.
np.set_printoptions(precision=2)
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
# Raw term counts first, then re-weighted and L2-normalised by the transformer.
term_counts = count.fit_transform(docs)
print(tfidf.fit_transform(term_counts).toarray())

[[0.   0.43 0.56 0.56 0.   0.43 0.  ]
 [0.   0.43 0.   0.   0.56 0.43 0.56]
 [0.44 0.53 0.34 0.34 0.34 0.26 0.34]]


#  Text Cleaning & processing text into documents & stop words removal

In [53]:
df.loc[0 , 'review']

'My family and I normally do not watch local movies for the simple reason that they are poorly made, they lack the depth, and just not worth our time.<br /><br />The trailer of "Nasaan ka man" caught my attention, my daughter in law\'s and daughter\'s so we took time out to watch it this afternoon. The movie exceeded our expectations. The cinematography was very good, the story beautiful and the acting awesome. Jericho Rosales was really very good, so\'s Claudine Barretto. The fact that I despised Diether Ocampo proves he was effective at his role. I have never been this touched, moved and affected by a local movie before. Imagine a cynic like me dabbing my eyes at the end of the movie? Congratulations to Star Cinema!! Way to go, Jericho and Claudine!!'

In [89]:
# data cleaning
import re

# Raw-string patterns: '\)' and '\W' inside ordinary string literals are invalid
# escape sequences (SyntaxWarning on Python 3.12+). Compiled once, reused per call.
_HTML_TAG_RE = re.compile(r'<[^>]*>')
_EMOTICON_RE = re.compile(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)')
_NON_WORD_RE = re.compile(r'[\W]+')


def preprocessor(text):
    """Return a cleaned, lower-cased version of ``text``.

    Steps:
      1. strip HTML-like tags (``<...>``);
      2. collect emoticons such as ``:)``, ``;-(``, ``=D``, ``:P`` before the
         punctuation is removed;
      3. lower-case the text and collapse every run of non-word characters
         into a single space;
      4. append the emoticons (with the ``-`` nose removed) at the end.

    The emoticons are always separated from the last word by a space, so they
    survive a later ``str.split`` as tokens of their own. Previously they were
    glued onto the last word whenever the cleaned text did not end in a space.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text, followed by the space-joined emoticons if any.
    """
    text = _HTML_TAG_RE.sub('', text)
    emoticons = _EMOTICON_RE.findall(text)
    cleaned = _NON_WORD_RE.sub(' ', text.lower())
    emoticon_tail = ' '.join(emoticons).replace('-', '')
    if not emoticon_tail:
        return cleaned
    # Insert a separator only when the cleaned text does not already end in one.
    separator = '' if cleaned.endswith(' ') else ' '
    return cleaned + separator + emoticon_tail

In [90]:
# Clean every review with preprocessor(). NOTE: this overwrites the 'review'
# column, so re-running the cell feeds already-cleaned text through the cleaner again.
df['review'] = df['review'].apply(preprocessor)

In [91]:
#cleaned version 
df.loc[0,'review']

'my family and i normally do not watch local movies for the simple reason that they are poorly made they lack the depth and just not worth our time the trailer of nasaan ka man caught my attention my daughter in law s and daughter s so we took time out to watch it this afternoon the movie exceeded our expectations the cinematography was very good the story beautiful and the acting awesome jericho rosales was really very good so s claudine barretto the fact that i despised diether ocampo proves he was effective at his role i have never been this touched moved and affected by a local movie before imagine a cynic like me dabbing my eyes at the end of the movie congratulations to star cinema way to go jericho and claudine '

In [92]:
# Tokenization
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

In [93]:
# Stemming
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of every token."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

In [94]:
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [95]:
# Stop-word removal: make sure the NLTK 'stopwords' corpus is available, then
# load the English stop words into `stop`.
import nltk 
# Must run before stopwords.words(); the recorded output below shows the
# download is a no-op when the package is already up to date.
nltk.download('stopwords') 
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/krish.mahajan@ibm.com/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [96]:
[w for w in tokenizer_porter('a runner like running and thus they runs a lot') if w not in stop]

['runner', 'like', 'run', 'thu', 'run', 'lot']

# Training a logistic regression model for document classification

In [97]:
# Split the cleaned reviews into 25,000 training and 25,000 test documents.
#
# Positional slicing on the value arrays is deliberate: label-based
# ``df.loc[:25000]`` includes the end label, which produced 25,001 training
# rows and put row 25000 into both the training and the test set.
X_train = df['review'].values[:25000]
y_train = df['sentiment'].values[:25000]
X_test = df['review'].values[25000:]
y_test = df['sentiment'].values[25000:]

In [98]:
# Implementing GridSearchCV to find the optimal set of parameters for the logistic regression model using 2-fold stratified cross validation
from sklearn.model_selection import GridSearchCV 
from sklearn.pipeline import Pipeline 
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import TfidfVectorizer 



In [99]:
# The vectorizer does no lower-casing or preprocessing of its own; tokenisation
# and stop-word handling are left to the grid below.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

# 1 n-gram range x 2 stop-word settings x 2 tokenizers x 2 penalties x 3 C values
# = 24 candidate configurations.
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}]

# The solver is named explicitly because the grid contains the 'l1' penalty:
# 'liblinear' supports both 'l1' and 'l2', whereas the default solver of newer
# scikit-learn releases ('lbfgs') rejects 'l1'. The recorded run below shows
# solver='warn', i.e. the solver had been left at its default.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0, solver='liblinear'))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=2, verbose=1, n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 16.6min finished


GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...e, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's...ion tokenizer_porter at 0x1a1f493950>], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=T

In [100]:
print('Best Parameter set: %s ' % gs_lr_tfidf.best_params_)

Best Parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x1a1f493c80>} 


In [102]:
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

CV Accuracy: 0.886


In [104]:
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test,y_test))

Test Accuracy: 0.900


# Working with bigger data - online algorithms and out-of-core learning

In [118]:
import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')

# Raw-string patterns: '\)' and '\W' inside ordinary string literals are invalid
# escape sequences (SyntaxWarning on Python 3.12+). Compiled once, reused per call.
_HTML_TAG_RE = re.compile(r'<[^>]*>')
_EMOTICON_RE = re.compile(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)')
_NON_WORD_RE = re.compile(r'[\W]+')
# Frozen copy of `stop` for constant-time membership tests; `stop` itself stays
# a list. Rebuild this set if `stop` is changed afterwards.
_STOP_SET = frozenset(stop)


def tokenizer(text):
    """Clean ``text`` and return its tokens with English stop words removed.

    HTML-like tags are stripped, emoticons (``:)``, ``;-(``, ``=D`` ...) are
    collected before punctuation is removed, the text is lower-cased and split
    on non-word characters, and the emoticons (without the ``-`` nose) are
    appended as separate tokens.

    Emoticons are always emitted as tokens of their own; previously the first
    emoticon was fused onto the last word whenever the cleaned text did not end
    in a non-word character.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Word tokens followed by emoticon tokens, stop words excluded.
    """
    text = _HTML_TAG_RE.sub('', text)
    emoticons = [emo.replace('-', '') for emo in _EMOTICON_RE.findall(text)]
    words = _NON_WORD_RE.sub(' ', text.lower()).split()
    return [w for w in words + emoticons if w not in _STOP_SET]

In [119]:
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from the CSV file at ``path``.

    The first line is treated as a header and skipped. Every following line is
    split at its LAST comma: everything before it is returned verbatim as the
    text (CSV quoting is not undone), everything after it is parsed as the
    integer label.

    Splitting at the last comma, rather than slicing at fixed offsets, also
    handles a final line without a trailing newline. Blank lines are skipped
    and an empty file simply yields nothing.

    Parameters
    ----------
    path : str
        Path of a UTF-8 encoded CSV file whose last column is an integer label.

    Yields
    ------
    tuple of (str, int)
        The raw text part of the line and its label.
    """
    with open(path, 'r', encoding='utf-8') as infile:
        # Skip the header; the default avoids StopIteration escaping the
        # generator (a RuntimeError under PEP 479) when the file is empty.
        next(infile, None)
        for line in infile:
            line = line.rstrip('\n')
            if not line:
                continue
            text, _, label = line.rpartition(',')
            yield text, int(label)
            
def get_minitbatch(doc_stream, size):
    """Pull up to ``size`` ``(text, label)`` pairs from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs, e.g. ``stream_docs(...)``.
    size : int
        Maximum number of documents to take.

    Returns
    -------
    tuple
        ``(docs, y)`` - two parallel lists of texts and labels. When the stream
        ends before ``size`` documents were read, the documents collected so
        far are returned instead of being dropped. ``(None, None)`` is returned
        only when the stream was exhausted before yielding a single document.
    """
    docs, y = [], []
    exhausted = False
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            exhausted = True
            break
        docs.append(text)
        y.append(label)
    if exhausted and not docs:
        return None, None
    return docs, y

In [120]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Hashing needs no fitted vocabulary, so documents can be vectorised batch by batch.
vect = HashingVectorizer(decode_error='ignore', n_features=2**21, preprocessor=None, tokenizer=tokenizer)
# `max_iter` replaces `n_iter`, which was deprecated and later removed from
# SGDClassifier. The model is trained with partial_fit, one pass per mini-batch.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss'; switch
# the name when upgrading past the release that drops 'log'.
clf = SGDClassifier(loss='log', random_state=1, max_iter=1)
doc_stream = stream_docs(path='./data/movie_data.csv')

In [121]:
# Out-of-core training: up to 45 mini-batches of 1000 documents each are pulled
# from the stream, hashed into feature vectors and fed to the classifier.
classes = np.array([0, 1])
pbar = pyprind.ProgBar(45)
for _ in range(45):
    X_train, y_train = get_minitbatch(doc_stream, size=1000)
    if X_train:
        X_train = vect.transform(X_train)
        # partial_fit must be told the full label set, since a single batch
        # is not guaranteed to contain both classes.
        clf.partial_fit(X_train, y_train, classes=classes)
        pbar.update()
    else:
        # No batch was returned: stop early.
        break

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:23


In [122]:
# Hold-out evaluation: take the next 5000 documents from the same stream the
# training loop consumed, so they come after the batches used for training.
X_test, y_test = get_minitbatch(doc_stream,size=5000)
# Same hashing vectorizer as during training, so the features line up.
X_test = vect.transform(X_test)
print('Accuracy : %.3f' % clf.score(X_test,y_test)) 

Accuracy : 0.868


In [117]:
X_test

# Serializing fitted scikit-learn estimators 

In [125]:
import pickle
import os

# Target directory for the pickled objects (created if missing).
dest = os.path.join('movieclassifier', 'pkl_objets')
os.makedirs(dest, exist_ok=True)

# Context managers make sure each file is flushed and closed; the handles
# previously passed inline to pickle.dump were never closed explicitly.

# Serializing stop words
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as stop_file:
    pickle.dump(stop, stop_file, protocol=4)

# Serializing classifier model
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as clf_file:
    pickle.dump(clf, clf_file, protocol=4)

# Topic Modeling with Latent Dirichlet Allocation 


In [76]:
import pandas as pd 
df = pd.read_csv('./data/movie_data.csv', encoding='utf-8') 


In [77]:
# Bag-of-words counts used as input for LDA. The vocabulary is capped at 5000
# terms, which matches the 5000 columns of lda.components_ shown further down.
# NOTE(review): max_df=.1 presumably drops terms that occur in more than 10% of
# the reviews - confirm against the CountVectorizer documentation.
from sklearn.feature_extraction.text import CountVectorizer 
count = CountVectorizer(stop_words='english',max_df=.1,max_features=5000)
X = count.fit_transform(df['review'].values)

In [78]:
from sklearn.decomposition import LatentDirichletAllocation

# `n_components` replaces the `n_topics` keyword, which was deprecated and later
# removed from LatentDirichletAllocation. Ten topics, fixed seed for repeatable results.
lda = LatentDirichletAllocation(n_components=10, random_state=123, learning_method='batch')
# Document-topic matrix: one row per review, one column per topic.
X_topics = lda.fit_transform(X)

In [79]:
lda.components_.shape

(10, 5000)

In [80]:
# Print the n_top_words highest-weighted vocabulary terms of every topic.
n_top_words = 5
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); use the new accessor when available and fall back
# to the old one otherwise.
if hasattr(count, 'get_feature_names_out'):
    feature_names = count.get_feature_names_out()
else:
    feature_names = count.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    # argsort is ascending, so the reversed tail slice gives the top terms
    # in descending order of weight.
    print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

Topic 1:
worst minutes awful script stupid
Topic 2:
family mother father girl children
Topic 3:
american dvd war music tv
Topic 4:
human audience cinema art feel
Topic 5:
police guy car dead murder
Topic 6:
horror house gore blood sex
Topic 7:
role performance comedy actor performances
Topic 8:
series episode war episodes season
Topic 9:
book version original effects read
Topic 10:
action fight guy guys cool


In [82]:
# Column index 5 is the topic printed as "Topic 6" above. Rank the reviews by
# their weight for that topic, highest first, and show the start of the top three.
horror_weights = X_topics[:, 5]
horror = horror_weights.argsort()[::-1]
for rank, movie_idx in enumerate(horror[:3], start=1):
    print('\nHorror Movie #%d:' % rank)
    review_text = df['review'][movie_idx]
    print(review_text[:300], '...')


Horror Movie #1:
Emilio Miraglia's first Giallo feature, The Night Evelyn Came Out of the Grave, was a great combination of Giallo and Gothic horror - and this second film is even better! We've got more of the Giallo side of the equation this time around, although Miraglia doesn't lose the Gothic horror stylings tha ...

Horror Movie #2:
House of Dracula works from the same basic premise as House of Frankenstein from the year before; namely that Universal's three most famous monsters; Dracula, Frankenstein's Monster and The Wolf Man are appearing in the movie together. Naturally, the film is rather messy therefore, but the fact that ...

Horror Movie #3:
This film marked the end of the "serious" Universal Monsters era (Abbott and Costello meet up with the monsters later in "Abbott and Costello Meet Frankentstein"). It was a somewhat desparate, yet fun attempt to revive the classic monsters of the Wolf Man, Frankenstein's monster, and Dracula one "la ...
