# Sentimental Analysis on Movie Reviews NLP

## Importing Libraries

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Loading Dataset

In [6]:
train = pd.read_csv('train.tsv',delimiter='\t',quoting=3,nrows=3005) # quoting = 3 for no quotes consideration in dataset

In [7]:
print(train.head())

   PhraseId  ...  Sentiment
0         1  ...          1
1         2  ...          2
2         3  ...          2
3         4  ...          2
4         5  ...          2

[5 rows x 4 columns]


## Cleaning Data

In [9]:
import re
import nltk
nltk.download('stopwords') # download common words (stopwords) 'a,the,pronouns....etc' words
from nltk.corpus import stopwords # importing stopwords
from nltk.stem.porter import PorterStemmer # Importing stem which helps to reduce sparse matrix by converting words to present tense 'loved' ==> 'love'
corpus = [] # list contains all the clean words
for i in range(0,3005):
  review = re.sub('[^a-zA-Z]',' ',train['Phrase'][i]) # replace those punctuation words with space
  review = review.lower() # convert the reviews to lowercase
  review = review.split() # split those sentences to words
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')  # removing the 'not' from stopwords list
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review) # Join the words using space
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
print(corpus)

['seri escapad demonstr adag good goos also good gander occasion amus none amount much stori', 'seri escapad demonstr adag good goos', 'seri', '', 'seri', 'escapad demonstr adag good goos', '', 'escapad demonstr adag good goos', 'escapad', 'demonstr adag good goos', 'demonstr adag', 'demonstr', 'adag', '', 'adag', 'good goos', '', 'good goos', '', 'good goos', '', 'good goos', 'good', 'goos', '', 'goos', 'goos', 'also good gander occasion amus none amount much stori', 'also good gander occasion amus none amount much stori', 'also', 'also', 'good gander occasion amus none amount much stori', 'gander occasion amus none amount much stori', 'gander occasion amus none amount much stori', 'gander', 'gander', 'gander', '', 'occasion amus none amount much stori', '', '', '', '', 'occasion amus none amount much stori', 'occasion', 'amus none amount much stori', 'amus', 'none amount much stori', '', 'none amount much stori', 'none', 'amount much stori', 'amount much stori', 'amount much stori', 

## Creating the Bag of Words model

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=700)
X = cv.fit_transform(corpus).toarray()
Y = train.iloc[:, -1].values

In [12]:
print(len(X[0]))

794


## Splitting the dataset into the Training set and Test set

In [14]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

## Training the Naive Bayes model on the Training set

In [16]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

## Predicting the Test set results

In [17]:
from sklearn.metrics import confusion_matrix,accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test,y_pred)
print(cm)
accuracy_score(y_test,y_pred)

[[  9   6   1   1   0]
 [ 37  35   4   8  14]
 [ 60  78  16 119  95]
 [ 13   7   4  33  36]
 [  1   0   0   3  21]]


0.1896838602329451

In [18]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X=X_train, y=y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 19.88 %
Standard Deviation: 2.23 %


## Training the SVM on the Training set

In [19]:
from sklearn.svm import SVC
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

## Predicting the Test set results

In [20]:
from sklearn.metrics import confusion_matrix,accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test,y_pred)
print(cm)
accuracy_score(y_test,y_pred)

[[  5   8   4   0   0]
 [ 10  37  51   0   0]
 [  0  12 348   8   0]
 [  0   2  65  24   2]
 [  0   0   7  11   7]]


0.7004991680532446

In [21]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X=X_train, y=y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 67.93 %
Standard Deviation: 1.98 %


## Training the Random Forest model on the Training set

In [22]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10,criterion = 'entropy', random_state=0)
classifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [23]:
from sklearn.metrics import confusion_matrix,accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test,y_pred)
print(cm)
accuracy_score(y_test,y_pred)

[[  5   8   4   0   0]
 [ 13  42  42   0   1]
 [  0  33 315  20   0]
 [  1   3  53  30   6]
 [  0   0   5  10  10]]


0.6688851913477537

In [24]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X=X_train, y=y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 66.64 %
Standard Deviation: 2.71 %


## Training the XGBoost model on the Training set

In [25]:
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [26]:
from sklearn.metrics import confusion_matrix,accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test,y_pred)
print(cm)
accuracy_score(y_test,y_pred)

[[  4   3  10   0   0]
 [  6   8  84   0   0]
 [  0   0 366   2   0]
 [  0   0  84   8   1]
 [  0   0  13   5   7]]


0.653910149750416

In [27]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X=X_train, y=y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 64.35 %
Standard Deviation: 1.68 %
