# Sentiment Analysis



## Task 1

## Importing libraries


In [2]:
import numpy as np 
import pandas as pd 
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

import nltk
import string
import re

# Fetch the NLTK data this notebook relies on:
#   - 'stopwords'          : English stop-word list used by text_cleaner
#   - 'punkt' / 'punkt_tab': tokenizer models needed by nltk.word_tokenize
#     (newer NLTK releases look up 'punkt_tab', older ones 'punkt'; a package
#     name the installed release does not know is reported, not raised).
# Without the tokenizer data a fresh kernel fails with LookupError at tokenization.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the car-review dataset from CSV into a pandas DataFrame
car_reviews = pd.read_csv('car_reviews.csv')

In [4]:
# Preview the loaded DataFrame: one row per review with its sentiment label
car_reviews

Unnamed: 0,Sentiment,Review
0,Neg,In 1992 we bought a new Taurus and we really ...
1,Neg,The last business trip I drove to San Franci...
2,Neg,My husband and I purchased a 1990 Ford F250 a...
3,Neg,I feel I have a thorough opinion of this truc...
4,Neg,AS a mother of 3 all of whom are still in ca...
...,...,...
1377,Pos,In June we bought the Sony Limited Edition Fo...
1378,Pos,After 140 000 miles we decided to replace my...
1379,Pos,The Ford Focus is a great little record setti...
1380,Pos,I needed a new car because my hyundai excel 9...


In [5]:
# Separate the review text (model input) from the sentiment labels (target).
# Columns are selected by label instead of by position so the cell keeps
# working if the CSV gains or reorders columns.
comments = car_reviews['Review']
sentiments = car_reviews['Sentiment']

## Cleaner function:
Filters out punctuation, removes numbers and special characters, converts text to lower case, excludes stopwords, and applies word stemming.

In [6]:
def text_cleaner(review):
    """Turn each review into a bag-of-words dictionary of stemmed tokens.

    Every entry of ``review`` has all characters other than ASCII letters
    replaced by spaces, is lower-cased, tokenized with ``nltk.word_tokenize``,
    filtered against NLTK's English stop-word list and stemmed with the
    Porter stemmer.

    Parameters
    ----------
    review : pandas Series of str
        Raw review texts; entries are read positionally through ``.iloc``.

    Returns
    -------
    list of dict
        One ``{stem: count}`` dictionary per review, in input order. Keys
        appear in the order each stem is first seen in the review.
    """
    porter = PorterStemmer()
    # A set gives constant-time membership tests (the NLTK list would be scanned
    # for every token); the distinct name avoids shadowing the imported `stopwords`.
    stop_words = set(nltk.corpus.stopwords.words("english"))
    contents = []
    for index in range(review.shape[0]):
        # Replace digits, punctuation and other symbols with spaces, then lower-case
        text = re.sub('[^a-zA-Z]', ' ', review.iloc[index]).lower()
        bow_dict = {}
        for token in nltk.word_tokenize(text):
            # The stop-word test is applied to the raw token (not the stem),
            # so stop words can be skipped before paying for stemming.
            if token in stop_words:
                continue
            stem = porter.stem(token)
            bow_dict[stem] = bow_dict.get(stem, 0) + 1
        contents.append(bow_dict)
    return contents

# Build one bag-of-words dictionary (stem -> count) per review in `comments`
bagOfWords = text_cleaner(comments)

<strong>Bag of words: one dictionary per review, mapping each word/stem to the number of times it occurs in that review</strong>

In [7]:
# Show only the first two reviews' dictionaries; displaying the whole list
# would dump every review in the corpus into the cell output.
bagOfWords[:2]

[{'bought': 2,
  'new': 3,
  'tauru': 3,
  'realli': 1,
  'love': 1,
  'decid': 1,
  'tri': 1,
  'care': 1,
  'style': 3,
  'newer': 1,
  'version': 1,
  'anyway': 1,
  'like': 2,
  'car': 1,
  'half': 1,
  'much': 2,
  'one': 1,
  'thee': 1,
  'dash': 1,
  'deep': 1,
  'take': 1,
  'lot': 1,
  'room': 1,
  'find': 1,
  'seat': 2,
  'comfort': 1,
  'way': 1,
  'side': 1,
  'stick': 1,
  'strip': 1,
  'protect': 1,
  'card': 1,
  'dent': 1,
  'drive': 1,
  'nice': 2,
  'good': 1,
  'pick': 1,
  'see': 1,
  'hood': 1,
  'driver': 1,
  'judg': 1,
  'park': 1,
  'difficult': 1,
  'small': 1,
  'ga': 1,
  'tank': 1,
  'would': 2,
  'buy': 1,
  'rather': 1,
  'back': 1,
  'think': 1,
  'mistak': 1,
  'chang': 1,
  'less': 1,
  'month': 1,
  'dead': 1,
  'batteri': 1,
  'flat': 1,
  'tire': 1},
 {'last': 1,
  'busi': 2,
  'trip': 7,
  'drove': 1,
  'san': 1,
  'francisco': 1,
  'went': 1,
  'hertz': 1,
  'rental': 1,
  'got': 1,
  'ford': 2,
  'tauru': 5,
  'think': 2,
  'look': 1,
  'comfort

In [10]:
# Counts for the first review, a blank separator, then the matching stems
print(bagOfWords[0].values(), " ", bagOfWords[0].keys(), sep="\n")

# The full stem -> count mapping produced by the data wrangling
print(bagOfWords[0], " ", sep="\n")

# Spot-check the cleaner and stemmer on a few known stems
for review_index, stem in [(0, 'tauru'), (0, 'thee'), (1, 'comfort')]:
    print(bagOfWords[review_index][stem])
print(" ")


dict_values([2, 3, 3, 1, 1, 1, 1, 1, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
 
dict_keys(['bought', 'new', 'tauru', 'realli', 'love', 'decid', 'tri', 'care', 'style', 'newer', 'version', 'anyway', 'like', 'car', 'half', 'much', 'one', 'thee', 'dash', 'deep', 'take', 'lot', 'room', 'find', 'seat', 'comfort', 'way', 'side', 'stick', 'strip', 'protect', 'card', 'dent', 'drive', 'nice', 'good', 'pick', 'see', 'hood', 'driver', 'judg', 'park', 'difficult', 'small', 'ga', 'tank', 'would', 'buy', 'rather', 'back', 'think', 'mistak', 'chang', 'less', 'month', 'dead', 'batteri', 'flat', 'tire'])
{'bought': 2, 'new': 3, 'tauru': 3, 'realli': 1, 'love': 1, 'decid': 1, 'tri': 1, 'care': 1, 'style': 3, 'newer': 1, 'version': 1, 'anyway': 1, 'like': 2, 'car': 1, 'half': 1, 'much': 2, 'one': 1, 'thee': 1, 'dash': 1, 'deep': 1, 'take': 1, 'lot': 1, 'room': 1, 'find': 1, 'seat': 2, 'comfort

In [11]:
# Turn the per-review {stem: count} dictionaries into a dense count matrix:
# one row per review, one column per distinct stem in the corpus.
# The fitted vectorizer is kept (instead of being discarded) so the
# column -> stem mapping (`bow_vectorizer.feature_names_`) stays available and
# new reviews can be encoded later with `bow_vectorizer.transform(...)`.
bow_vectorizer = DictVectorizer(sparse = False)
bagOfWords_vector = bow_vectorizer.fit_transform(bagOfWords)

In [12]:
# Inspect the dense count matrix built by DictVectorizer
bagOfWords_vector

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

<strong>Using sklearn's train_test_split, splits the vector into training and testing data, with the training data being 80% of the total data set, i.e. 1106 samples</strong>

In [13]:
# Hold out part of the data for testing. An integer train_size is taken as an
# absolute sample count, so exactly 1106 reviews (about 80 %) are used for
# training without relying on a hand-tuned fraction such as 0.8003; the
# remaining reviews form the test set. random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(bagOfWords_vector, sentiments, train_size = 1106, random_state = 0)

In [14]:
# Report the shape of each part of the split, features first and labels second
for split_part in (X_train, X_test, Y_train, Y_test):
    print(split_part.shape)

(1106, 9872)
(276, 9872)
(1106,)
(276,)


<strong>Naive Bayes classifier</strong>

In [15]:
# Train a multinomial Naive Bayes model on the training counts
# (fit returns the estimator itself), then label the held-out reviews.
clf = MultinomialNB().fit(X_train, Y_train)
Y_pred = clf.predict(X_test)

In [16]:
# Predicted sentiment label ('Neg' / 'Pos') for each review in the test set
Y_pred

array(['Neg', 'Pos', 'Pos', 'Pos', 'Pos', 'Neg', 'Neg', 'Neg', 'Neg',
       'Neg', 'Pos', 'Neg', 'Neg', 'Pos', 'Neg', 'Pos', 'Pos', 'Pos',
       'Neg', 'Neg', 'Pos', 'Neg', 'Pos', 'Neg', 'Pos', 'Pos', 'Neg',
       'Neg', 'Pos', 'Pos', 'Pos', 'Pos', 'Neg', 'Neg', 'Pos', 'Neg',
       'Neg', 'Pos', 'Neg', 'Neg', 'Pos', 'Neg', 'Pos', 'Neg', 'Pos',
       'Pos', 'Pos', 'Neg', 'Pos', 'Neg', 'Pos', 'Pos', 'Neg', 'Pos',
       'Pos', 'Neg', 'Pos', 'Pos', 'Pos', 'Neg', 'Neg', 'Pos', 'Neg',
       'Pos', 'Neg', 'Neg', 'Neg', 'Pos', 'Neg', 'Pos', 'Neg', 'Pos',
       'Pos', 'Neg', 'Neg', 'Pos', 'Neg', 'Neg', 'Neg', 'Pos', 'Neg',
       'Neg', 'Pos', 'Pos', 'Neg', 'Pos', 'Pos', 'Pos', 'Neg', 'Pos',
       'Neg', 'Pos', 'Neg', 'Pos', 'Pos', 'Pos', 'Neg', 'Neg', 'Neg',
       'Pos', 'Pos', 'Neg', 'Neg', 'Pos', 'Neg', 'Neg', 'Neg', 'Pos',
       'Pos', 'Neg', 'Neg', 'Neg', 'Neg', 'Pos', 'Pos', 'Neg', 'Pos',
       'Neg', 'Neg', 'Neg', 'Pos', 'Neg', 'Pos', 'Pos', 'Neg', 'Pos',
       'Neg', 'Pos',

<strong>Predicting score</strong>

In [17]:
# Fraction of test reviews whose predicted label matches the true label
score = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print(score)

0.7572463768115942


<strong>Confusion matrix</strong>

In [18]:
# Confusion matrix of the Naive Bayes predictions. The label order is pinned
# explicitly so row/column 0 is always 'Neg' and row/column 1 is always 'Pos',
# and the matrix stays 2x2 even if one class were absent from the test split;
# the positional TN/FP/FN/TP indexing below relies on this layout.
cf = confusion_matrix(Y_test, Y_pred, labels=['Neg', 'Pos'])

In [19]:
# Rows are the actual classes and columns the predicted classes; the cells below
# read cf[0][0] as true negatives and cf[1][1] as true positives
cf

array([[105,  44],
       [ 23, 104]], dtype=int64)

In [20]:
# Overall accuracy: correctly classified reviews (main diagonal) over all test reviews
performance = cf.diagonal().sum() / cf.sum()
print(performance)

# Row totals: row 0 holds the actual negatives, row 1 the actual positives
actual_neg_total, actual_pos_total = cf.sum(axis=1)

# Each rate is a cell of the matrix divided by the total of its own row
TN_rate = cf[0, 0] / actual_neg_total
TP_rate = cf[1, 1] / actual_pos_total
FN_rate = cf[1, 0] / actual_pos_total
FP_rate = cf[0, 1] / actual_neg_total

for rate_label, rate_value in (
    ("True negative rate", TN_rate),
    ("True positive rate", TP_rate),
    ("False Negative rate", FN_rate),
    ("False Positive rate", FP_rate),
):
    print(rate_label, rate_value)

0.7572463768115942
True negative rate 0.7046979865771812
True positive rate 0.8188976377952756
False Negative rate 0.18110236220472442
False Positive rate 0.2953020134228188


# Task 2
### Choosing different classifiers and comparing performance

<strong>Here, I have decided to go with a Bagging (Bootstrap Aggregating) classifier, as it provides slightly better accuracy than Naive Bayes. 
This is because:
1. Bagging is an ensemble learning technique that combines the predictions of multiple base models, whereas Naive Bayes is a single model.
2. It avoids overfitting of the data and reduces variance (which is one of the issues in Naive Bayes) - these methods are used as a way to reduce the variance of a base estimator (e.g., a decision tree) by introducing randomization into its construction procedure and then making an ensemble out of it. Bagging improves on a single model without making it necessary to adapt the underlying base algorithm. As they provide a way to reduce overfitting, bagging methods work best with strong and complex models.
3. Naive Bayes is a general-purpose algorithm that works better when we want to classify a small corpus of data with a relatively small number of input features, where you don't expect the inputs to be meaningfully correlated. Because our corpus is a very large set, Bagging does a better job at classification.
Reference :</strong> 
1. https://www.researchgate.net/publication/2453583_Online_Bagging_and_Boosting#pf4
2. https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
3. https://arxiv.org/ftp/arxiv/papers/1812/1812.11587.pdf

In [21]:
#Bagging classifier

#imports SVC (Support Vector Classifier), used here as the base estimator of the ensemble
from sklearn.svm import SVC

#imports the Bagging Classifier ensemble model
from sklearn.ensemble import BaggingClassifier

#Initializes bg_clf :
# 1. estimator - SVC() is passed positionally as the first argument. Newer scikit-learn releases renamed the
#    keyword from `base_estimator` to `estimator`, so the positional form works with both old and new versions.
# 2. n_estimators - number of base estimators in the ensemble; kept at 1 because the highest accuracy was observed with it.
#    NOTE: with a single estimator this is one SVC trained on one bootstrap sample of the training data,
#    not an aggregate over several models.
# 3. random_state - int seed controlling the random resampling of the original dataset, for reproducible output across runs.
#Finally fits the bagging classifier on X_train and Y_train
bg_clf = BaggingClassifier(SVC(), n_estimators=1, random_state=8).fit(X_train, Y_train)

#Predicts the test-set labels with bg_clf
new_pred = bg_clf.predict(X_test)

In [22]:
# Accuracy of the bagging classifier on the same test split
# (higher than the Naive Bayes accuracy computed earlier)
score = accuracy_score(y_true=Y_test, y_pred=new_pred)
print(score)

0.7753623188405797


<strong>Confusion matrix and performance for Bagging classifier</strong>

In [23]:
# Confusion matrix of the bagging classifier's predictions. The label order is
# pinned explicitly so row/column 0 is always 'Neg' and row/column 1 is always
# 'Pos', and the matrix stays 2x2 even if one class were absent from the test
# split; the positional TN/FP/FN/TP indexing that follows relies on this layout.
bg_cf = confusion_matrix(Y_test, new_pred, labels=['Neg', 'Pos'])

In [24]:
# Overall accuracy of the bagging classifier: main diagonal over all test reviews
performance = bg_cf.diagonal().sum() / bg_cf.sum()
print(performance)

# Row totals: row 0 holds the actual negatives, row 1 the actual positives
bg_actual_neg_total, bg_actual_pos_total = bg_cf.sum(axis=1)

# Each rate is a cell of the matrix divided by the total of its own row
TN_rate = bg_cf[0, 0] / bg_actual_neg_total
TP_rate = bg_cf[1, 1] / bg_actual_pos_total
FN_rate = bg_cf[1, 0] / bg_actual_pos_total
FP_rate = bg_cf[0, 1] / bg_actual_neg_total

for rate_label, rate_value in (
    ("True negative rate", TN_rate),
    ("True positive rate", TP_rate),
    ("False Negative rate", FN_rate),
    ("False Positive rate", FP_rate),
):
    print(rate_label, rate_value)

0.7753623188405797
True negative rate 0.7181208053691275
True positive rate 0.84251968503937
False Negative rate 0.15748031496062992
False Positive rate 0.28187919463087246
