## **Libraries to be installed beforehand**

In [10]:
# !pip3 install openpyxl
# !pip3 install xlrd

In [11]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import string
import pickle
import xlrd

from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# from sklearn.svm import LinearSVC
# from sklearn.ensemble import RandomForestClassifier


## **Reading dataset using openpyxl engine since xlrd is now discontinued**

In [12]:
dataset = pd.read_excel('dataset.xlsx', engine='openpyxl')
# dataset = pd.read_excel('dataset.xlsx')

In [13]:
dataset.head()

Unnamed: 0,URL,Text,Sentiment
0,http://www.imdb.com/title/tt0210075/usercomments,Girlfight follows a project dwelling New York ...,POS
1,http://www.imdb.com/title/tt0337640/usercomments,Hollywood North is an euphemism from the movie...,POS
2,http://www.imdb.com/title/tt0303549/usercomments,That '70s Show is definitely the funniest show...,POS
3,http://www.imdb.com/title/tt0716825/usercomments,"9/10- 30 minutes of pure holiday terror. Okay,...",POS
4,http://www.imdb.com/title/tt0182225/usercomments,"A series of random, seemingly insignificant th...",POS


**Cleaning and analysing dataset and its sturcture**

In [14]:
dataset.isnull().sum()

URL          0
Text         0
Sentiment    0
dtype: int64

In [15]:
dataset.drop('URL', axis='columns', inplace=True)

In [16]:
dataset.head()

Unnamed: 0,Text,Sentiment
0,Girlfight follows a project dwelling New York ...,POS
1,Hollywood North is an euphemism from the movie...,POS
2,That '70s Show is definitely the funniest show...,POS
3,"9/10- 30 minutes of pure holiday terror. Okay,...",POS
4,"A series of random, seemingly insignificant th...",POS


In [17]:
print(f'Rows: {dataset.shape[0]}\nColumns: {dataset.shape[1]}')

Rows: 1999
Columns: 2


In [18]:
print(f'Columns Names: {list(dataset.columns)}')

Columns Names: ['Text', 'Sentiment']


### **Tokenization of sentences**

In [19]:
nlp = English()
stopwords = list(STOP_WORDS)
punctuations = string.punctuation

In [20]:
def tokenizer(sentence):
    mytokens = nlp(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    return mytokens

### **Transform and vectorization**

In [21]:
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text 
def clean_text(text):     
    return text.strip().lower()

In [22]:
vectorizer = CountVectorizer(tokenizer = tokenizer, ngram_range=(1,1)) 
tfvectorizer = TfidfVectorizer(tokenizer = tokenizer)

### **Dataset splitting**

In [23]:
X = dataset['Text']
y = dataset['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77)

## **Bernoulli Naïve Bayes (BNB)**

In [24]:
from sklearn.naive_bayes import BernoulliNB


In [26]:
classifier = BernoulliNB()
BNBmodel = Pipeline([("cleaner", predictors()), ('vectorizer', vectorizer), ('classifier', classifier)])

BNBmodel.fit(X_train,y_train)
BNBpred=BNBmodel.predict(X_test)
print(f'Confusion Matrix:\n{confusion_matrix(y_test,BNBpred)}')
print(f'\nClassification Report:\n{classification_report(y_test,BNBpred)}')
print(f'Accuracy: {accuracy_score(y_test,BNBpred)*100}%')
pickle.dump(BNBmodel, open('BernoulliNB_model.sav', 'wb'))
print('Bernoulli NB trained Model Saved')

Confusion Matrix:
[[173  22]
 [ 32 173]]

Classification Report:
              precision    recall  f1-score   support

         NEG       0.84      0.89      0.87       195
         POS       0.89      0.84      0.87       205

    accuracy                           0.86       400
   macro avg       0.87      0.87      0.87       400
weighted avg       0.87      0.86      0.86       400

Accuracy: 86.5%
Bernoulli NB trained Model Saved


## **Multinomial Naïve Bayes (MNB)**

In [27]:
from sklearn.naive_bayes import MultinomialNB



In [28]:
classifier = MultinomialNB()
MNBmodel = Pipeline([("cleaner", predictors()), ('vectorizer', vectorizer), ('classifier', classifier)])

MNBmodel.fit(X_train,y_train)
MNBpred=BNBmodel.predict(X_test)
print(f'Confusion Matrix:\n{confusion_matrix(y_test,MNBpred)}')
print(f'\nClassification Report:\n{classification_report(y_test,MNBpred)}')
print(f'Accuracy: {accuracy_score(y_test,MNBpred)*100}%')
pickle.dump(MNBmodel, open('MultinomialNB_model.sav', 'wb'))
print('Multinomial NB trained Model Saved')

Confusion Matrix:
[[173  22]
 [ 32 173]]

Classification Report:
              precision    recall  f1-score   support

         NEG       0.84      0.89      0.87       195
         POS       0.89      0.84      0.87       205

    accuracy                           0.86       400
   macro avg       0.87      0.87      0.87       400
weighted avg       0.87      0.86      0.86       400

Accuracy: 86.5%
Multinomial NB trained Model Saved


## **Maximum Entropy (ME)**

In [29]:
from sklearn.linear_model import LogisticRegression


In [30]:
classifier = LogisticRegression()
LRmodel = Pipeline([("cleaner", predictors()), ('vectorizer', vectorizer), ('classifier', classifier)])

# Train the Model
LRmodel.fit(X_train,y_train)   
LRpred = LRmodel.predict(X_test)
print(f'Confusion Matrix:\n{confusion_matrix(y_test,LRpred)}')
print(f'\nClassification Report:\n{classification_report(y_test,LRpred)}')
print(f'Accuracy: {accuracy_score(y_test,LRpred)*100}%')
pickle.dump(LRmodel, open('LinearRegression_model.sav', 'wb'))
print('Logistic Regression trained Model Saved')

Confusion Matrix:
[[169  26]
 [ 34 171]]

Classification Report:
              precision    recall  f1-score   support

         NEG       0.83      0.87      0.85       195
         POS       0.87      0.83      0.85       205

    accuracy                           0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400

Accuracy: 85.0%
Logistic Regression trained Model Saved


In [31]:
# Another random review
pre = LRmodel.predict(["Production has an incredibly important place to shoot a series or film. Sometimes even a very minimalist story can reach an incredibly successful point after the right production stages. The Witcher series is far from minimalist. The Witcher is one of the best Middle-earth works in the world. Production quality is essential if you want to handle such a topic successfully."])
print(f'Prediction: {pre[0]}')

Prediction: POS


## **Decision Tree (DE)**

In [32]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [33]:
dt_clf = DecisionTreeClassifier()
dt_model = Pipeline([("cleaner", predictors()), ('vectorizer', vectorizer), ('classifier', dt_clf)])

# Train the Model
dt_model.fit(X_train,y_train)   
dt_pred = dt_model.predict(X_test)

In [34]:
print(f'Confusion Matrix:\n{confusion_matrix(y_test,dt_pred)}')
print(f'\nClassification Report:\n{classification_report(y_test,dt_pred)}')
print(f'Accuracy: {accuracy_score(y_test,dt_pred)*100}%')
pickle.dump(dt_model, open('DecisionTree_model.sav', 'wb'))
print('Decision Tree trained Model Saved')

Confusion Matrix:
[[167  28]
 [ 34 171]]

Classification Report:
              precision    recall  f1-score   support

         NEG       0.83      0.86      0.84       195
         POS       0.86      0.83      0.85       205

    accuracy                           0.84       400
   macro avg       0.85      0.85      0.84       400
weighted avg       0.85      0.84      0.85       400

Accuracy: 84.5%
Decision Tree trained Model Saved


## **Support Vector Machine (SVM)**

In [None]:
from sklearn.svm import SVC
# pipeline and standard scalar are already imported

## **Results comparision plots and tables**