Data preprocessing

In [54]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [55]:
# Load the tab-separated reviews file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are kept as ordinary text.
ds = pd.read_csv(
    "Restaurant_Reviews.tsv",
    sep="\t",
    quoting=3,
)

NLP

In [56]:
import re
import nltk

In [57]:
# Make sure NLTK's stop-word lists are available locally; they are read
# later through nltk.corpus.stopwords.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to C:\Users\Sudeep
[nltk_data]     nellur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [58]:
from nltk.stem.porter import PorterStemmer as ps

In [59]:
from nltk.corpus import stopwords

In [60]:
# Import the class under its real name and bind the stemmer instance to `ps`.
# The previous `ps = ps()` replaced the class alias with an instance, so
# running this cell a second time failed with
# "'PorterStemmer' object is not callable".
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

In [61]:
# Accumulator for the cleaned review strings (one string per review).
corpus = []

In [62]:
# Build the cleaned corpus: keep letters only, lower-case, drop English stop
# words, stem every remaining word, and join the result back into one string.
# The list is reset here so that re-running this cell cannot append duplicates.
corpus = []
# Loop-invariant: load the stop-word list once instead of once per word.
english_stopwords = set(stopwords.words('english'))
# Walk every review rather than a hard-coded range(0, 1000), so the corpus
# always has exactly one entry per row of `ds`.
for raw_review in ds["Review"]:
    review = re.sub('[^a-zA-Z]', " ", raw_review)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

Creating the Bag of Words model

In [63]:
from sklearn.feature_extraction.text import CountVectorizer as cv

In [64]:
# Import the class under its real name and bind the vectorizer to `cv`.
# The previous `cv = cv(...)` replaced the class alias with an instance, so
# running this cell a second time failed with
# "'CountVectorizer' object is not callable".
from sklearn.feature_extraction.text import CountVectorizer

# Cap the vocabulary at 1500 features.
cv = CountVectorizer(max_features=1500)
# Dense document-term count matrix, one row per entry in `corpus`.
x = cv.fit_transform(corpus).toarray()
# Labels are taken from the second column of the dataset.
y = ds.iloc[:, 1].values

Splitting the data into training and test sets

In [68]:
from sklearn.model_selection import train_test_split

In [69]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split the same on every run.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

Naive Bayes

In [73]:
from sklearn.naive_bayes import GaussianNB as gnb

In [74]:
# Fit a Gaussian Naive Bayes classifier on the training split.
# NOTE(review): the features are word counts from CountVectorizer; a
# multinomial Naive Bayes model may suit them better — worth comparing.
classifier = gnb()
classifier.fit(xtrain, ytrain)

GaussianNB(priors=None, var_smoothing=1e-09)

Prediction

In [76]:
# Predict a label for every sample in the held-out test split.
ypred = classifier.predict(xtest)

In [77]:
# Display the predicted labels for the test split.
ypred

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1], dtype=int64)

Confusion Matrix

In [82]:
from sklearn.metrics import confusion_matrix as cm

In [83]:
# Import the function under its real name and store the matrix in `cm`.
# The previous `cm = cm(ytest, ypred)` overwrote the function alias with the
# resulting array, so running this cell a second time failed with
# "'numpy.ndarray' object is not callable".
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(ytest, ypred)

In [84]:
# Display the confusion matrix (scikit-learn convention: rows are the true
# labels, columns are the predicted labels).
cm

array([[55, 42],
       [12, 91]], dtype=int64)

In [85]:
# Print the top-left entry of the confusion matrix.
print(cm[0][0])

55


In [86]:
# Unpack the four cells of the confusion matrix by row:
# first row -> true negatives, false positives;
# second row -> false negatives, true positives.
TN, FP = cm[0][0], cm[0][1]
FN, TP = cm[1][0], cm[1][1]

In [111]:
def correctness(TN, FP, FN, TP):
    """Print accuracy, precision, recall and F1 score as whole-number percentages.

    Args:
        TN: number of true negatives.
        FP: number of false positives.
        FN: number of false negatives.
        TP: number of true positives.

    Returns:
        None. The four metrics are printed, each truncated to an integer
        percentage with ``int()``.

    A metric whose denominator is zero (for example precision when nothing
    was predicted positive) is reported as 0 instead of raising
    ``ZeroDivisionError`` or, for NumPy integers, producing NaN that
    ``int()`` cannot convert.
    """
    def _percentage(numerator, denominator):
        # Guard the division so degenerate confusion matrices do not crash.
        return numerator / denominator * 100 if denominator else 0.0

    Accuracy = _percentage(TP + TN, TP + TN + FP + FN)
    Precision = _percentage(TP, TP + FP)
    Recall = _percentage(TP, TP + FN)
    # F1 is the harmonic mean of precision and recall; report 0 when both are 0.
    if Precision + Recall:
        F1_Score = 2 * Precision * Recall / (Precision + Recall)
    else:
        F1_Score = 0.0

    print("accuracy is:",int(Accuracy),"%")
    print("Precision is:",int(Precision),"%")
    print("Recall is:",int(Recall),"%")
    print("F1_Score is:",int(F1_Score),"%")

In [112]:
# Report the evaluation metrics for the confusion-matrix counts computed above.
correctness(TN,FP,FN,TP)

accuracy is: 73 %
Precision is: 68 %
Recall is: 88 %
F1_Score is: 77 %
