In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
# Load the tab-separated review dataset. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the review text are treated as ordinary characters.
ds = pd.read_csv(
    "Restaurant_Reviews.tsv",
    sep="\t",
    quoting=3,
)

In [3]:
import re
import nltk

In [4]:
from nltk.stem.porter import PorterStemmer as ps

In [5]:
from nltk.corpus import stopwords

In [6]:
# Import the class under its real name so the `ps` alias is not overwritten by
# its own instance: the original `ps = ps()` raised TypeError when this cell
# was executed a second time, because `ps` was no longer a class by then.
from nltk.stem.porter import PorterStemmer

# Stemmer instance used by the preprocessing loop via `ps.stem(word)`.
ps = PorterStemmer()

In [7]:
# Accumulator for the cleaned review strings (filled by the preprocessing loop).
corpus = list()

In [8]:
# Build the stop-word set once: the original rebuilt it for every single word,
# although it never changes while the loop runs.
english_stopwords = set(stopwords.words('english'))

# Start from an empty list so re-running this cell replaces the corpus instead
# of appending a second copy of every review.
corpus = []

# Iterate over the column itself rather than range(0, 1000) so the corpus gets
# exactly one entry per row, whatever the size of the dataset.
for raw_review in ds["Review"]:
    # Replace everything except ASCII letters with a space, then lowercase.
    review = re.sub('[^a-zA-Z]', " ", raw_review).lower()
    # Split on whitespace, drop stop words and stem what remains.
    # NOTE(review): the stop-word list presumably contains negations such as
    # "not", which carry sentiment - confirm whether they should be kept.
    stemmed_words = [
        ps.stem(word)
        for word in review.split()
        if word not in english_stopwords
    ]
    corpus.append(' '.join(stemmed_words))

In [9]:
from sklearn.feature_extraction.text import CountVectorizer as cv

In [10]:
# Import the class under its real name so the `cv` alias is not overwritten by
# its own instance: the original `cv = cv(...)` raised TypeError when this
# cell was executed a second time.
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer whose vocabulary is capped at 1500 terms.
cv = CountVectorizer(max_features=1500)
# Dense document-term count matrix, one row per entry of `corpus`.
x = cv.fit_transform(corpus).toarray()
# Target values taken from the second column of the dataset.
y = ds.iloc[:, 1].values

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible between runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier as rfc

In [14]:
# Random forest of 10 trees using the entropy split criterion; seeded so that
# repeated runs build the same forest.
classifier = rfc(
    n_estimators=10,
    criterion="entropy",
    random_state=0,
)

In [15]:
# Train the forest on the training split. Kept as the last expression of the
# cell so the fitted estimator is still displayed as the cell output.
classifier.fit(X=xtrain, y=ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [16]:
# Predict a label for every held-out sample and print the raw predictions.
ypred = classifier.predict(X=xtest)
print(ypred)

[0 0 0 0 0 0 1 0 0 1 1 1 1 1 1 1 0 0 0 1 0 1 1 0 0 1 0 1 1 0 0 0 0 1 1 0 0
 0 0 1 1 0 0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0
 0 0 0 1 0 1 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 1 1 0 0 1 0 0 0
 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 1 1 1 1 0 0 1 0 0 0 0 0 0
 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0
 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0]


In [17]:
from sklearn.metrics import confusion_matrix as cm

In [18]:
# Import the function under its real name so the `cm` alias is not overwritten
# by its own result: the original `cm = cm(...)` raised TypeError when this
# cell was executed a second time, because `cm` was an array by then.
from sklearn.metrics import confusion_matrix

# Confusion matrix of the true test labels against the predicted labels.
cm = confusion_matrix(ytest, ypred)

In [19]:
# Unpack the confusion matrix cell by cell. In scikit-learn's layout rows are
# the true labels and columns the predicted labels, so row 0 holds the
# negatives and row 1 the positives.
TN, FP = cm[0][0], cm[0][1]
FN, TP = cm[1][0], cm[1][1]

In [20]:
def correctness(TN, FP, FN, TP):
    """Print accuracy, precision, recall and F1 score as whole percentages.

    Parameters
    ----------
    TN, FP, FN, TP : numeric counts
        True-negative, false-positive, false-negative and true-positive
        counts, passed in that order.

    Returns
    -------
    None
        The four metrics are printed, one per line; nothing is returned.

    Notes
    -----
    A metric whose denominator is zero (for example precision when nothing
    was predicted positive) is reported as 0 instead of raising
    ZeroDivisionError or producing NaN.  Percentages are truncated by
    ``int`` rather than rounded.
    """
    def percentage(numerator, denominator):
        # An undefined ratio is reported as 0 rather than crashing the cell.
        return (numerator / denominator * 100) if denominator else 0.0

    Accuracy = percentage(TP + TN, TP + TN + FP + FN)
    Precision = percentage(TP, TP + FP)
    Recall = percentage(TP, TP + FN)

    # F1 is the harmonic mean of precision and recall; it is undefined when
    # both are zero, so that case is reported as 0 as well.
    if Precision + Recall:
        F1_Score = 2 * Precision * Recall / (Precision + Recall)
    else:
        F1_Score = 0.0

    print("accuracy is:", int(Accuracy), "%")
    print("Precision is:", int(Precision), "%")
    print("Recall is:", int(Recall), "%")
    print("F1_Score is:", int(F1_Score), "%")

In [21]:
# Print the evaluation metrics computed from the confusion-matrix counts.
correctness(TN=TN, FP=FP, FN=FN, TP=TP)

accuracy is: 72 %
Precision is: 85 %
Recall is: 55 %
F1_Score is: 67 %
