In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Tab-separated reviews file. quoting=3 is csv.QUOTE_NONE, so quote characters
# inside a review are kept as ordinary text instead of being parsed as delimiters.
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=3)

In [4]:
# Preview the first rows to confirm the Review / Liked columns loaded as expected.
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [8]:
# Count reviews per label to check how balanced the two classes are.
dataset['Liked'].value_counts()

1    500
0    500
Name: Liked, dtype: int64

## Cleaning text

In [11]:
import re
import nltk
nltk.download('stopwords')  # fetch the NLTK stop-word lists; stop words are filtered out below because they carry little signal
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # reduces each word to its stem

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
# Build the stemmer and the stop-word collection once, outside the loop: they
# are identical for every review, and a set gives constant-time membership
# tests instead of rebuilding a set from the list for every single token.
ps = PorterStemmer()

# Words removed from the stop-word list so that they survive cleaning.
# NOTE(review): the regex below replaces apostrophes with spaces before the
# text is tokenised, so "aren't" / "isn't" can never match a token; only 'not'
# has an effect here. Confirm whether the contractions were meant to be kept.
keptWords = {'not', "aren't", "isn't"}
allStopWord = [word for word in stopwords.words('english') if word not in keptWords]
stopWordSet = set(allStopWord)

corpus = []
# Iterate over the column values directly so the loop does not depend on the
# frame having a default 0..n-1 index.
for text in dataset['Review']:
  review = re.sub('[^a-zA-Z]', ' ', text)  # keep letters only
  review = review.lower().split()
  review = [ps.stem(word) for word in review if word not in stopWordSet]
  review = ' '.join(review)
  corpus.append(review)

In [38]:
# Inspect the first four cleaned reviews.
corpus[:4]

['wow love place',
 'crust not good',
 'not tasti textur nasti',
 'stop late may bank holiday rick steve recommend love']

## Creating a Bag of Words model

In [39]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of words: learn a vocabulary capped at 1500 terms and count term
# occurrences per cleaned review.
# NOTE(review): the vocabulary is fitted on the whole corpus before the
# train/test split further down, so held-out reviews influence the feature set.
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()  # dense array, as consumed by the models below
y = dataset['Liked']

In [53]:
# Show the (truncated) document-term count matrix.
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [40]:
# Number of feature columns, i.e. the size of the learned vocabulary.
X.shape[1]

1500

In [76]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

In [77]:
from sklearn.naive_bayes import GaussianNB

# First model: Gaussian Naive Bayes trained on the bag-of-words counts.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [78]:
# Predict labels for the held-out reviews with the classifier fitted above.
prediction = classifier.predict(X_test)

In [79]:
from sklearn.metrics import confusion_matrix, classification_report

# Score the held-out predictions: the confusion matrix first, then the
# per-class precision / recall / F1 report.
for build_report in (confusion_matrix, classification_report):
  print(build_report(y_test, prediction))

[[55 47]
 [19 79]]
              precision    recall  f1-score   support

           0       0.74      0.54      0.62       102
           1       0.63      0.81      0.71        98

    accuracy                           0.67       200
   macro avg       0.69      0.67      0.67       200
weighted avg       0.69      0.67      0.66       200



In [82]:
from sklearn.ensemble import RandomForestClassifier

# Second model: a 100-tree random forest. random_state pins the randomness
# used while growing the trees, so a fresh Restart & Run All rebuilds the same
# forest and reports the same metrics (same seed as the train/test split).
# Note: this rebinds `classifier` and `prediction`, replacing the Naive Bayes
# model; anything that reads `classifier` afterwards uses the forest.
classifier = RandomForestClassifier(n_estimators=100, random_state=101)
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)

print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

[[87 15]
 [18 80]]
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       102
           1       0.84      0.82      0.83        98

    accuracy                           0.83       200
   macro avg       0.84      0.83      0.83       200
weighted avg       0.84      0.83      0.83       200



## New review predictions

In [83]:
def newReview(new_review):
  """Clean one raw review string and print the label predicted for it.

  The text is cleaned the same way as the training corpus: non-letters become
  spaces, the text is lower-cased and split into words, stop words (except
  'not') are dropped, and the remaining words are Porter-stemmed. The cleaned
  review is then vectorised with the already-fitted ``cv`` and scored with
  ``classifier``.

  ``cv`` and ``classifier`` are notebook-level globals, so the prediction comes
  from whichever model was most recently bound to ``classifier``.

  Args:
    new_review: raw review text.

  Returns:
    None. The array returned by ``classifier.predict`` is printed.
  """
  ps = PorterStemmer()
  # Build the stop-word set once per call rather than once per token;
  # 'not' is kept in the text so negations survive cleaning.
  all_stopwords = set(stopwords.words('english')) - {'not'}
  tokens = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
  new_review = ' '.join(ps.stem(word) for word in tokens if word not in all_stopwords)
  new_corpus = [new_review]
  new_X_test = cv.transform(new_corpus).toarray()
  new_y_pred = classifier.predict(new_X_test)
  print(new_y_pred)

In [84]:
# Sanity-check the pipeline on one clearly negative and one clearly positive review.
newReview('I hate this restaurant so much')
newReview('I love this restaurant so much')

[0]
[1]
