In [123]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [124]:
def create_dataframe(filename):
    """Load a tab-separated file into a pandas DataFrame.

    Parameters
    ----------
    filename : str or path-like
        Location of the TSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``filename``.

    Raises
    ------
    Exception
        Whatever ``pd.read_csv`` raised (e.g. ``FileNotFoundError`` for a
        missing file). The failure message is printed first and the original
        exception is then re-raised unchanged.
    """
    try:
        # quoting=3 is csv.QUOTE_NONE: quote characters are treated as ordinary text.
        df = pd.read_csv(filename, delimiter='\t', quoting=3)
    except Exception:
        print("The file could not be loaded.")
        # Re-raise so the caller sees the real cause instead of an
        # UnboundLocalError from returning a variable that was never assigned.
        raise
    # Only reached when read_csv succeeded, so the message is always truthful.
    print("A DataFrame object was successfully created.")
    return df

In [125]:
# Load the tab-separated review file into a DataFrame.
df = create_dataframe('Restaurant_Reviews.tsv')
# Bare last expression so the notebook renders the frame as the cell output.
df

A DataFrame object was successfully created.


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [126]:
# Cleaning up the text
# Import the necessary libraries
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [127]:
def clean_text(review):
    """Normalise one review into a space-separated string of stemmed tokens.

    Steps: replace every character that is not an ASCII letter with a space,
    lower-case the text, split on whitespace, drop NLTK English stop words,
    and Porter-stem what remains.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The surviving stems joined by single spaces (empty string when no
        token survives).
    """
    # Build the stop-word set once per call rather than once per word.
    # NOTE(review): stop-word removal may also drop negations such as "not",
    # which can matter for sentiment -- confirm against the NLTK list.
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    words = re.sub('[^a-zA-Z]', ' ', review).lower().split()
    return ' '.join(ps.stem(word) for word in words if word not in stop_words)

In [128]:
def create_corpus(df):
    """Return a list holding the cleaned text of every entry in ``df['Review']``.

    Each entry is passed through ``clean_text``; the order of the column is
    preserved in the returned list.
    """
    return [clean_text(raw_review) for raw_review in df['Review']]

In [129]:
# Clean every review in the 'Review' column; the result is a list of strings.
corpus = create_corpus(df)

In [130]:
# Creating the bag-of-words feature matrix
from sklearn.feature_extraction.text import CountVectorizer
# Cap the vocabulary at 1500 features.
cv = CountVectorizer(max_features = 1500)
# .toarray() converts the vectorizer's output to a dense NumPy array.
# NOTE(review): the vocabulary is fit on the whole corpus, i.e. before any
# train/test split of X -- confirm this is acceptable for the evaluation.
X = cv.fit_transform(corpus).toarray()
# Target labels taken from the 'Liked' column.
y = df['Liked'].values

In [131]:
# Train and Test The Machine Learning Model
# We will try using the Naive Bayes classifier

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

In [132]:
# Hold out 20% of the samples for testing; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Fit a Gaussian Naive Bayes classifier on the training portion.
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [133]:
# Predict labels for the held-out test features.
y_pred = classifier.predict(X_test)

In [134]:
# Confusion matrix of true labels (first argument) against predicted labels (second argument).
cm = confusion_matrix(y_test, y_pred)

In [135]:
print(cm)
# Accuracy = correctly classified samples (the diagonal of the confusion
# matrix) divided by all samples. Using trace/sum avoids hard-coding the 2x2
# indices, so the same line stays correct for any number of classes.
accuracy = cm.trace() / cm.sum()
print('The accuracy of the model is: ', accuracy)

[[55 42]
 [12 91]]
The accuracy of the model is:  0.73


In [161]:
# TRY OUT THE MODEL!
# ADD YOUR OWN REVIEW BY EXECUTING THE CELL BELOW!
# NOTE that the model was trained on a set of 800 reviews. The more reviews for training the better the predictions will be.

In [191]:
# Prompt for a review, clean it with clean_text, and encode it with the
# existing vectorizer `cv` (transform only -- no re-fitting).
user_review = cv.transform([clean_text(input('Please enter your review here: '))]).toarray()

Please enter your review here: An absolute disaster!


In [192]:
# Classify the encoded review and report the outcome; label 0 is reported as negative.
model_prediction = classifier.predict(user_review)
verdict = 'The review was NEGATIVE.' if model_prediction[0] == 0 else 'The review was POSITIVE.'
print(verdict)

The review was NEGATIVE.
