In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import re
import nltk
%matplotlib inline

In [2]:
pd.set_option('display.max_columns',50)

In [3]:
df = pd.read_csv("Restaurant_Review_Data.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Review,Ratings
0,0,Incredibly stingy and disappointing sushi (pic...,1
1,1,Price is super high for the food and service.,1
2,2,"I love this place but if you go there, you sho...",2
3,3,The atmosphere & service were outstanding. I h...,5
4,4,"Menu was smaller than. Lunch menu, food was am...",5


In [5]:
df = df[['Review','Ratings']]

In [6]:
#shape of the dataset
df.shape

(81923, 2)

In [7]:
#general information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81923 entries, 0 to 81922
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Review   81923 non-null  object
 1   Ratings  81923 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


In [8]:
# Placing the ratings into 2 categories : Positive & Negative 

df['Ratings'] = df['Ratings'].replace(to_replace = {1,2},value = 'Negative')
df['Ratings'] = df['Ratings'].replace(to_replace = {3,4,5},value = 'Positive')

In [9]:
#Checking if dataset is balanced/not balanced
df['Ratings'].value_counts()

Negative    41695
Positive    40228
Name: Ratings, dtype: int64

In [10]:
#Checking for the presence of null values
df.isnull().sum()

Review     0
Ratings    0
dtype: int64

In [11]:
#Checking for the presence of duplicate rows
df[df.duplicated()]

Unnamed: 0,Review,Ratings
2338,Great food and extra friendly service.,Positive
4516,Great place,Positive
6293,Great food and service,Positive
10246,Anniversary,Positive
11282,Awesome as always!,Positive
...,...,...
79987,It was amazing,Positive
80033,Dave's Famous Reuben\nVery poor quality with n...,Negative
80875,Great overall experience,Positive
80992,Great service,Positive


In [12]:
# There are a total of 418 rows which are duplicate and have to be removed.
df.duplicated().sum()

185

In [13]:
df = df.drop_duplicates()

In [14]:
df = df.reset_index(drop = True)

In [15]:
df.shape

(81738, 2)

In [16]:
# The reviews contain HTML tags and punctuation marks.

In [17]:
#Removing the HTML tags
for i in range(0,df.shape[0]):
    soup = BeautifulSoup(df['Review'][i])
    df['Review'][i] = soup.get_text()



In [18]:
#Removing the punctuation marks
for i in range(0,df.shape[0]):
    df['Review'][i] = re.sub(r'[^\w\s]',' ',df['Review'][i])

In [19]:
#Making the review into lowercase
df['Review'] = df['Review'].apply(str.lower)

In [20]:
#When webscraping was done, reviews of all languages were taken. Hence, it is important to filter out the english reviews.
from langdetect import detect 

In [21]:
def language_detection(s):
    return detect(s)

In [22]:
df['Language_of_Review'] = df['Review'].apply(language_detection)

In [23]:
df['Language_of_Review'].value_counts()

en       80892
es         223
fr         100
it          77
af          68
de          53
ro          45
ca          40
so          27
nl          25
da          25
pt          24
no          24
cy          21
tl          13
ko           9
sw           8
et           8
hr           7
tr           7
sv           6
sl           6
vi           4
fi           4
hu           4
zh-cn        4
id           3
sk           3
sq           2
pl           2
ja           2
cs           1
lt           1
Name: Language_of_Review, dtype: int64

In [24]:
df = df[df['Language_of_Review'] == "en"]

In [25]:
df.shape

(80892, 3)

In [26]:
# The dataset is still almost balanced after filtering of the reviews.
df['Ratings'].value_counts()

Negative    41492
Positive    39400
Name: Ratings, dtype: int64

In [27]:
from nltk.corpus import stopwords
sw = stopwords.words('english')

In [28]:
def removal_of_stop_words(s):
    l = []
    tokens = nltk.word_tokenize(s)
    for i in tokens: 
        if i not in sw:
            l.append(i)
    return ' '.join(l)

In [29]:
#Removing the stop words from the sentence    
df['Review'] = df['Review'].apply(removal_of_stop_words)

In [30]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()

In [31]:
def lemmatization(s):
    l = []
    tokens = nltk.word_tokenize(s)
    for i in tokens: 
        l.append(wl.lemmatize(i))
    return ' '.join(l)

In [32]:
df['Review'] = df['Review'].apply(lemmatization)

In [33]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df['Review'],df['Ratings'],test_size = 0.2, random_state = 60,stratify = df['Ratings'])

In [34]:
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
y_train = lb.fit_transform(y_train)
y_test = lb.fit_transform(y_test)

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

In [36]:
vec = CountVectorizer()
vec.fit(X_train)
X_train = vec.transform(X_train)
X_test = vec.transform(X_test)

In [37]:
print(X_train.shape)

(64713, 40868)


In [38]:
print(X_test.shape)

(16179, 40868)


In [39]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [40]:
model = MultinomialNB()
parameters = {"alpha":[10**-2,10**-1,10**0,10**1,10**2]}
grid_search = GridSearchCV(model,param_grid = parameters,scoring = "accuracy",cv = 5, n_jobs = 2, return_train_score = True)

In [41]:
grid_search.fit(X_train,y_train)

  return f(**kwargs)


GridSearchCV(cv=5, estimator=MultinomialNB(), n_jobs=2,
             param_grid={'alpha': [0.01, 0.1, 1, 10, 100]},
             return_train_score=True, scoring='accuracy')

In [42]:
print(grid_search.best_score_)
print(grid_search.best_estimator_)

0.8684344643781425
MultinomialNB(alpha=1)


In [43]:
model2 = RandomForestClassifier()

In [44]:
parameters = {"n_estimators":[100,150,200],"max_depth":[1,2,3,4,5,6]}
grid_tree = GridSearchCV(model2,param_grid = parameters,cv = 10, n_jobs = -1)
grid_tree.fit(X_train,y_train)

  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6],
                         'n_estimators': [100, 150, 200]})

In [45]:
print(grid_tree.best_params_)
print(grid_tree.best_score_)

{'max_depth': 5, 'n_estimators': 200}
0.8109809353855816


In [46]:
model3 = AdaBoostClassifier()

In [47]:
model3.fit(X_train,y_train)
pred = model3.predict(X_test)
print(accuracy_score(pred,y_test))

  return f(**kwargs)


0.8235366833549663


parameters = {"n_estimators":[100,150,200],"learning_rate":[0.6,0.8,1.0],"algorithm":['SAMME', 'SAMME.R']}
grid_ada = GridSearchCV(model3,param_grid = parameters,cv = 10, n_jobs = -1)
grid_ada.fit(X_train,y_train)

print(grid_ada.best_params_)
print(grid_ada.best_score_)

In [48]:
model4 = GradientBoostingClassifier()

In [49]:
model4.fit(X_train,y_train)
pred = model4.predict(X_test)
print(accuracy_score(pred,y_test))

  return f(**kwargs)


0.8396687063477347


parameters = {"n_estimators":[100,150,200],"learning_rate":[0.1,0.2,0.3],"criterion":['friedman_mse', 'squared_error']}
grid_gradient = GridSearchCV(model4,param_grid = parameters,cv = 10, n_jobs = -1)
grid_gradient.fit(X_train,y_train)

print(grid_gradient.best_params_)
print(grid_gradient.best_score_)

In [51]:
final_pred = grid_search.predict(X_test)

In [55]:
from sklearn.metrics import classification_report,confusion_matrix

In [56]:
print(classification_report(y_test,final_pred))

              precision    recall  f1-score   support

           0       0.88      0.86      0.87      8299
           1       0.86      0.88      0.87      7880

    accuracy                           0.87     16179
   macro avg       0.87      0.87      0.87     16179
weighted avg       0.87      0.87      0.87     16179



In [63]:
model.fit(X_train,y_train)
p = model.predict(X_test)
print(accuracy_score(y_test,p))

0.8676679646455282


  return f(**kwargs)


In [64]:
from joblib import dump
dump(vec,'count_vectorizer.joblib')
dump(model,'model.joblib')

['model.joblib']