In [None]:
# ML Models Used:
# Naive Bayesian
# Regression
# SVM
# Random Forest
# VADER

In [None]:
# Datasets Used:
# https://www.kaggle.com/datasets/joebeachcapital/restaurant-reviews
# https://www.kaggle.com/datasets/farukalam/yelp-restaurant-reviews

# **Data Preprocessing**

In [None]:
import pandas as pd
df1 = pd.read_csv("Yelp Restaurant Reviews.csv")
df2 = pd.read_csv("Restaurant reviews.csv")

In [None]:
df1.info()
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19896 entries, 0 to 19895
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Yelp URL     19896 non-null  object
 1   Rating       19896 non-null  int64 
 2   Date         19896 non-null  object
 3   Review Text  19896 non-null  object
dtypes: int64(1), object(3)
memory usage: 621.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Restaurant  10000 non-null  object 
 1   Reviewer    9962 non-null   object 
 2   Review      9955 non-null   object 
 3   Rating      9962 non-null   object 
 4   Metadata    9962 non-null   object 
 5   Time        9962 non-null   object 
 6   Pictures    10000 non-null  int64  
 7   7514        1 non-null      float64
dtypes: float64(1), int64(1), object(6)
memory usage: 625.1+ KB


In [None]:
df2.head(10)

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures,7514
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5,"1 Review , 2 Followers",5/25/2019 15:54,0,2447.0
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5,"3 Reviews , 2 Followers",5/25/2019 14:20,0,
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5,"2 Reviews , 3 Followers",5/24/2019 22:54,0,
3,Beyond Flavours,Swapnil Sarkar,Soumen das and Arun was a great guy. Only beca...,5,"1 Review , 1 Follower",5/24/2019 22:11,0,
4,Beyond Flavours,Dileep,Food is good.we ordered Kodi drumsticks and ba...,5,"3 Reviews , 2 Followers",5/24/2019 21:37,0,
5,Beyond Flavours,Nagabhavani K,"Ambiance is good, service is good, food is aPr...",5,1 Review,5/24/2019 15:22,0,
6,Beyond Flavours,Jamuna Bhuwalka,"Its a very nice place, ambience is different, ...",5,1 Review,5/24/2019 1:02,0,
7,Beyond Flavours,Sandhya S,Well after reading so many reviews finally vis...,4,1 Review,5/23/2019 15:01,0,
8,Beyond Flavours,Akash Thorat,"Excellent food , specially if you like spicy f...",5,"1 Review , 1 Follower",5/22/2019 23:12,0,
9,Beyond Flavours,Smarak Patnaik,Came for the birthday treat of a close friend....,5,"1 Review , 1 Follower",5/22/2019 22:37,0,


Data Cleaning

In [None]:
# Feature Reduction (drop unecessary columns)
df2 = df2.drop(columns=['Restaurant', 'Reviewer', 'Metadata', 'Time', 'Pictures', '7514'])
df1 = df1.drop(columns=['Yelp URL', 'Date'])

In [None]:
# Make both dataframes have same column names so they can be combined
df1.rename(columns={'Review Text' : 'Review'}, inplace=True)
df = pd.concat([df1, df2])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29896 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Rating  29858 non-null  object
 1   Review  29851 non-null  object
dtypes: object(2)
memory usage: 700.7+ KB


In [None]:
df.Rating.value_counts()

Rating
5       10883
4        4447
5        3832
4        2373
3        2069
1        1735
2        1280
1        1217
3        1193
2         684
4.5        69
3.5        47
2.5        19
1.5         9
Like        1
Name: count, dtype: int64

In [None]:
# Convert all Rating values to whole numbers by applying floor function on Rating
import numpy as np
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df['Rating'] = df['Rating'].apply(np.floor)

In [None]:
# Convert numeric ratings to discrete sentiment labels
# 1-2 => negative (neg)
# 3 => neutral (neu)
# 4-5 => positive (pos)
df['Rating'].mask(df['Rating'] == 1.0, 'neg', inplace=True)
df['Rating'].mask(df['Rating'] == 2.0, 'neg', inplace=True)
df['Rating'].mask(df['Rating'] == 3.0, 'neu', inplace=True)
df['Rating'].mask(df['Rating'] == 4.0, 'pos', inplace=True)
df['Rating'].mask(df['Rating'] == 5.0, 'pos', inplace=True)
df.head(10)

Unnamed: 0,Rating,Review
0,pos,All I can say is they have very good ice cream...
1,pos,Nice little local place for ice cream.My favor...
2,pos,A delicious treat on a hot day! Staff was very...
3,pos,This was great service and a fun crew! I got t...
4,pos,This is one of my favorite places to get ice c...
5,pos,I've been coming to this ice cream stand since...
6,neg,The soft-serve is way too sweet and has a stra...
7,pos,My husband and I stopped here on the way back ...
8,pos,"Little piece of heaven, the best ice cream sho..."
9,neg,Went here for the 1st time tonight. I ordered ...


In [None]:
# Clean data of null and duplicates
df = df.dropna().drop_duplicates()
df.Rating.value_counts()

Rating
pos    21073
neg     4914
neu     3290
Name: count, dtype: int64

In [None]:
# Uniformly distribute data (each label has equal number of occurances)
# Helps models better identify neutral instances
n = df['Rating'].value_counts().min()
df = df.groupby('Rating').head(n)
df.Rating.value_counts()

Rating
pos    3290
neg    3290
neu    3290
Name: count, dtype: int64

Basic Indexing and TFIDF Vectorization

In [None]:
import nltk

In [None]:
nltk.download('punkt')
nltk.download('wordnet')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
dfp = df
dfp.head(10)

Unnamed: 0,Rating,Review
0,pos,All I can say is they have very good ice cream...
1,pos,Nice little local place for ice cream.My favor...
2,pos,A delicious treat on a hot day! Staff was very...
3,pos,This was great service and a fun crew! I got t...
4,pos,This is one of my favorite places to get ice c...
5,pos,I've been coming to this ice cream stand since...
6,neg,The soft-serve is way too sweet and has a stra...
7,pos,My husband and I stopped here on the way back ...
8,pos,"Little piece of heaven, the best ice cream sho..."
9,neg,Went here for the 1st time tonight. I ordered ...


In [None]:
dfp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9870 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Rating  9870 non-null   object
 1   Review  9870 non-null   object
dtypes: object(2)
memory usage: 231.3+ KB


In [None]:
dfp['Review'] = dfp['Review'].astype(str)

In [None]:
# Tokenizer to overwrite sklearn's default tokenizer in its TfidfVectorizer
# Default tokenizer does not perform stemming or lemmitization, ours does
class LemmaTokenizer:
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`']
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if t not in self.ignore_tokens]

# Adpated From: https://gist.github.com/4OH4/f727af7dfc0e6bb0f26d2ea41d89ee55

In [None]:
tokenizer = LemmaTokenizer()

In [None]:
# Covert reviews to TFIDF vectors
# TfidfVectorizer automatically performs full pipeline (tokenization, stopword removal, lemmatization, word weighing)
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english', tokenizer=tokenizer)
#tfidf = TfidfVectorizer()
preprocessed = tfidf.fit_transform(dfp['Review'])



In [None]:
print(preprocessed.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.1099739  0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.03263081 0.         0.         ... 0.         0.         0.        ]]


# **Model Training and Testing**

In [None]:
# Split data into train and test sets
from sklearn.model_selection import train_test_split
X = preprocessed
y = dfp['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=30)

In [None]:
from sklearn.naive_bayes import ComplementNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
import time

Generating Our Own Models

In [None]:
# Complement Naive Bayesian model
CNB = ComplementNB()

start = time.time()
CNB.fit(X_train, y_train)
stop = time.time()
training_time = stop - start

cnb_prediction = CNB.predict(X_test)
cnb_accuracy = metrics.accuracy_score(cnb_prediction, y_test)

print('------------------------------------------------')
print('Complement Naive Bayesian Perfomance')
print('------------------------------------------------')
print('Accuracy: ',str('{:04.2f}'.format(cnb_accuracy*100))+'%')
print(f'Training Time: {training_time}s')
print('\nConfusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, cnb_prediction)))
print('\nClassification Report:')
print(classification_report(y_test, cnb_prediction))

------------------------------------------------
Complement Naive Bayesian Perfomance
------------------------------------------------
Accuracy:  73.10%
Training Time: 0.14009594917297363s

Confusion Matrix:
     0    1    2
0  466  134   59
1  140  341  181
2   14    3  636

Classification Report:
              precision    recall  f1-score   support

         neg       0.75      0.71      0.73       659
         neu       0.71      0.52      0.60       662
         pos       0.73      0.97      0.83       653

    accuracy                           0.73      1974
   macro avg       0.73      0.73      0.72      1974
weighted avg       0.73      0.73      0.72      1974



In [None]:
#SVM Model
SVM = svm.SVC(kernel='linear')

start = time.time()
SVM.fit(X_train, y_train)
stop = time.time()
training_time = stop - start

svm_predictions = SVM.predict(X_test)
svm_accuracy = metrics.accuracy_score(svm_predictions, y_test)

print('------------------------------------------------')
print('SVM Perfomance')
print('------------------------------------------------')
print('Accuracy: ',str('{:04.2f}'.format(svm_accuracy*100))+'%')
print(f'Training Time: {training_time}s')
print('\nConfusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, svm_predictions)))
print('\nClassification Report:')
print(classification_report(y_test, svm_predictions))

------------------------------------------------
SVM Perfomance
------------------------------------------------
Accuracy:  76.09%
Training Time: 18.11628532409668s

Confusion Matrix:
     0    1    2
0  487  154   18
1  155  422   85
2   15   45  593

Classification Report:
              precision    recall  f1-score   support

         neg       0.74      0.74      0.74       659
         neu       0.68      0.64      0.66       662
         pos       0.85      0.91      0.88       653

    accuracy                           0.76      1974
   macro avg       0.76      0.76      0.76      1974
weighted avg       0.76      0.76      0.76      1974



In [None]:
# Logistic Regression Model
log_reg = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')

start = time.time()
log_reg.fit(X_train, y_train)
stop = time.time()
training_time = stop - start

lr_predictions = log_reg.predict(X_test)
lr_accuracy = metrics.accuracy_score(lr_predictions, y_test)

print('------------------------------------------------')
print('Logistic Regression Perfomance')
print('------------------------------------------------')
print('Accuracy: ',str('{:04.2f}'.format(lr_accuracy*100))+'%')
print(f'Training Time: {training_time}s')
print('\nConfusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, lr_predictions)))
print('\nClassification Report:')
print(classification_report(y_test, lr_predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


------------------------------------------------
Logistic Regression Perfomance
------------------------------------------------
Accuracy:  76.24%
Training Time: 7.447905540466309s

Confusion Matrix:
     0    1    2
0  494  145   20
1  152  423   87
2   19   46  588

Classification Report:
              precision    recall  f1-score   support

         neg       0.74      0.75      0.75       659
         neu       0.69      0.64      0.66       662
         pos       0.85      0.90      0.87       653

    accuracy                           0.76      1974
   macro avg       0.76      0.76      0.76      1974
weighted avg       0.76      0.76      0.76      1974



In [None]:
# Random Forest Model
rfc = RandomForestClassifier()

start = time.time()
rfc.fit(X_train, y_train)
stop = time.time()
training_time = stop - start

rfc_predictions = rfc.predict(X_test)
rfc_accuracy = metrics.accuracy_score(rfc_predictions, y_test)

print('------------------------------------------------')
print('Random Forest Perfomance')
print('------------------------------------------------')
print('Accuracy: ',str('{:04.2f}'.format(rfc_accuracy*100))+'%')
print(f'Training Time: {training_time}s')
print('\nConfusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, rfc_predictions)))
print('\nClassification Report:')
print(classification_report(y_test, rfc_predictions))

------------------------------------------------
Random Forest Perfomance
------------------------------------------------
Accuracy:  72.49%
Training Time: 19.61721444129944s

Confusion Matrix:
     0    1    2
0  472  139   48
1  171  361  130
2   19   36  598

Classification Report:
              precision    recall  f1-score   support

         neg       0.71      0.72      0.71       659
         neu       0.67      0.55      0.60       662
         pos       0.77      0.92      0.84       653

    accuracy                           0.72      1974
   macro avg       0.72      0.73      0.72      1974
weighted avg       0.72      0.72      0.72      1974



Using NLTK's VADER Analyzer (Rules Based Model)

In [None]:
# Generate score dictionary for every review
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
dft = df
dft['scores'] = dft['Review'].apply(lambda Rating: sid.polarity_scores(Rating))
dft.head(10)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Unnamed: 0,Rating,Review,scores
0,pos,All I can say is they have very good ice cream...,"{'neg': 0.0, 'neu': 0.641, 'pos': 0.359, 'comp..."
1,pos,Nice little local place for ice cream.My favor...,"{'neg': 0.022, 'neu': 0.66, 'pos': 0.318, 'com..."
2,pos,A delicious treat on a hot day! Staff was very...,"{'neg': 0.0, 'neu': 0.651, 'pos': 0.349, 'comp..."
3,pos,This was great service and a fun crew! I got t...,"{'neg': 0.02, 'neu': 0.752, 'pos': 0.227, 'com..."
4,pos,This is one of my favorite places to get ice c...,"{'neg': 0.027, 'neu': 0.73, 'pos': 0.243, 'com..."
5,pos,I've been coming to this ice cream stand since...,"{'neg': 0.0, 'neu': 0.709, 'pos': 0.291, 'comp..."
6,neg,The soft-serve is way too sweet and has a stra...,"{'neg': 0.152, 'neu': 0.716, 'pos': 0.132, 'co..."
7,pos,My husband and I stopped here on the way back ...,"{'neg': 0.058, 'neu': 0.673, 'pos': 0.27, 'com..."
8,pos,"Little piece of heaven, the best ice cream sho...","{'neg': 0.0, 'neu': 0.764, 'pos': 0.236, 'comp..."
9,neg,Went here for the 1st time tonight. I ordered ...,"{'neg': 0.057, 'neu': 0.861, 'pos': 0.082, 'co..."


In [None]:
# Compund score dictionary into a single value
dft['compound'] = dft['scores'].apply(lambda score_dict: score_dict['compound'])
dft.head(10)

Unnamed: 0,Rating,Review,scores,compound
0,pos,All I can say is they have very good ice cream...,"{'neg': 0.0, 'neu': 0.641, 'pos': 0.359, 'comp...",0.8803
1,pos,Nice little local place for ice cream.My favor...,"{'neg': 0.022, 'neu': 0.66, 'pos': 0.318, 'com...",0.9509
2,pos,A delicious treat on a hot day! Staff was very...,"{'neg': 0.0, 'neu': 0.651, 'pos': 0.349, 'comp...",0.8803
3,pos,This was great service and a fun crew! I got t...,"{'neg': 0.02, 'neu': 0.752, 'pos': 0.227, 'com...",0.94
4,pos,This is one of my favorite places to get ice c...,"{'neg': 0.027, 'neu': 0.73, 'pos': 0.243, 'com...",0.9924
5,pos,I've been coming to this ice cream stand since...,"{'neg': 0.0, 'neu': 0.709, 'pos': 0.291, 'comp...",0.9949
6,neg,The soft-serve is way too sweet and has a stra...,"{'neg': 0.152, 'neu': 0.716, 'pos': 0.132, 'co...",0.0288
7,pos,My husband and I stopped here on the way back ...,"{'neg': 0.058, 'neu': 0.673, 'pos': 0.27, 'com...",0.9353
8,pos,"Little piece of heaven, the best ice cream sho...","{'neg': 0.0, 'neu': 0.764, 'pos': 0.236, 'comp...",0.9406
9,neg,Went here for the 1st time tonight. I ordered ...,"{'neg': 0.057, 'neu': 0.861, 'pos': 0.082, 'co...",0.4348


In [None]:
# Covert compunded score into discrete labels
# -1 -- -0.05 => neg
# -0.05 -- 0.05 => neu
# 0.05 -- 1 => pos
def predict(x):
  if x > 0.05:
    return 'pos'
  elif x < -0.05:
    return 'neg'
  return 'neu'

# Apply function to convert VADER's compund score into a prediction (using label)
dft['pred_rating'] = dft['compound'].apply(predict)
dft.head(10)

Unnamed: 0,Rating,Review,scores,compound,pred_rating
0,pos,All I can say is they have very good ice cream...,"{'neg': 0.0, 'neu': 0.641, 'pos': 0.359, 'comp...",0.8803,pos
1,pos,Nice little local place for ice cream.My favor...,"{'neg': 0.022, 'neu': 0.66, 'pos': 0.318, 'com...",0.9509,pos
2,pos,A delicious treat on a hot day! Staff was very...,"{'neg': 0.0, 'neu': 0.651, 'pos': 0.349, 'comp...",0.8803,pos
3,pos,This was great service and a fun crew! I got t...,"{'neg': 0.02, 'neu': 0.752, 'pos': 0.227, 'com...",0.94,pos
4,pos,This is one of my favorite places to get ice c...,"{'neg': 0.027, 'neu': 0.73, 'pos': 0.243, 'com...",0.9924,pos
5,pos,I've been coming to this ice cream stand since...,"{'neg': 0.0, 'neu': 0.709, 'pos': 0.291, 'comp...",0.9949,pos
6,neg,The soft-serve is way too sweet and has a stra...,"{'neg': 0.152, 'neu': 0.716, 'pos': 0.132, 'co...",0.0288,neu
7,pos,My husband and I stopped here on the way back ...,"{'neg': 0.058, 'neu': 0.673, 'pos': 0.27, 'com...",0.9353,pos
8,pos,"Little piece of heaven, the best ice cream sho...","{'neg': 0.0, 'neu': 0.764, 'pos': 0.236, 'comp...",0.9406,pos
9,neg,Went here for the 1st time tonight. I ordered ...,"{'neg': 0.057, 'neu': 0.861, 'pos': 0.082, 'co...",0.4348,pos


In [None]:
# Checks how well VADER performed on our data
vader_accuracy = metrics.accuracy_score(dft['Rating'], dft['pred_rating'])

print('------------------------------------------------')
print('VADER Perfomance')
print('------------------------------------------------')
print('Accuracy: ',str('{:04.2f}'.format(vader_accuracy*100))+'%')
print('\nConfusion Matrix:')
print(pd.DataFrame(confusion_matrix(dft['Rating'], dft['pred_rating'])))
print('\nClassification Report:')
print(classification_report(dft['Rating'], dft['pred_rating']))

------------------------------------------------
VADER Perfomance
------------------------------------------------
Accuracy:  49.35%

Confusion Matrix:
      0    1     2
0  1519  140  1631
1   486  157  2647
2    72   23  3195

Classification Report:
              precision    recall  f1-score   support

         neg       0.73      0.46      0.57      3290
         neu       0.49      0.05      0.09      3290
         pos       0.43      0.97      0.59      3290

    accuracy                           0.49      9870
   macro avg       0.55      0.49      0.42      9870
weighted avg       0.55      0.49      0.42      9870



# **Pickling Vectorizer and Models**

In [None]:
import pickle

# pickling the vectorizer
pickle.dump(tfidf, open('tfidf_vectorizer.sav', 'wb'))

# pickling the models
pickle.dump(CNB, open('CNB_model.sav', 'wb'))
pickle.dump(SVM, open('svm_model.sav', 'wb'))
pickle.dump(log_reg, open('regression_model.sav', 'wb'))
pickle.dump(rfc, open('rfc_model.sav', 'wb'))

# **Model Demo**

In [None]:
# DEMO
# Input your own text (review) and see how each model classifies it

# Open models and TFIDF vectorizer from pickled .sav files
vectorizer = pickle.load(open('tfidf_vectorizer.sav', 'rb'))
CNB_demo = pickle.load(open('CNB_model.sav', 'rb'))
reg_demo = pickle.load(open('regression_model.sav', 'rb'))
svm_demo = pickle.load(open('svm_model.sav', 'rb'))
rfc_demo = pickle.load(open('rfc_model.sav', 'rb'))

# Take in user input an generate predictions if input is not null
text = str(input("Type text and hit [ENTER]:\n"))

if text == None:
  print("ERROR: NO TEXT ENTERED")
else:
  text_vector = vectorizer.transform([text])
  CNB_result = CNB_demo.predict(text_vector)
  reg_result = reg_demo.predict(text_vector)
  svm_result = svm_demo.predict(text_vector)
  rfc_result = rfc_demo.predict(text_vector)
  vader_result = predict(sid.polarity_scores(text)['compound'])

  print('\n')
  print('------------------------------------------------')
  print("Predictions")
  print('------------------------------------------------')
  print(f"Complement Naive Bayes: {CNB_result[0]}")
  print(f"Logistic Regression: {reg_result[0]}")
  print(f"SVM: {svm_result[0]}")
  print(f"Random Forest: {CNB_result[0]}")
  print(f"VADER: {vader_result}")

Type text and hit [ENTER]:
Lol @u


------------------------------------------------
Predictions
------------------------------------------------
Complement Naive Bayes: pos
Logistic Regression: neg
SVM: pos
Random Forest: pos
VADER: pos


