## Introduction

Yelp is an online platform that publishes crowd-sourced reviews of businesses. It offers a space where users can share their experiences so that others can make informed decisions.

## 1: Reading in the Yelp Reviews

In [1]:
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides any pandas / scikit-learn warnings
# raised by later cells - consider narrowing it while debugging.
import warnings
warnings.filterwarnings('ignore')

In [17]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
from sklearn import metrics
from textblob import TextBlob, Word
import nltk
import tensorflow as tf
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline

In [3]:
# mount Google Drive at /gdrive (Colab only); yelp.csv itself is read in the next cell
# NOTE(review): the read_csv call below uses a relative path, not /gdrive - confirm where the file lives
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [93]:

# load the reviews from yelp.csv (path relative to the working directory) and preview the first rows
yelp = pd.read_csv('yelp.csv')
yelp.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [4]:
# (rows, columns) of the full review table
yelp.shape

(10000, 10)

In [94]:
# create a new DataFrame that only contains the 5-star and 1-star reviews
# .copy() makes the subset an independent frame, so the 'sentiment' column that is
# added to it later in the notebook does not trigger SettingWithCopyWarning
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)].copy()
yelp_best_worst.shape

(4086, 10)

In [5]:
# features are the raw review text; the target is the star rating (1 or 5)
X = yelp_best_worst['text']
y = yelp_best_worst['stars']

In [6]:
# split the 1-star / 5-star reviews into training and testing sets
# random_state=1 makes the shuffle reproducible; no stratify argument is passed,
# so the class proportions of the two splits are not forced to match

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [8]:
# number of training reviews
X_train.shape

(3064,)

In [9]:
# number of held-out test reviews
X_test.shape

(1022,)

## 2: Tokenization: Separate text into units such as sentences or words

In [10]:
# peek at the raw training text before it is tokenized
X_train

6841    FILLY-B's!!!!!  only 8 reviews?? NINE now!!!\n...
1728    My husband and I absolutely LOVE this restaura...
3853    We went today after lunch. I got my usual of l...
671     Totally dissapointed.  I had purchased a coupo...
4920    Costco Travel - My husband and I recently retu...
                              ...                        
9396    Pros: \n-No breed restrictions on dogs\n-Washe...
2661    Sorry Banana Leaf... I'm usually not picky at ...
9756    Alright this is the deal of deals, 2.75 for st...
554     Hands down a great lil joint! Gotta get the gu...
2575    Absolutely disgusting.  I had enchiladas and a...
Name: text, Length: 3064, dtype: object

In [7]:
# bag-of-words vectorizer used to build document-term matrices from X_train and X_test:
# text is lowercased, English stop words are removed and min_df=20 filters out rare terms
vect = CountVectorizer(
    min_df=20,
    lowercase=True,
    stop_words='english',
)

In [8]:
# Tokenisation and vectorisation: fit the vectorizer on the training reviews only,
# so the held-out test reviews play no part in building the vocabulary

vect.fit(X_train)

In [9]:
# encode the training reviews, then show the document-term matrix as a dense,
# labelled table (one column per vocabulary term) purely for inspection
X_train_dtm = vect.transform(X_train)
demo = pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names_out())
demo

Unnamed: 0,00,10,100,11,12,13,14,15,16,18,...,year,years,yelp,yes,yesterday,yogurt,york,young,yum,yummy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3059,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3060,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3061,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3062,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# encode the test reviews with the already-fitted vocabulary and show the
# resulting document-term matrix as a dense, labelled table for inspection
X_test_dtm = vect.transform(X_test)
demotest = pd.DataFrame(X_test_dtm.toarray(), columns=vect.get_feature_names_out())
demotest

Unnamed: 0,00,10,100,11,12,13,14,15,16,18,...,year,years,yelp,yes,yesterday,yogurt,york,young,yum,yummy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1020,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,3,0,0


In [11]:
# fit multinomial Naive Bayes on the training counts and label the test reviews
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

# each report entry is printed as a heading line followed by its value
report_sections = (
    ("Number of Features", X_train_dtm.shape[1]),
    ("Training Accuracy", nb.score(X_train_dtm, y_train)),
    ("Testing Accuracy", nb.score(X_test_dtm, y_test)),
    ("Confusion Matrix", confusion_matrix(y_test, y_pred_class)),
    ("Classifcation Report", classification_report(y_test, y_pred_class)),
)
for heading, value in report_sections:
    print(heading)
    print(value)

Number of Features
1352
Training Accuracy
0.9445169712793734
Testing Accuracy
0.9050880626223092
Confusion Matrix
[[147  37]
 [ 60 778]]
Classifcation Report
              precision    recall  f1-score   support

           1       0.71      0.80      0.75       184
           5       0.95      0.93      0.94       838

    accuracy                           0.91      1022
   macro avg       0.83      0.86      0.85      1022
weighted avg       0.91      0.91      0.91      1022



In [12]:
# Same Naive Bayes model, with vectorizing and classification chained in one Pipeline
# so the raw text can be passed straight to fit / predict / score.
# The steps are given as a list of (name, estimator) pairs, which is the documented form.
# No min_df is set here, so the full vocabulary is kept.
pipe = Pipeline([
    ("vect", CountVectorizer(stop_words='english', lowercase=True)),
    ("nb", MultinomialNB()),
])
pipe.fit(X_train, y_train)
y_pred_class = pipe.predict(X_test)

print("Training Accuracy")
print(pipe.score(X_train, y_train))
print("Testing Accuracy")
print(pipe.score(X_test, y_test))
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred_class))
print("Classifcation Report")
print(classification_report(y_test, y_pred_class))

Training Accuracy
0.9758485639686684
Testing Accuracy
0.9158512720156555
Confusion Matrix
[[124  60]
 [ 26 812]]
Classifcation Report
              precision    recall  f1-score   support

           1       0.83      0.67      0.74       184
           5       0.93      0.97      0.95       838

    accuracy                           0.92      1022
   macro avg       0.88      0.82      0.85      1022
weighted avg       0.91      0.92      0.91      1022



### Building a Deep Learning Model

In [13]:
# Shift the star ratings from 1-5 down to 0-4 so they can be used as class indices.
# The guard makes the cell safe to re-run: once the minimum is 0 the shift has already
# been applied and must not be applied a second time.
if yelp['stars'].min() >= 1:
    yelp['stars'] = yelp['stars'] - 1
yelp.describe()

Unnamed: 0,stars,cool,useful,funny
count,10000.0,10000.0,10000.0,10000.0
mean,2.7775,0.8768,1.4093,0.7013
std,1.214636,2.067861,2.336647,1.907942
min,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0
50%,3.0,0.0,1.0,0.0
75%,4.0,1.0,2.0,1.0
max,4.0,77.0,76.0,57.0


In [14]:
# all reviews this time: review text as the feature, star column as the target
X = yelp['text']
y = yelp['stars']

In [15]:
# split the full review set into training and testing sets (reproducible via random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [16]:
# unigram + bigram counts with English stop words removed; min_df=10 filters out rare terms
vect = CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    lowercase=True,
    min_df=10,
)
X_train_dtm = vect.fit_transform(X_train)   # vocabulary is learned from the training split
X_test_dtm = vect.transform(X_test)         # the test split reuses that vocabulary
n_features = X_train_dtm.shape[1]
print("Number of Features")
print(n_features)

Number of Features
6772


In [18]:
# Start building a Keras Sequential model.
# clear_session() resets Keras' global state before the new, empty model is created.

tf.keras.backend.clear_session()
model = tf.keras.Sequential()




In [19]:
# One-hot encode the star labels for the categorical cross-entropy loss.
# num_classes=5 matches the 5-unit softmax output and guarantees both splits get five
# columns even if a class happens to be absent from one of them.
# The ndim guards keep the cell safe to re-run: labels that are already a 2-D one-hot
# matrix are left untouched instead of being encoded a second time.
if np.ndim(y_train) == 1:
    y_train = tf.keras.utils.to_categorical(y_train, num_classes=5)
if np.ndim(y_test) == 1:
    y_test = tf.keras.utils.to_categorical(y_test, num_classes=5)

In [20]:
# Size the input from the vectorizer output instead of hard-coding the vocabulary size,
# so the network still matches the data if min_df, the n-gram range or the split changes.
n_features = X_train_dtm.shape[1]

# NOTE: model.add appends to the existing model, so re-running this cell without
# re-creating `model` first stacks the layers a second time.
model.add(tf.keras.layers.Reshape((n_features,), input_shape=(n_features,)))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(100, activation='relu'))
model.add(tf.keras.layers.Dense(10, activation='relu'))
# one output unit per star class
model.add(tf.keras.layers.Dense(5, activation='softmax'))

In [21]:
# list the layers that have been added so far
model.summary()

In [22]:
# Adam with its default settings
adam_op = tf.keras.optimizers.Adam()

# Recall() with default arguments thresholds each softmax output at 0.5, so on its own it
# only counts confident hits and is hard to compare with the accuracy figures of the
# earlier models. 'accuracy' is tracked alongside it; Recall is kept so the existing
# `recall` / `val_recall` log entries are still produced.
model.compile(
    optimizer=adam_op,
    loss='categorical_crossentropy',
    metrics=[tf.keras.metrics.Recall(), 'accuracy'],
)

In [23]:
# the sparse document-term matrices are converted to dense arrays before being fed to Keras
train_features = X_train_dtm.toarray()
val_features = X_test_dtm.toarray()

# NOTE(review): the test split doubles as validation data here, so the validation
# metrics are not an independent estimate - confirm this is intended
model.fit(
    train_features,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(val_features, y_test),
)

Epoch 1/10
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - loss: 1.7340 - recall: 0.0154 - val_loss: 1.5571 - val_recall: 0.0000e+00
Epoch 2/10
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 1.3338 - recall: 0.1300 - val_loss: 1.4803 - val_recall: 0.0100
Epoch 3/10
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 1.0764 - recall: 0.3202 - val_loss: 1.4377 - val_recall: 0.0684
Epoch 4/10
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.8438 - recall: 0.4201 - val_loss: 1.4040 - val_recall: 0.1680
Epoch 5/10
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.4736 - recall: 0.7689 - val_loss: 1.4199 - val_recall: 0.3732
Epoch 6/10
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.2278 - recall: 0.9102 - val_loss: 1.6126 - val_recall: 0.4280
Epoch 7/10
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x1fb19bac2d0>

## Including the Meta variables in the analysis

In [25]:
# helper that scores the sentiment of a piece of text with TextBlob

def detect_sentiment(text):
    """Return the TextBlob sentiment polarity of `text`."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

In [95]:
# create a new DataFrame column for sentiment
# .assign returns a new frame instead of writing into a filtered subset of `yelp`,
# which avoids SettingWithCopyWarning and keeps the cell idempotent on re-run
yelp_best_worst = yelp_best_worst.assign(
    sentiment=yelp_best_worst['text'].apply(detect_sentiment)
)
yelp_best_worst.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny,sentiment
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0,0.402469
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0,0.229773
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0,0.608646
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0,0.468125
6,zp713qNhx8d9KCJJnrw1xA,2010-02-12,riFQ3vxNpP4rWLk_CSri2A,5,Drop what you're doing and drive here. After I...,review,wFweIWhv2fREZV_dYkz_1g,7,7,4,0.300645


In [96]:
# review text plus the vote counts and the TextBlob sentiment score as features
feature_cols = ['text','cool','useful','funny','sentiment']
X = yelp_best_worst[feature_cols]
y = yelp_best_worst['stars']

# stratified split: both parts keep the same proportion of each star class
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

In [98]:
# shape of the four non-text feature columns (cool, useful, funny, sentiment)

X_train.drop('text', axis=1).shape

(3064, 4)

In [99]:
# numeric meta features of the training split, cast to float and stored as a sparse
# matrix so they can be stacked next to the text matrix
meta_train = X_train.drop(columns='text').astype(float)
extra = sp.sparse.csr_matrix(meta_train)
extra.shape

(3064, 4)

In [100]:
# TF-IDF weighted unigrams + bigrams; the vocabulary is learned from the training text only.
# fit_transform / transform already return scipy sparse matrices, so the previous
# csr_matrix(...) re-wrapping of their results was redundant and has been dropped.
vect = TfidfVectorizer(ngram_range=(1, 2),stop_words='english',lowercase=True, min_df= 10)
X_train_dtm = vect.fit_transform(X_train['text'])
X_test_dtm = vect.transform(X_test['text'])

print(X_train_dtm.shape)

(3064, 2800)


In [101]:
# combine sparse matrices: the text features come first, the meta feature columns are appended on the right
X_train_dtm_extra = sp.sparse.hstack([X_train_dtm, extra])
X_train_dtm_extra.shape

(3064, 2804)

In [102]:
# same steps for the testing set
# note: `extra` is rebound here and now holds the test-split meta features
meta_test = X_test.drop(columns='text').astype(float)
extra = sp.sparse.csr_matrix(meta_test)
X_test_dtm_extra = sp.sparse.hstack((X_test_dtm, extra))
X_test_dtm_extra.shape

(1022, 2804)

In [103]:
# baseline: logistic regression trained on the TF-IDF text features alone
# (C=1e9 makes the regularisation penalty negligible)
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train_dtm, y_train)
y_pred_class = logreg.predict(X_test_dtm)
text_only_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print(text_only_accuracy)

0.9285714285714286


In [104]:
# same model, now trained on the text features plus the four meta feature columns
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train_dtm_extra, y_train)
y_pred_class = logreg.predict(X_test_dtm_extra)
all_features_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print(all_features_accuracy)

0.9295499021526419
