In [1]:
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_curve

In [2]:
#connecting to folder with review files
path = os.getcwd()
google_folder = os.listdir(path+"/Google_Reviews")
yelp_folder = os.listdir(path+"/Yelp_Reviews")


In [3]:
google_folder.remove('.ipynb_checkpoints') #removing checkpoint file

#This list comprehension reads all the csv files and creates a list of pandas dataframes.
#This list is then combined.
google_reviews = pd.concat([pd.read_csv(path + "/Google_Reviews/" + f) for f in google_folder])


In [4]:
#same process as above
yelp_folder.remove('.ipynb_checkpoints')
yelp_reviews = pd.concat([pd.read_csv(path + "/Yelp_Reviews/" + f) for f in yelp_folder])

In [42]:
yelp_reviews.to_csv('Written_Yelp_Reviews.csv')

In [5]:
#Creating categories of sentiments.
#Each review had an associated rating. I decided that ratings 4, 5 were considered good.
#Ratings that were 1,2, or 3 were classified as bad ratings.

def sentiment(review_rating):
    if review_rating > 3:
        return('Good')
    elif review_rating <= 3:
        return('Bad')

In [6]:
#creating a sentiment column using the above function
google_reviews['sentiment'] = google_reviews['rating'].apply(sentiment)

In [7]:
#Observation: There are a lot more good reviews than bad. I would have assumed the opposite.
google_reviews.sentiment.value_counts()

Good    306
Bad      54
Name: sentiment, dtype: int64

In [8]:
#creating sentiment column for yelp reviews
yelp_reviews['sentiment'] = yelp_reviews['rating'].apply(sentiment)

In [9]:
yelp_reviews.sentiment.value_counts()

Good    346
Bad     184
Name: sentiment, dtype: int64

In [12]:
#removing NA reviews. I am not sure why the google API has NA reviews
google_reviews = google_reviews[google_reviews['text'].notna()]

In [13]:
#reseting index after combining dataframes. Allows for easier evaluation of model.
yelp_reviews = yelp_reviews.reset_index()
google_reviews = google_reviews.reset_index()

In [14]:
#I will be using Yelp reviews as my training data. Yelp has more reviews and a more even split between good/bad reviews.
#Google reviews are my test set to see how accurate the model truely is.

X_train = yelp_reviews.text # feature matrix
y_train = yelp_reviews.sentiment #target vector
X_test = google_reviews.text #test 
y_test = google_reviews.sentiment #test

In [15]:
#using a count vectorizer
#this is an easy way to tokenize the words, remove stop words and teach the computer vocabulary

vect = CountVectorizer(stop_words='english',max_features=1000,min_df=10)
vect.fit(X_train) #fitting the countvectorizer 

CountVectorizer(max_features=1000, min_df=10, stop_words='english')

In [16]:
X_train_dtm = vect.transform(X_train) #transforms the data into vector
X_test_dtm = vect.transform(X_test)

In [17]:
model = MultinomialNB() #initializing a Niave Bayes model
model.fit(X_train_dtm, y_train) #fitting the model with a Yelp review matrix

MultinomialNB()

In [19]:
y_test_pred = model.predict(X_test_dtm) #using google reviews to create a prediction matrix

In [20]:
confusion_matrix(y_test, y_test_pred) 

array([[ 23,  30],
       [ 25, 267]])

In [21]:
accuracy_score(y_test, y_test_pred)

0.8405797101449275

In [22]:
X_test[(y_test=='Good') & (y_test_pred=='Bad')].head(5)

27    4 Stars because customer service was great. Th...
33    I went here to try it out in the middle of the...
36    Their 300th review! I very much enjoyed the Th...
44    Their tequila tasting choices could use some w...
57    They were busy, like - I had to wait outside b...
Name: text, dtype: object

In [23]:
X_test[27] #Example of a review that was said to be bad but was actually good.

'4 Stars because customer service was great. The guy that took my order was friendly, chipper, went out of his way to be nice. The pad thai: Left something to be desired. Not great, tasted like jarred pad thai sauce. Will give another chance because of the great customer service.'

In [24]:
X_test[(y_test=='Bad') & (y_test_pred=='Good')].head(5)

2     Truly awful. i dont leave bad reviews like som...
6     Coffee was weak and definitely not worth the p...
11    Nice ambiance with outdoor seating. Service wa...
21    I'll give it three stars because the service w...
28    Visited on 08/02/21 at about 1:30pm and will N...
Name: text, dtype: object

In [25]:
X_test[2] #example of a review that was classified as good but is actually bad.

'Truly awful. i dont leave bad reviews like some other reviewers have said but this restaurant isnt even trying. Pad thai was soggy overcooked noodles green in color flavored with some curry powder red bell peppers and mostly beansprouts. No eggs no pickled ginger no peanuts no tofu. I dont think hes ever tasted pad thai before'

In [27]:
model.classes_ #getting correct index so I can make a dataframe of words

array(['Bad', 'Good'], dtype='<U4')

In [32]:
words = vect.get_feature_names()
Good = model.feature_count_[0,:]
Bad = model.feature_count_[1,:]
review_word_predict = pd.DataFrame({'word' : words, 'Good' : Good, 'Bad' : Bad})
review_word_predict.head(5)
#This dataframe shows counts of how many times the word was included in a review that was classified as good or bad.

Unnamed: 0,word,Good,Bad
0,10,4.0,8.0
1,absolutely,2.0,8.0
2,amazing,3.0,23.0
3,atmosphere,1.0,14.0
4,awesome,1.0,13.0


In [37]:
#sorted by good words.
review_word_predict.sort_values(by='Good', ascending=False).head(10)

Unnamed: 0,word,Good,Bad
47,food,53.0,93.0
52,good,27.0,71.0
78,place,27.0,82.0
92,service,27.0,46.0
74,ordered,22.0,13.0
106,time,21.0,21.0
73,order,21.0,15.0
56,just,19.0,22.0
36,don,19.0,15.0
53,got,18.0,21.0


In [38]:
#sorted by bad words
review_word_predict.sort_values(by='Bad', ascending=False).head(10)

Unnamed: 0,word,Good,Bad
54,great,14.0,97.0
47,food,53.0,93.0
78,place,27.0,82.0
52,good,27.0,71.0
68,missoula,15.0,53.0
92,service,27.0,46.0
9,best,5.0,42.0
62,love,7.0,34.0
95,staff,4.0,32.0
49,friendly,5.0,31.0
