In [None]:
# Import pandas for data handling
import pandas as pd

# NLTK is our Natural-Language-Took-Kit
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Libraries for helping us with strings
import string
# Regular Expression Library
import re

# Import our text vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# Import our classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier


# Import some ML helper function
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report



# Import our metrics to evaluate our model
from sklearn import metrics


# Library for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# You may need to download these from nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stopwords = stopwords.words('english')


### Current Question : Can we use sentiment analysis to accurately predict the stars left by a reviewer?
Are long reviews typically bad or good?


# Import Data

In [None]:
df_review = pd.read_json("/Users/jared/Coding/AWS/archive/yelp_academic_dataset_review.json", lines = True)
df_review.head()

In [None]:
df_review.columns

In [None]:
type(df_review['date'][0])

# Check for nulls and dupes

In [None]:
#Number of missing reviews
df_review["text"].isnull().sum()

In [None]:
df_review.duplicated().sum()

# Descriptive Statistics

In [None]:
#Descriptive Statistics of every numeric condition
df_review.describe().round(2)

In [None]:
#Number of Reviews
len(df_review["text"])

In [None]:
word_count = 0
for col in df_review["text"]:
    words = col.lower().replace("\n","").split(" ")
    word_count += len(words)
print(f'{word_count/len(df_review["text"])} is the average amount of words per review')


In [None]:
#Total Characters
count = 0
for col in df_review["text"]:
    col = col.replace("\n","") 
    count += len(col)

print(f'{count/len(df_review["text"])} is the average amount of characters per review')

# Feature Engineering

1. Lowercase all words
2. Remove all punctuation.
3. Remove all stopwords.
4. Stem words

## Clean up the reviews

In [84]:

#Lowercase all of the words in the review

def make_lower(a_string):
    return a_string.lower()

In [85]:

#Remove all punctuation

def remove_punctuation(a_string):
    a_string = re.sub(r'[^\w\s]','',a_string)
    return a_string

test = "'This is a sentence! 50 With lots of punctuation??? & other #things.'"
remove_punctuation(test)

'This is a sentence 50 With lots of punctuation  other things'

In [86]:

a_string = 'This is a sentence!  With lots of punctuation??? & other #things.'
words = word_tokenize(a_string)
words

['This',
 'is',
 'a',
 'sentence',
 '!',
 'With',
 'lots',
 'of',
 'punctuation',
 '?',
 '?',
 '?',
 '&',
 'other',
 '#',
 'things',
 '.']

In [87]:

#Remove all stop words

def remove_stopwords(a_string):
    # Break the sentence down into a list of words
    words = word_tokenize(a_string)

    # Make a list to append valid words into
    valid_words = []

    # Loop through all the words

    for word in words:

        # Check if word is not in stopwords
        if word not in stopwords:

            # If word not in stopwords, append to our valid_words
                valid_words.append(word)
    
    # Join the list of words together into a string
    a_string = ' '.join(valid_words)

    return a_string

a_sentence = 'This is a sentence! With some different stopwords i have'

remove_stopwords(a_sentence)

'This sentence ! With different stopwords'

# Building a text processing pipeline

In [90]:
def text_pipeline(input_string):
    input_string = make_lower(input_string)
    input_string = remove_punctuation(input_string)
    input_string = remove_stopwords(input_string)
    return input_string

# df_review['cleaned_text'] = df_review['text'].apply(text_pipeline)

# print(df_review['text'][0])
# print("-" * 50)
# print(df_review['cleaned_text'][0])

# Check the counts with the claned up text

In [None]:
word_count = 0
for col in df_review["cleaned_text"]:
    words = col.split(" ")
    word_count += len(words)
print(f'{word_count/len(df_review["cleaned_text"])} is the average amount of words per review')

In [None]:
#Total Characters
count = 0
for col in df_review["cleaned_text"]:
    count += len(col)

print(f'{count/len(df_review["cleaned_text"])} is the average amount of characters per review')

In [None]:
df_review.head()

# Splitting and Training

In [None]:
# Define our `X` and `y` data. 

X = df_review['cleaned_text'].values

y = df_review['stars'].values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.2, random_state=42)

X_train_text = X_train
X_test_text = X_test

# Initialize Vectorizer and fit

In [None]:
vectorizer = TfidfVectorizer()

vectorizer.fit(X_train)

X_train, X_test = vectorizer.transform(X_train),vectorizer.transform(X_test)

print(X_train.shape, type(X))

In [None]:
features = vectorizer.get_feature_names()
weights = vectorizer.idf_

print(len(features), len(weights))

df_idf = pd.DataFrame.from_dict( {'feature': features, 'idf': weights})

df_idf = df_idf.sort_values(by='idf', ascending=False)

df_idf

# Build and train our model

In [None]:
model = MultinomialNB(alpha = .05)

model.fit(X_train,y_train)

y_pred = model.predict(X_test)

y_pred_proba = model.predict_proba(X_test)

In [None]:
print(classification_report(y_test, y_pred))

# Joining the data together

In [None]:
# Useful for matching business_id, stars, and possibly (review_count)
# Calculate to see how my average compares to the main average
# Question: Does the location of your business influence the number of stars that your business gets?
# Hypothesis: Location does affect the number of stars that your business gets
df_business = pd.read_json("/Users/jared/Coding/AWS/archive/yelp_academic_dataset_business.json", lines = True)
df_business.head(5)

In [None]:
condition = df_business['categories'].str.contains('Food', case=False, na=False)
df_food = df_business[condition]
df_food

In [91]:
final_df = df_food.merge(df_review,how='inner',on='business_id')
final_df

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars_x,review_count,...,categories,hours,review_id,user_id,stars_y,useful,funny,cool,text,date
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",BXQcBN0iAi1lAUxibGLFzA,6_SpY41LIHZuIaiDs5FMKA,4,0,0,1,This is nice little Chinese bakery in the hear...,2014-05-26 01:09:53
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",uduvUCvi9w3T2bSGivCfXg,tCXElwhzekJEH6QJe3xs7Q,4,3,1,2,This is the bakery I usually go to in Chinatow...,2013-10-05 15:19:06
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",a0vwPOqDXXZuJkbBW2356g,WqfKtI-aGMmvbA9pPUxNQQ,5,0,0,0,"A delightful find in Chinatown! Very clean, an...",2013-10-25 01:34:57
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",MKNp_CdR2k2202-c8GN5Dw,3-1va0IQfK-9tUMzfHWfTA,5,5,0,5,I ordered a graduation cake for my niece and i...,2018-05-20 17:58:57
4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",D1GisLDPe84Rrk_R4X2brQ,EouCKoDfzaVG0klEgdDvCQ,4,2,1,1,HK-STYLE MILK TEA: FOUR STARS\n\nNot quite su...,2013-10-25 02:31:35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2556120,2O2K6SXPWv56amqxCECd4w,The Plum Pit,4405 Pennell Rd,Aston,DE,19014,39.856185,-75.427725,4.5,14,...,"Restaurants, Comfort Food, Food, Food Trucks, ...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",Kt3gFeW1rhZz7RuiV-6Tcw,eWz12w7dzYlfrGnhTQ82Fg,5,0,0,0,This is my favorite food truck! I only wish I ...,2019-07-14 14:25:35
2556121,2O2K6SXPWv56amqxCECd4w,The Plum Pit,4405 Pennell Rd,Aston,DE,19014,39.856185,-75.427725,4.5,14,...,"Restaurants, Comfort Food, Food, Food Trucks, ...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",ruy3Ycey_gGbwkE_3TX1Fg,lDyhGApbGZ0_BoeJzRQq7g,5,1,0,1,This food truck was stupid. Stupidly delicious...,2021-06-25 23:22:26
2556122,2O2K6SXPWv56amqxCECd4w,The Plum Pit,4405 Pennell Rd,Aston,DE,19014,39.856185,-75.427725,4.5,14,...,"Restaurants, Comfort Food, Food, Food Trucks, ...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",C_l8NTpvNOEUorEmEOusaA,-TTJ75--0NEAjvFCOV7rBg,5,0,0,0,Bubba never disappoints i go to his fb page an...,2016-12-09 21:38:05
2556123,2O2K6SXPWv56amqxCECd4w,The Plum Pit,4405 Pennell Rd,Aston,DE,19014,39.856185,-75.427725,4.5,14,...,"Restaurants, Comfort Food, Food, Food Trucks, ...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",q39JOIkHmIhdmYnjEhZCdQ,8yFNNU7UmQcfzmcTvzTlOA,1,0,0,0,The truck was invited to our office for a part...,2020-02-19 22:59:06


In [92]:
final_df.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars_x', 'review_count', 'is_open',
       'attributes', 'categories', 'hours', 'review_id', 'user_id', 'stars_y',
       'useful', 'funny', 'cool', 'text', 'date'],
      dtype='object')

In [94]:
final_df.drop(columns = ['name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude'])

Unnamed: 0,business_id,stars_x,review_count,is_open,attributes,categories,hours,review_id,user_id,stars_y,useful,funny,cool,text,date
0,MTSW4McQd7CbVtyjqoe9mw,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",BXQcBN0iAi1lAUxibGLFzA,6_SpY41LIHZuIaiDs5FMKA,4,0,0,1,This is nice little Chinese bakery in the hear...,2014-05-26 01:09:53
1,MTSW4McQd7CbVtyjqoe9mw,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",uduvUCvi9w3T2bSGivCfXg,tCXElwhzekJEH6QJe3xs7Q,4,3,1,2,This is the bakery I usually go to in Chinatow...,2013-10-05 15:19:06
2,MTSW4McQd7CbVtyjqoe9mw,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",a0vwPOqDXXZuJkbBW2356g,WqfKtI-aGMmvbA9pPUxNQQ,5,0,0,0,"A delightful find in Chinatown! Very clean, an...",2013-10-25 01:34:57
3,MTSW4McQd7CbVtyjqoe9mw,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",MKNp_CdR2k2202-c8GN5Dw,3-1va0IQfK-9tUMzfHWfTA,5,5,0,5,I ordered a graduation cake for my niece and i...,2018-05-20 17:58:57
4,MTSW4McQd7CbVtyjqoe9mw,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",D1GisLDPe84Rrk_R4X2brQ,EouCKoDfzaVG0klEgdDvCQ,4,2,1,1,HK-STYLE MILK TEA: FOUR STARS\n\nNot quite su...,2013-10-25 02:31:35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2556120,2O2K6SXPWv56amqxCECd4w,4.5,14,1,"{'RestaurantsDelivery': 'False', 'BusinessAcce...","Restaurants, Comfort Food, Food, Food Trucks, ...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",Kt3gFeW1rhZz7RuiV-6Tcw,eWz12w7dzYlfrGnhTQ82Fg,5,0,0,0,This is my favorite food truck! I only wish I ...,2019-07-14 14:25:35
2556121,2O2K6SXPWv56amqxCECd4w,4.5,14,1,"{'RestaurantsDelivery': 'False', 'BusinessAcce...","Restaurants, Comfort Food, Food, Food Trucks, ...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",ruy3Ycey_gGbwkE_3TX1Fg,lDyhGApbGZ0_BoeJzRQq7g,5,1,0,1,This food truck was stupid. Stupidly delicious...,2021-06-25 23:22:26
2556122,2O2K6SXPWv56amqxCECd4w,4.5,14,1,"{'RestaurantsDelivery': 'False', 'BusinessAcce...","Restaurants, Comfort Food, Food, Food Trucks, ...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",C_l8NTpvNOEUorEmEOusaA,-TTJ75--0NEAjvFCOV7rBg,5,0,0,0,Bubba never disappoints i go to his fb page an...,2016-12-09 21:38:05
2556123,2O2K6SXPWv56amqxCECd4w,4.5,14,1,"{'RestaurantsDelivery': 'False', 'BusinessAcce...","Restaurants, Comfort Food, Food, Food Trucks, ...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",q39JOIkHmIhdmYnjEhZCdQ,8yFNNU7UmQcfzmcTvzTlOA,1,0,0,0,The truck was invited to our office for a part...,2020-02-19 22:59:06


# Retrain only on Restaurants

In [None]:
final_df['cleaned_text'] = final_df['text'].apply(text_pipeline) #Reclean all of the text

In [96]:
print(final_df['text'][0])
print("-" * 50)
print(final_df['cleaned_text'][0])

This is nice little Chinese bakery in the heart of Philadelphia's Chinatown! The female cashier was very friendly (flirtatious!) and the pastries shown in nicely adorned display cases. I stopped by early one evening had a sesame ball, which was filled with bean paste. The glutinous rice of the ball was nicely flavored, similar to Bai Tang Gao. Definitely as place worth stopping at if you are in the area.
--------------------------------------------------
nice little chinese bakery heart philadelphias chinatown female cashier friendly flirtatious pastries shown nicely adorned display cases stopped early one evening sesame ball filled bean paste glutinous rice ball nicely flavored similar bai tang gao definitely place worth stopping area


In [97]:
# Define our `X` and `y` data. 

X = final_df['cleaned_text'].values

y = final_df['stars_y'].values

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.2, random_state=42)

X_train_text = X_train
X_test_text = X_test

In [99]:
vectorizer = TfidfVectorizer()

vectorizer.fit(X_train)

X_train, X_test = vectorizer.transform(X_train),vectorizer.transform(X_test)

print(X_train.shape, type(X))

(2044900, 638242) <class 'numpy.ndarray'>


In [100]:
features = vectorizer.get_feature_names()
weights = vectorizer.idf_

print(len(features), len(weights))

df_idf = pd.DataFrame.from_dict( {'feature': features, 'idf': weights})

df_idf = df_idf.sort_values(by='idf', ascending=False)

df_idf

638242 638242




Unnamed: 0,feature,idf
319121,kompachi,14.837713
382517,neveryielding,14.837713
382537,new25,14.837713
382535,nevy,14.837713
382533,nevvvvver,14.837713
...,...,...
502726,service,2.315301
256875,great,2.131068
434732,place,2.055655
252000,good,2.042675


In [101]:
model = MultinomialNB(alpha = .05)

model.fit(X_train,y_train)

y_pred = model.predict(X_test)

y_pred_proba = model.predict_proba(X_test)

In [102]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.64      0.75      0.69     61876
           2       0.42      0.14      0.20     40514
           3       0.41      0.14      0.21     54986
           4       0.43      0.26      0.32    117792
           5       0.66      0.93      0.77    236057

    accuracy                           0.61    511225
   macro avg       0.51      0.44      0.44    511225
weighted avg       0.56      0.61      0.55    511225

