In [380]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json

In [381]:
# reads in the json file, only to the max entries and returns them as json_array, if max entries is set to 0 then it reads the full thing
def read_partial_json_file(filename, max_entries=0, encoding='utf-8'):
    json_array = []
    with open(filename, 'r', encoding=encoding) as file:
        if max_entries == 0:
            for line in file:
                json_array.append(json.loads(line))
        else:
            for _ in range(max_entries):
                line = file.readline()
                if not line:
                    break
                json_array.append(json.loads(line))
    return json_array

def add_missing_keys(json_array):
    for obj in json_array:
        for key in ['stars', 'useful', 'funny', 'cool', 'text']:
            if key not in obj:
                obj[key] = 0
                print("Key {} not found in json".format(key))
    return json_array

# removes specified keys from json array
def remove_keys(json_array, keys_to_remove):
    for obj in json_array:
        for key in keys_to_remove:
            obj.pop(key, None)
    return json_array

def ConvertJSONFileToDataFrame(filename, max_entries=1000, encoding='utf-8'):
    #load in the json array
    json_array = read_partial_json_file(filename, max_entries, encoding)
    #add in the missing keys, will set to 0 for now but a heuristic for this will have to be made.
    json_array = add_missing_keys(json_array)
    df = pd.DataFrame(json_array)
    ColumnsToRemove = ['business_id', 'user_id', 'date', 'review_id']
    df = df.drop(columns=ColumnsToRemove)
    return df

In [382]:

filename = 'yelp_academic_dataset_review.json'
df = ConvertJSONFileToDataFrame(filename)
df.head(10)

Unnamed: 0,stars,useful,funny,cool,text
0,3.0,0,0,0,"If you decide to eat here, just be aware it is..."
1,5.0,1,0,1,I've taken a lot of spin classes over the year...
2,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...
3,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo..."
4,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...
5,1.0,1,2,1,I am a long term frequent customer of this est...
6,5.0,0,2,0,Loved this tour! I grabbed a groupon and the p...
7,5.0,2,0,0,Amazingly amazing wings and homemade bleu chee...
8,3.0,1,1,0,This easter instead of going to Lopez Lake we ...
9,3.0,0,0,0,Had a party of 6 here for hibachi. Our waitres...


In [383]:
train, test = train_test_split(df, test_size=0.1, random_state=89, shuffle=True)

In [384]:
df = train.groupby(by='stars').agg('count').reset_index()

In [385]:
df

Unnamed: 0,stars,useful,funny,cool,text
0,1.0,99,99,99,99
1,2.0,70,70,70,70
2,3.0,116,116,116,116
3,4.0,210,210,210,210
4,5.0,405,405,405,405


In [386]:
oneCount = int(df.loc[df['stars']==1.0]['text'].astype(int))

  oneCount = int(df.loc[df['stars']==1.0]['text'].astype(int))


In [387]:
twoCount = int(df.loc[df['stars']==2.0]['text'].astype(int))
threeCount = int(df.loc[df['stars']==3.0]['text'].astype(int))
fourCount = int(df.loc[df['stars']==4.0]['text'].astype(int))
fiveCount = int(df.loc[df['stars']==5.0]['text'].astype(int))

  twoCount = int(df.loc[df['stars']==2.0]['text'].astype(int))
  threeCount = int(df.loc[df['stars']==3.0]['text'].astype(int))
  fourCount = int(df.loc[df['stars']==4.0]['text'].astype(int))
  fiveCount = int(df.loc[df['stars']==5.0]['text'].astype(int))


In [388]:
# Probability distribution
totalCount = oneCount + twoCount + threeCount + fourCount + fiveCount
p_catagory = {'1.0':oneCount/totalCount,'2.0':twoCount/totalCount,'3.0':threeCount/totalCount,'4.0':fourCount/totalCount,'5.0':fiveCount/totalCount}
ps_catagory = [float(oneCount/totalCount),twoCount/totalCount,threeCount/totalCount,fourCount/totalCount,fiveCount/totalCount]

In [389]:
ps_catagory

[0.11, 0.07777777777777778, 0.1288888888888889, 0.23333333333333334, 0.45]

In [390]:
bigDict = [{},{},{},{},{}]
allwords = {}

In [391]:
for row_index, row in train.iterrows():
    c = int(row["stars"])-1
    if(c == -1):
        print("WARNING!")
    content = row['text']
    for word in content.lower().split():
        if word in bigDict[c]:
            bigDict[c][word] += 1.0
        else:
            bigDict[c][word] = 1.0
        if word not in allwords:
            allwords[word] = 1.0
        else:
            allwords[word] += 1.0


for word in allwords.keys():
    for dict in bigDict:
        if word in dict:
            dict[word] += 1.0
        else:
            dict[word] = 1.0

In [392]:
# Discard all 1 mentions
allwordssorted = {k: v for k, v in sorted(allwords.items(), key=lambda item: item[1])}

In [393]:
from math import floor
threshold = list(allwordssorted.keys())[floor(len(allwordssorted)*0.98)]

In [394]:
toDelete = []
for word in allwordssorted.keys():
    if allwordssorted[word] <= 1.0:
        toDelete.append(word)
    elif allwordssorted[word] > allwordssorted[threshold]:
        toDelete.append(word)

for word in toDelete:
    del(allwordssorted[word])
    for cat in bigDict:
        del(cat[word])
    del(allwords[word])

In [395]:
# Normalize
for dict in bigDict:
    totalCount = 0
    for word in dict:
        totalCount += dict[word]
    for word in dict:
        dict[word] = float(dict[word]) / (float(totalCount)+len(allwords))

In [396]:
def median(nums):
    nums = sorted(nums)
    middle1 = (len(nums) - 1) // 2
    middle2 = len(nums) // 2
    return (nums[middle1] + nums[middle2]) / 2

In [397]:
medianVector = []
for dict in bigDict:
    medianVector.append(median(dict.values()))

In [398]:
#medianVector

In [399]:
predictionSet = []
# Build Train Model
for row_index, row in test.iterrows():
    actual = row["stars"]
    content = row['text']
    p_chance = ps_catagory.copy()
    for w in content.lower().split():
        for i in range(5):
            if w in bigDict[i]:
                p_chance[i] = float(p_chance[i] * (bigDict[i][w]))
    pred = p_chance.index(max(p_chance))
    if(pred == 0):
        predictionSet.append(1.0)
    elif(pred==1):
        predictionSet.append(2.0)
    elif(pred==2):
        predictionSet.append(3.0)
    elif(pred==3):
        predictionSet.append(4.0)
    elif(pred==4):
        predictionSet.append(5.0)
        

In [400]:
test['stars']

746    5.0
36     5.0
786    5.0
607    2.0
108    4.0
      ... 
169    5.0
543    5.0
839    5.0
683    4.0
322    5.0
Name: stars, Length: 100, dtype: float64

In [516]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score, mean_squared_error

In [402]:
print(classification_report(test['stars'], predictionSet))

              precision    recall  f1-score   support

         1.0       0.60      0.46      0.52        13
         2.0       0.00      0.00      0.00         4
         3.0       0.33      0.10      0.15        10
         4.0       0.23      0.17      0.19        18
         5.0       0.64      0.84      0.72        55

    accuracy                           0.56       100
   macro avg       0.36      0.31      0.32       100
weighted avg       0.50      0.56      0.52       100



In [504]:
from sklearn import linear_model
from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.preprocessing import  OneHotEncoder

In [472]:
#vectorizer = TfidfVectorizer(sublinear_tf=True)
#X = vectorizer.fit_transform(train['text'])

In [473]:
#vectorizer.vocabulary_

{'thoroughly': 6992,
 'enjoyed': 2437,
 'our': 4833,
 'private': 5356,
 'tour': 7119,
 'today': 7066,
 'with': 7689,
 'carrie': 1214,
 'it': 3705,
 'was': 7551,
 'very': 7445,
 'rainy': 5499,
 'so': 6356,
 'not': 4687,
 'too': 7093,
 'many': 4226,
 'visitors': 7483,
 'her': 3343,
 'knowledge': 3876,
 'and': 370,
 'passion': 4986,
 'about': 154,
 'the': 6955,
 'civil': 1439,
 'war': 7539,
 'captivating': 1182,
 'battle': 712,
 'of': 4740,
 'franklin': 2900,
 'deserves': 2054,
 'such': 6699,
 'memorial': 4339,
 'there': 6966,
 'is': 3696,
 'great': 3148,
 'selection': 6067,
 'whiskey': 7638,
 'bourbon': 941,
 'to': 7062,
 'suit': 6716,
 'anyone': 414,
 'taste': 6861,
 'we': 7583,
 'had': 3224,
 'burgers': 1084,
 'fried': 2936,
 'egg': 2367,
 'on': 4775,
 'soo': 6398,
 'good': 3101,
 'probably': 5360,
 'one': 4777,
 'best': 788,
 've': 7412,
 'ever': 2518,
 'duck': 2292,
 'fries': 2941,
 'are': 472,
 'must': 4552,
 'if': 3522,
 'you': 7793,
 'come': 1553,
 'here': 3347,
 'cute': 1902,
 'a

In [495]:
#count_vec_pipeline = Pipeline([
#    ('one_hot', OneHotEncoder()),
#    ('count_vectorizer', ColumnTransformer([
#        ('name', CountVectorizer(stop_words='english'), 'name')
#    ])),
#    ('linear_regression', LinearRegression())
#])

lengther = len(train['cool'])
targetVectorCool = [0] * lengther
targetVectorUseful = [0] * lengther
targetVectorFunny = [0] * lengther
for i in range(lengther):
    targetVectorCool[i] = train.iloc[i]['cool']
    targetVectorUseful[i] = train.iloc[i]['useful']
    targetVectorFunny[i] = train.iloc[i]['funny']

In [498]:
vectorizer = CountVectorizer(stop_words='english',  ngram_range=(1, 1), dtype='double')

In [499]:
data = vectorizer.fit_transform(train['text'])

In [501]:
vectors = data.copy()

In [505]:
pca = decomposition.TruncatedSVD(n_components=50)

In [506]:
data = pca.fit_transform(data)

In [520]:
regressionCool = linear_model.BayesianRidge()
regressionCool.fit(data,targetVectorCool)
predictCool = regressionCool.predict(pca.fit_transform(vectorizer.fit_transform(test['text'])))
mean_squared_error(test['cool'], predictCool)

0.6703749179409435

In [521]:
regressionUseful = linear_model.BayesianRidge()
regressionUseful.fit(data,targetVectorUseful)
predictUseful = regressionCool.predict(pca.fit_transform(vectorizer.fit_transform(test['text'])))
mean_squared_error(test['useful'], predictUseful)

3.680406002044074

In [522]:
regressionFunny = linear_model.BayesianRidge()
regressionFunny.fit(data,targetVectorFunny)
predictFunny = regressionCool.predict(pca.fit_transform(vectorizer.fit_transform(test['text'])))
mean_squared_error(test['funny'], predictFunny)

1.1300058002650268

In [512]:
predictCool = regressionCool.predict(pca.fit_transform(vectorizer.fit_transform(test['text'])))

In [518]:
mean_squared_error(test['cool'], predictCool)

0.6536732265718171

In [404]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score

In [252]:
# tfidf = TfidfVectorizer(sublinear_tf = True, max_features = 30, max_df = 0.38)
# sublinear_tf = True?

In [253]:
# vectorizer = tfidf.fit_transform(train['text'])

In [254]:
# pd.DataFrame(vectorizer.toarray(), columns=tfidf.get_feature_names_out())

Unnamed: 0,all,are,as,at,be,food,from,get,good,great,...,service,so,their,there,time,very,we,were,would,you
0,0.000000,0.000000,0.000000,0.204379,0.000000,0.000000,0.235566,0.000000,0.194095,0.000000,...,0.217618,0.000000,0.000000,0.000000,0.000000,0.000000,0.414511,0.000000,0.242017,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.567199,0.000000,0.000000,0.585857,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.282872,0.544691,0.000000,0.442673,0.000000,0.240387,0.000000,0.000000,0.000000,0.245320,...,0.000000,0.000000,0.319992,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.250463
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.592565,0.000000,0.347101,0.000000,0.000000,...,0.326994,0.000000,0.000000,0.323166,0.000000,0.309944,0.000000,0.000000,0.000000,0.000000
4,0.308100,0.000000,0.000000,0.482153,0.484819,0.000000,0.328220,0.000000,0.000000,0.267199,...,0.000000,0.272800,0.000000,0.000000,0.000000,0.287402,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0.000000,0.570111,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
796,0.425793,0.390685,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.218096,...,0.000000,0.222668,0.000000,0.414132,0.000000,0.234586,0.380333,0.235458,0.000000,0.000000
797,0.000000,0.000000,0.313156,0.438851,0.000000,0.000000,0.328201,0.000000,0.000000,0.267184,...,0.179072,0.161111,0.000000,0.000000,0.189676,0.356207,0.000000,0.357532,0.000000,0.000000
798,0.249519,0.000000,0.000000,0.390479,0.000000,0.212043,0.000000,0.441337,0.000000,0.216395,...,0.245562,0.000000,0.000000,0.000000,0.000000,0.000000,0.581591,0.233623,0.000000,0.000000


In [276]:
# train[['stars','cool']]

Unnamed: 0,stars,cool
349,2.0,0
499,5.0,0
467,4.0,1
293,5.0,0
580,1.0,0
...,...,...
893,3.0,0
968,5.0,0
295,4.0,0
666,2.0,0


In [256]:
# count_vector = CountVectorizer(max_features = 60, max_df = 0.6,stop_words = 'english')
# countVect = count_vector.fit_transform(train['text'])
# pd.DataFrame(countVect.toarray(), columns=count_vector.get_feature_names_out())

Unnamed: 0,amazing,area,bar,best,better,burger,came,cheese,chicken,come,...,table,time,told,try,ve,wait,want,wasn,way,went
0,0,0,1,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,1,2,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
796,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
797,0,0,2,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0


In [277]:
# model_tfidf = make_pipeline(TfidfVectorizer(), MultinomialNB())
# model_count = make_pipeline(CountVectorizer(max_features = 60, max_df = 0.6,stop_words = 'english'), MultinomialNB())

In [278]:
# model_tfidf.fit(train['text'].head(20), train['stars'].head(20))

ValueError: y should be a 1d array, got an array of shape (20, 2) instead.

In [271]:
# model_tfidf.predict(train['text'].head(20))

array([4., 5., 4., 5., 5., 4., 4., 3., 4., 3., 5., 5., 4., 5., 5., 3., 5.,
       5., 3., 4.])

In [272]:
# train['stars'].head(20)

349    2.0
499    5.0
467    4.0
293    5.0
580    1.0
896    4.0
12     4.0
459    3.0
277    4.0
237    3.0
156    5.0
483    5.0
537    4.0
73     1.0
474    5.0
586    3.0
121    5.0
407    5.0
943    3.0
4      4.0
Name: stars, dtype: float64

In [280]:
# model_tfidf.fit(train['text'], train[['stars','cool']])
# model_count.fit(train['text'], train['stars'])

ValueError: y should be a 1d array, got an array of shape (800, 2) instead.

In [282]:
# train['useful'].head(10)

349    4
499    1
467    3
293    1
580    1
896    0
12     0
459    1
277    0
237    0
Name: useful, dtype: int64

In [263]:
# y_pred_tfidf = model_tfidf.predict(test['text'])
# y_pred_count = model_count.predict(test['text'])

In [264]:
# y_pred_tfidf

array([5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.])

In [265]:
# f1 = f1_score(test['stars'], y_pred_count, average='weighted')
# accuracy = accuracy_score(test['stars'], y_pred_count)
# print('Multinomial Naive Bayes with Word Count:')
# print('-' * 40)
# print(f'f1: {f1:.4f}')
# print(f'accuracy: {accuracy:.4f}')

Multinomial Naive Bayes with Word Count:
----------------------------------------
f1: 0.4682
accuracy: 0.4850


In [266]:
# print(classification_report(test['stars'], y_pred_tfidf))

              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00        27
         2.0       0.00      0.00      0.00        11
         3.0       0.00      0.00      0.00        22
         4.0       0.00      0.00      0.00        39
         5.0       0.51      1.00      0.67       101

    accuracy                           0.51       200
   macro avg       0.10      0.20      0.13       200
weighted avg       0.26      0.51      0.34       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [267]:
# print(classification_report(test['stars'], y_pred_count))

              precision    recall  f1-score   support

         1.0       0.43      0.37      0.40        27
         2.0       0.08      0.09      0.09        11
         3.0       0.12      0.09      0.11        22
         4.0       0.27      0.23      0.25        39
         5.0       0.65      0.74      0.69       101

    accuracy                           0.48       200
   macro avg       0.31      0.31      0.31       200
weighted avg       0.46      0.48      0.47       200



In [103]:
# print("Accuracy: {}".format(accuracy_score(test['stars'], predicted_categories)))

Accuracy: 0.51


In [91]:
# test['stars']

507    4.0
818    3.0
452    5.0
368    4.0
242    5.0
      ... 
671    5.0
559    4.0
593    3.0
258    3.0
154    1.0
Name: stars, Length: 100, dtype: float64