#### CSC 180-01 Intelligent Systems (Fall 2022)
##### Group: Jonathan Jakab,  ,
##### California State University, Sacramento

# Project 1: Yelp Business Rating Prediction using Tensorflow

## Imports
Contains the imports for the applications. 

In [87]:
#You may use the following code to convert JSON data into a tabular format Pandas can read.
import os
import csv
import json
import shutil
import numpy as np
import pandas as pd

from collections.abc import Sequence

import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import sklearn.feature_extraction.text as sk_text

from collections.abc import Sequence

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping

## Helpful Functions for Tensorflow (from Labs)

The following functions will be used with TensorFlow to help preprocess the data.  They allow you to build the feature vector for a neural network. 

* Predictors/Inputs 
    * Fill any missing inputs with the median for that column.  Use **missing_median**.
    * Encode textual/categorical values with **encode_text_dummy**.
    * Encode numeric values with **encode_numeric_zscore**.
* Output
    * Discard rows with missing outputs.
    * Encode textual/categorical values with **encode_text_index**.
    * Do not encode output numeric values.
* Produce final feature vectors (x) and expected output (y) with **to_xy**.

In [77]:
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

    
# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

    
# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low    

## Input/Output Setup
* ***Checks directories for Data input and output***
* ***Reads in data from the review file in the data directory***

In [78]:
# Get current working directory
current_directory = os.getcwd()


# Sets up input file locations for the data input.
dataPath = "./data/"
dataDirectory = os.path.join(current_directory, dataPath)
inputFile_Review = os.path.join(dataPath,"yelp_academic_dataset_review.json")
inputFile_Business = os.path.join(dataPath,"yelp_academic_dataset_business.json")


# Sets up locations for the data output location.
savePath = "./output/"
saveDirectory = os.path.join(current_directory, savePath)

if os.path.exists(saveDirectory):
    outputFile_Review = os.path.join(savePath,"review_stars.tsv")
    outputFile_Business = os.path.join(savePath,"review_business.tsv")
else:
    os.makedirs(saveDirectory)
    outputFile_Review = os.path.join(savePath,"review_stars.tsv")
    outputFile_Business = os.path.join(savePath,"review_business.tsv")

## Data Processing
### Read in reviews from yelp_academic_dataset_review.JSON

In [58]:
# Opens output files and loads data from input file.
outfile = open(outputFile_Review, 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','stars', 'text'])

with open(inputFile_Review, encoding="utf-8") as f:
    for line in f:
        row = json.loads(line)
        # some special char must be encoded in 'utf-8'
        sfile.writerow([row['business_id'], row['stars'], (row['text']).encode('utf-8')])
outfile.close()

df_review = pd.read_csv(outputFile_Review, delimiter ="\t", encoding="utf-8")
df_review


Unnamed: 0,business_id,stars,text
0,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,"b""If you decide to eat here, just be aware it ..."
1,7ATYjTIgM3jUlt4UM3IypQ,5.0,"b""I've taken a lot of spin classes over the ye..."
2,YjUWPpI6HXG530lwP-fb2A,3.0,b'Family diner. Had the buffet. Eclectic assor...
3,kxX2SOes4o-D3ZQBkiMRfA,5.0,"b""Wow! Yummy, different, delicious. Our fa..."
4,e4Vwtrqf-wpJfwesgvdgxQ,4.0,"b""Cute interior and owner (?) gave us tour of ..."
...,...,...,...
6990275,jals67o91gcrD4DC81Vk6w,5.0,b'Latest addition to services from ICCU is App...
6990276,2vLksaMmSEcGbjI5gywpZA,5.0,"b""This spot offers a great, affordable east we..."
6990277,R1khUUxidqfaJmcpmGd4aw,4.0,"b""This Home Depot won me over when I needed to..."
6990278,Rr9kKArrMhSLVE9a53q-aA,5.0,b'For when I\'m feeling like ignoring my calor...


### Read in businesses from yelp_academic_dataset_business.JSON

In [79]:
# Opens output files and loads data from input file.
outfile = open(outputFile_Business, 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','name', 'stars', 'review_count', 'categories'])

with open(inputFile_Business, encoding="utf-8") as f:
    for line in f:
        row = json.loads(line)
        # some special char must be encoded in 'utf-8'
        sfile.writerow([row['business_id'], row['name'].encode('utf-8'), row['stars'], row['review_count'], (row['categories'])])
outfile.close()

df_business = pd.read_csv(outputFile_Business, delimiter ="\t", encoding="utf-8")
df_business

Unnamed: 0,business_id,name,stars,review_count,categories
0,Pns2l4eNsfO8kk83dixA6A,"b'Abby Rappoport, LAC, CMQ'",5.0,7,"Doctors, Traditional Chinese Medicine, Naturop..."
1,mpf3x-BjTdTEA3yCZrAYPw,b'The UPS Store',3.0,15,"Shipping Centers, Local Services, Notaries, Ma..."
2,tUFrWirKiKi_TAnsVWINQQ,b'Target',3.5,22,"Department Stores, Shopping, Fashion, Home & G..."
3,MTSW4McQd7CbVtyjqoe9mw,b'St Honore Pastries',4.0,80,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
4,mWMc6_wTdE0EUBKIGXDVfA,b'Perkiomen Valley Brewery',4.5,13,"Brewpubs, Breweries, Food"
...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,"b""Binh's Nails""",3.0,13,"Nail Salons, Beauty & Spas"
150342,c8GjPIOTGVmIemT7j5_SyQ,b'Wild Birds Unlimited',4.0,5,"Pets, Nurseries & Gardening, Pet Stores, Hobby..."
150343,_QAMST-NrQobXduilWEqSw,"b""Claire's Boutique""",3.5,8,"Shopping, Jewelry, Piercing, Toy Stores, Beaut..."
150344,mtGm22y5c2UHNXDFAjaPNw,b'Cyclery & Fitness Center',4.0,24,"Fitness/Exercise Equipment, Eyewear & Optician..."


### Clean up records. (Remove all businesses that have fewer then 20 reviews.)

In [80]:
df_business_cleaned = df_business[df_business["review_count"] >= 20]
df_business_cleaned

Unnamed: 0,business_id,name,stars,review_count,categories
2,tUFrWirKiKi_TAnsVWINQQ,b'Target',3.5,22,"Department Stores, Shopping, Fashion, Home & G..."
3,MTSW4McQd7CbVtyjqoe9mw,b'St Honore Pastries',4.0,80,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
12,il_Ro8jwPlHresjw9EGmBg,"b""Denny's""",2.5,28,"American (Traditional), Restaurants, Diners, B..."
14,0bPLkL0QhhPO5kt1_EXmNQ,"b""Zio's Italian Market""",4.5,100,"Food, Delis, Italian, Bakeries, Restaurants"
15,MUTTqe8uqyMdBl186RmNeA,b'Tuna Bar',4.0,245,"Sushi Bars, Restaurants, Japanese"
...,...,...,...,...,...
150330,GeEveoOaU2YKD7jJtEfA_g,b'DeVons Jewelers',5.0,34,"Shopping, Watches, Jewelry"
150331,qQ7FHvkGEMqoPKKXPk4gjA,b'La Quinta by Wyndham NW Tucson Marana',2.5,67,"Hotels & Travel, Hotels, Event Planning & Serv..."
150334,LJ4GjQ1HL6kqvIPpNUNNaQ,b'Shanti Yoga and Ayurveda',4.5,39,"Health & Medical, Yoga, Shopping, Naturopathic..."
150336,WnT9NIzQgLlILjPT0kEcsQ,b'Adelita Taqueria & Restaurant',4.5,35,"Restaurants, Mexican"


### Group reviews and businesses into a new Dataframe. (Reviews are aggregated together for each business entry.)

In [81]:
df_review_agg = df.groupby('business_id')['text'].sum()
df_review_agg_new = pd.DataFrame({'business_id': df_review_agg.index, 'all_reviews': df_review_agg.values})
df_review_agg_new

Unnamed: 0,business_id,all_reviews
0,---kPU91CF4Lq2-WlRu9Lw,"b""Ate here for the 1st time on Saturday 08/07/..."
1,--0iUa4sNDFiZFrAdIWhZQ,"b""Very good San Salvadorian place ! Authentic ..."
2,--30_8IhuyMHbSOcNWd6DQ,"b""We stopped going to Action Karate in Decembe..."
3,--7PUidqRWpRSpXebiyxTg,"b'This place is disgusting, and proof that Edm..."
4,--7jw19RH9JKXgFohspgQw,"b'This is the best dentist in the area, hands ..."
...,...,...
150341,zznZqH9CiAznbkV6fXyHWA,"b""Oh my gosh! Yum yum yum! I saw Que Pasta sta..."
150342,zztOG2cKm87I6Iw_tleZsQ,b'This course is exactly what I needed to get ...
150343,zzu6_r3DxBJuXcjnOYVdTw,"b""Probably the best Italian food on the Westba..."
150344,zzw66H6hVjXQEt0Js3Mo4A,b'Went to the Ballwin location this year. The...


### Merge tables
Merge the cleaned up business table with the aggregated reviews file to make one file that contains all the businesses and reviews.

In [82]:
df_ready_for_sklearn = pd.merge(df_business_cleaned, df_review_agg_new, on='business_id')
df_ready_for_sklearn

Unnamed: 0,business_id,name,stars,review_count,categories,all_reviews
0,tUFrWirKiKi_TAnsVWINQQ,b'Target',3.5,22,"Department Stores, Shopping, Fashion, Home & G...",b'We are fans of Target. They seem to have a ...
1,MTSW4McQd7CbVtyjqoe9mw,b'St Honore Pastries',4.0,80,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","b""This is nice little Chinese bakery in the he..."
2,il_Ro8jwPlHresjw9EGmBg,"b""Denny's""",2.5,28,"American (Traditional), Restaurants, Diners, B...",b'Went there at 4am and there was only one wai...
3,0bPLkL0QhhPO5kt1_EXmNQ,"b""Zio's Italian Market""",4.5,100,"Food, Delis, Italian, Bakeries, Restaurants","b""The worst Chicken Parm. Sandwich I've ever e..."
4,MUTTqe8uqyMdBl186RmNeA,b'Tuna Bar',4.0,245,"Sushi Bars, Restaurants, Japanese",b'Stopped in to check out this new spot around...
...,...,...,...,...,...,...
61914,GeEveoOaU2YKD7jJtEfA_g,b'DeVons Jewelers',5.0,34,"Shopping, Watches, Jewelry","b'After buying my wife her Christmas gift, I s..."
61915,qQ7FHvkGEMqoPKKXPk4gjA,b'La Quinta by Wyndham NW Tucson Marana',2.5,67,"Hotels & Travel, Hotels, Event Planning & Serv...",b'On my way back to San Diego from Atlanta. \n...
61916,LJ4GjQ1HL6kqvIPpNUNNaQ,b'Shanti Yoga and Ayurveda',4.5,39,"Health & Medical, Yoga, Shopping, Naturopathic...","b'(To the tune of ""The Greatest"" by Sia)\n\nUh..."
61917,WnT9NIzQgLlILjPT0kEcsQ,b'Adelita Taqueria & Restaurant',4.5,35,"Restaurants, Mexican",b'I do not know why Adelita is not packed. The...


### Save new data
Save the new dataframe we created to a file.

In [84]:
df_ready_for_sklearn.to_csv(saveDirectory+'business_review_stars.csv', index=False)

# ADD TFIDF Vectorzier, Split Data to train and Test, EarlyStop
# Report the RMSE, plot the lift chart of BEST neural network model.

## TF-IDF Vectorizer

* ***min_df: ignore terms that have a document frequency < min_df.***
* ***max_df: ignore terms that have a document frequency > max_df.***
* ***max_features:  build a vocabulary that only consider the top max_features features ordered by term frequency across the corpus.***

### Note: Im not sure what values to give mid_df, max_df, max_features.


In [None]:
vectorizer = sk_text.TfidfVectorizer(
                             #stop_words='english',
                            max_df=0.50,
                            min_df=10,
                            max_features = 3000)

matrix = vectorizer.fit_transform(df_ready_for_sklearn['all_reviews'])

print(type(matrix))               # Compressed Sparse Row matrix
tfidf_data = matrix.toarray()     #  convert it to numpy array
tfidf_data

## Split data into Training and Test Data.

In [None]:
path = "./data/"
    
filename = os.path.join(path,"iris.csv")    
df = pd.read_csv(filename,na_values=['NA','?'])


species = encode_text_index(df,"species")

x,y = to_xy(df,"species")

# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

model = Sequential()

model.add(Dense(10, input_dim=x.shape[1], activation='relu'))
model.add(Dense(5,activation='relu'))
model.add(Dense(y.shape[1],activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=2, verbose=2, mode='auto')  

# patience: number of epochs with no improvement after which training will be stopped

# The test set is checked during training to monitor progress for early stopping but is never used for gradient descent (model training)

model.fit(x_train, y_train, validation_data=(x_test,y_test), callbacks=[monitor], verbose=2, epochs=1000)  


## Plotting Lift Chart