# Setting up businesses database and filtering by the review count

In [None]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import Sequential

# Load the raw business table (one JSON object per line).
bussinesses = pd.read_json('yelp_academic_dataset_business.json', lines=True)

# Keep only businesses with more than 20 reviews, draw a reproducible
# 10k-row sample, and retain just the columns used downstream.
bus_filtered = (
    bussinesses[bussinesses['review_count'] > 20]
    .sample(n=10000, random_state=42)
    [['business_id', 'name', 'stars', 'review_count']]
)



# Setting up review database and filtering it based on the filtered businesses


In [None]:

# Load the raw review table (one JSON object per line).
reviews = pd.read_json('yelp_academic_dataset_review.json', lines=True)

# Keep only reviews that belong to one of the sampled businesses,
# then narrow the frame to the columns used downstream.
belongs_to_sample = reviews['business_id'].isin(bus_filtered['business_id'])
rev_filtered = reviews[belongs_to_sample]
rev_filtered = rev_filtered[['review_id', 'business_id', 'text']]

# Persist both filtered tables as line-delimited JSON records.
for frame, out_path in ((rev_filtered, 'filtered_reviews.json'),
                        (bus_filtered, "filtered_businesses.json")):
    frame.to_json(out_path, orient='records', lines=True)


# Load the json files

In [None]:
 # Reload the filtered tables from line-delimited JSON.
 # NOTE(review): the save step writes 'filtered_reviews.json' and
 # 'filtered_businesses.json' without a directory prefix, while this cell
 # reads them from 'content/' - confirm the files really live there.
 rev_filtered = pd.read_json('content/filtered_reviews.json', lines=True)
 bus_filtered = pd.read_json('content/filtered_businesses.json', lines=True)

# Group the review text with its corresponding business


In [None]:

# Build one row per business whose 'text' is every review of that business
# joined by single spaces.
text_by_business = rev_filtered.groupby('business_id')['text']
grouped_reviews = text_by_business.apply(' '.join)
grouped_reviews = grouped_reviews.reset_index()
grouped_reviews


# Calculate TF-IDF

In [None]:
# TODO: possibly supply our own vocabulary instead of the fitted one.

# Ignore English stop words, terms present in more than 90% of the documents,
# and terms present in fewer than 1% of them.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=0.01)

# One document per business: the concatenated review text.
tfidf_matrix = vectorizer.fit_transform(grouped_reviews['text'])
tfidf_feature_names = vectorizer.get_feature_names_out()

# Dense frame: one row per business, one column per vocabulary term.
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=tfidf_feature_names)

# NOTE: the frame keeps the default positional index; business_id is attached later.

tfidf_df

# Filter Top 10,000 words based on TF-IDF score

In [None]:
# Average TF-IDF score of each term across all businesses.
avg_tfidf_scores = tfidf_df.mean(axis=0)

# Rank terms by average score (highest first) and keep at most the top 10,000.
sorted_tfidf = avg_tfidf_scores.sort_values(ascending=False)
top_10000_words = sorted_tfidf.head(10000).index

# Take an explicit copy: adding a column to a column-subset of tfidf_df is
# chained assignment and triggers pandas' SettingWithCopyWarning otherwise.
top_tfidf_df = tfidf_df[top_10000_words].copy()

# The assignment aligns on the index; tfidf_df was built with the default
# positional index and grouped_reviews was reset_index()'d, so rows line up.
top_tfidf_df['business_id'] = grouped_reviews['business_id']
top_tfidf_df

# Add the columns to the merged data frame

In [None]:
# Keep the concatenated review text per business in its own frame.
text_df = grouped_reviews[['business_id', 'text']]
#grouped_reviews.drop('text', axis=1, inplace=True)
# NOTE(review): `bdf` and `rdf` are not defined in any cell visible here;
# presumably they are the business and review frames (bus_filtered /
# rev_filtered) - confirm and rename before running this cell.
# NOTE(review): later cells read a 'stars_x' column, which only appears if
# both merged frames carry a 'stars' column - verify the review frame does.
merged_df = pd.merge(bdf, rdf, on='business_id')
# Drop the per-review columns, then collapse to one row per business.
merged_df.drop('review_id', axis=1, inplace=True)
merged_df.drop('text', axis=1, inplace=True)
merged_df = merged_df.drop_duplicates(subset='business_id', keep='first')

# Attach the top TF-IDF term columns to each business row.
merged_df = pd.merge(merged_df, top_tfidf_df, on='business_id')

In [None]:
# Order the rows by business id and display the result.
merged_df = merged_df.sort_values(by='business_id')
merged_df

In [None]:
# Display the per-business concatenated review text.
text_df

The datasets are ready!

# Methods Library

In [None]:
from collections.abc import Sequence

def to_xy(df, target):
    """Split a data frame into feature and target arrays as float32.

    Every column other than `target` becomes a feature. When the target
    column's dtype is int64 or int32 the target is one-hot encoded
    (classification); otherwise it is returned as-is (regression).

    Args:
        df: Source data frame.
        target: Name of the target column.

    Returns:
        A `(features, targets)` tuple of float32 NumPy arrays.
    """
    feature_cols = [col for col in df.columns if col != target]

    # Work out the dtype of the target column.
    dtype = df[target].dtypes
    if isinstance(dtype, Sequence):
        dtype = dtype[0]

    # Integer targets are treated as class labels, anything else as a
    # regression value. TensorFlow likes 32 bits.
    if dtype in (np.int64, np.int32):
        targets = pd.get_dummies(df[target])
    else:
        targets = df[target]

    return df[feature_cols].values.astype(np.float32), targets.values.astype(np.float32)

def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Replace column `name` of `df` with its z-score, in place.

    Args:
        df: Data frame to modify.
        name: Column to standardize.
        mean: Mean to subtract; defaults to the column's own mean.
        sd: Standard deviation to divide by; defaults to the column's own
            standard deviation.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread

def missing_median(df, name):
    """Fill missing values in column `name` of `df` with the column median, in place."""
    df[name] = df[name].fillna(df[name].median())

def drop_two_columns(df, column1 = 'business_id', column2 = 'name'):
    """Drop two columns from `df` in place and return the same frame.

    Args:
        df: Data frame to modify.
        column1: First column to drop.
        column2: Second column to drop.

    Returns:
        The same `df` object, with both columns removed.
    """
    # Drop one at a time, in order, so a missing second column still leaves
    # the first one removed.
    for unwanted in (column1, column2):
        df.drop(columns=unwanted, inplace=True)
    return df

### Split train/test data && Create Model





In [None]:
# Manage the data frame to create a better model
# Capture the business names first: the 'name' column is dropped in place below.
names = merged_df['name']
# drop_two_columns drops in place and returns the same object, so `df` and
# `merged_df` refer to the same frame from here on.
df = drop_two_columns(merged_df)
# Fill missing values with the column median.
missing_median(df, 'stars_x')
missing_median(df, 'review_count')

# Standardize review_count to a z-score; 'stars_x' is used as the target and left unscaled.
encode_numeric_zscore(df, 'review_count')

In [None]:
# Build the feature matrix and the regression target.
x, y = to_xy(df, "stars_x")

# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=45)

# Number of input features; identical for every restart, so compute it once.
input_size = x_train.shape[1]

# Create the checkpointer once, outside the loop, so its record of the best
# val_loss carries over between restarts and the file ends up holding the best
# model across all runs. Recreating it per run would let each run overwrite
# the file with its own best, leaving only the last run's result.
# NOTE(review): relies on ModelCheckpoint keeping its best value between
# fit() calls - confirm for the installed Keras version.
checkpointer = ModelCheckpoint(filepath="dnn/relu64adam.keras", verbose=0, save_best_only=True) # save best model

# Train from several random initialisations.
for restart in range(5):
    model = Sequential()

    #Input layer
    model.add(Dense(64, input_dim=input_size, activation='relu'))
    model.add(Dense(32, activation='relu'))
    #Output layer (single linear unit for regression)
    model.add(Dense(1))

    model.compile(loss='mean_squared_error', optimizer='adam')

    # Stop a run once val_loss has not improved by at least 1e-3 for 5 epochs.
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=2, mode='auto')

    # NOTE(review): the test split doubles as validation data, so the reported
    # test score is optimistic - consider a separate validation split.
    model.fit(x_train, y_train,validation_data=(x_test,y_test),callbacks=[monitor,checkpointer],verbose=2,epochs=1000)

model.load_weights('dnn/relu64adam.keras') # load weights from best

## Important: change the checkpoint file name every time we test with different hyperparameters

In [None]:
# Run the model on the test features and report the root-mean-square error.
pred = model.predict(x_test)
mse = metrics.mean_squared_error(pred, y_test)
score = np.sqrt(mse)
print("Score (RMSE): {}".format(score))

In [None]:
# Print out the predictions
# Put ground truth and prediction side by side, one row per test sample.
df_y = pd.DataFrame(y_test, columns=['ground_truth'])
df_pred = pd.DataFrame(pred, columns=['predicted'])
# Both frames use the default positional index, so the column-wise concat lines rows up.
result = pd.concat([df_y, df_pred],axis=1)
result

# Visualizing (Regression Models)

In [None]:
#Imports and Methods
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure, show

# Regression chart.
def chart_regression(pred, y, sort=True):
    """Plot expected and predicted values as two lines on one chart.

    Args:
        pred: Predicted values; any array-like, flattened to 1-D before use.
        y: Expected values; any array-like, flattened to 1-D before use.
        sort: When true, order the samples by expected value before plotting.
    """
    # Flatten both inputs, so an (n, 1) model output is accepted directly
    # instead of failing in the DataFrame constructor.
    t = pd.DataFrame({'pred': np.asarray(pred).flatten(),
                      'y': np.asarray(y).flatten()})
    if sort:
        t.sort_values(by=['y'], inplace=True)
    plt.plot(t['y'].tolist(), label='expected')
    plt.plot(t['pred'].tolist(), label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

In [None]:
# Plot the chart: expected vs. predicted values, sorted by expected value.
# pred is flattened here; chart_regression flattens only y.
chart_regression(pred.flatten(),y_test, sort=True)

# Prediction with 5 businesses

In [None]:
# Sample predictions
# `names` is in full-dataset row order, whereas y_test and pred follow the
# shuffled test split, so names[i] would not belong to the same business as
# y_test[i]. Re-run the split on the names with the same test_size and
# random_state used for x/y to recover the matching test-set names.
# Keep these two values in sync with the x/y split.
_, names_test = train_test_split(names.to_numpy(), test_size=0.25, random_state=45)

# Show up to the first five test businesses.
for rank, (name, actual, predicted) in enumerate(zip(names_test[:5], y_test[:5], pred[:5]), start=1):
    print("{}. Business Name: {}, Stars: {}, predicted Stars: {}".format(rank, name, actual, predicted))