# <center> XOKind - Machine Learning/Data Science Intern Interview </center>
## <center> Yelp rating predictions </center>
### <center> Traditional Machine Learning vs. Graph Machine Learning </center>

#### Traditional Machine Learning - Multilayer Perceptron Neural Networks

In [None]:
#Importing necessary libraries

import warnings

warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from keras.utils import np_utils

In [None]:
#Function to generate confusion matrix images from confusion matrix array

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix, draw it as an annotated heat map and save it.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are labelled 'True label' and columns
        'Predicted label' on the plot.
    classes : sequence
        Tick labels, one per row/column of `cm`.
    normalize : bool, default False
        When True, every row is divided by its row total before printing
        and plotting, and cells are rendered with two decimals.
    title : str, default 'Confusion matrix'
        Figure title. The figure is also written to ``title + ".png"`` in
        the current working directory.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap used for the heat map.

    Returns
    -------
    None
    """
    # Accept nested lists as well as arrays; a no-op for ndarray input.
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class with no true samples has an all-zero row; dividing it by 1
        # keeps the row at zero instead of filling it with NaN.
        safe_totals = np.where(row_totals == 0, 1, row_totals)
        cm = cm.astype('float') / safe_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)
    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so the annotation stays
    # readable against the darker end of the colormap.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.savefig(title+".png")

In [None]:
# Locations of the three JSON input files, relative to the working directory

business_json_path, review_json_path, user_json_path = (
    'dataset/business.json',
    'dataset/review.json',
    'dataset/user.json',
)


In [None]:
# Stream the business file in chunks and keep only the restaurant rows

size = 500000

business = pd.read_json(
    business_json_path,
    lines=True,
    chunksize=size,
    dtype={'business_id':str,'name':str,
           'address':str,'city':str,
           'latitude':float,'longitude':float,
           'state':str,'postal_code':str,
           'stars':float,'review_count':int,
           'is_open':int,
           'attributes':object,'categories':object,
           'hours':object},
)


business_drop_columns = ['name', 'address', 'city', 'state', 'postal_code',
                         'latitude', 'longitude', 'attributes', 'hours']
chunk_list_business = []

for chunk_business in business:
    # Discard the unused columns, then prefix the remaining generic names so
    # they cannot clash with columns from the other files
    chunk_business = chunk_business.drop(columns=business_drop_columns).rename(
        columns={'stars': 'business_stars',
                 'review_count': 'business_review_count',
                 'review_stars': 'business_review_stars'})

    # Keep rows whose category string mentions 'Restaurants' (case-sensitive);
    # rows with a missing category are treated as non-matching
    chunk_business = chunk_business[
        chunk_business['categories'].str.contains('Restaurants', case=True, na=False)
    ]

    chunk_list_business.append(chunk_business)


df_restaurants = pd.concat(chunk_list_business, axis=0, join='outer', ignore_index=True)

In [None]:
# Release the per-chunk intermediates; df_restaurants already holds the concatenated rows

del chunk_business, chunk_list_business

In [None]:
# =============================================================================
# Reviews data
# =============================================================================
# Stream the review file in chunks and keep only the reviews that belong to a
# business present in df_restaurants.
size = 500000

review = pd.read_json(review_json_path, lines=True,
                      dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':int,
                             'date':str,'text':str,'useful':int,
                             'funny':int,'cool':int},
                      chunksize=size)

chunk_list = []
for chunk_review in review:

    # Drop columns that aren't needed
    chunk_review = chunk_review.drop(['text', 'date', 'review_id','useful','funny','cool'], axis=1)

    # Renaming column name to avoid conflicts
    chunk_review = chunk_review.rename(columns={'stars': 'review_stars'})

    # Inner merge with edited business file so only reviews related to the restaurants remain
    chunk_merged = pd.merge(df_restaurants, chunk_review, on='business_id', how='inner')

    # Show feedback on progress. Report the chunk's actual row count: the
    # last chunk is normally shorter than the configured chunk size.
    print(f"{chunk_merged.shape[0]} out of {len(chunk_review):,} related reviews")

    chunk_list.append(chunk_merged)


# After trimming down the review file, concatenate all relevant data back to one dataframe
df_restaurant_reviews = pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)

In [None]:
# Release the per-chunk intermediates; df_restaurant_reviews already holds the merged rows

del chunk_review, chunk_merged, chunk_list

In [None]:
# =============================================================================
# User data
# =============================================================================
# Stream the user file in chunks, dropping the columns that are not used.

size = 500000

user = pd.read_json(user_json_path, lines=True,
                      dtype={'user_id':str,'name':str,
                             'yelping_since':str,'review_count':int,
                             'friends':object,'useful':int,
                             'funny':int,'cool':int,'fans':int,
                             'elite':list, 'average_stars':float,'compliment_hot':int,
                             'compliment_more':int,'compliment_profile':int,
                             'compliment_cute':int,'compliment_list':int,'compliment_note':int,
                             'compliment_plain':int,'compliment_cool':int,'compliment_funny':int,
                             'compliment_writer':int,'compliment_photos':int},
                      chunksize=size)

user_drop_columns = ['name', 'yelping_since', 'friends']

chunk_list_user = []

for chunk_user in user:
    # Drop columns that aren't needed
    chunk_user = chunk_user.drop(user_drop_columns, axis=1)

    # NOTE: no columns are renamed here. The earlier rename() call discarded
    # its result, so it never took effect, and the scaling cell
    # (cols_to_norm) refers to 'review_count' and 'average_stars' by their
    # original names. There is no name clash: the business columns are
    # prefixed and the review chunks drop useful/funny/cool before merging.
    chunk_list_user.append(chunk_user)


# concatenate to one dataframe
df_user = pd.concat(chunk_list_user, ignore_index=True, join='outer', axis=0)

In [None]:
# Release the per-chunk intermediates; df_user already holds the concatenated rows

del chunk_user, chunk_list_user

In [None]:
# Inner-join the user table onto the restaurant reviews by user_id, so every row
# carries the user, restaurant and review columns together

merged_df = pd.merge(df_user, df_restaurant_reviews, on='user_id', how='inner')

In [None]:
# Release the two merge inputs; merged_df now holds the joined rows

del df_user, df_restaurant_reviews

In [None]:
# Collapse the per-user compliment counters into one feature (their row-wise
# mean) and remove the identifier columns that are not used as features

merged_drop_columns = ['business_id', 'user_id', 'elite']

compliment_columns = ['compliment_cool', 'compliment_cute', 'compliment_funny',
                      'compliment_hot', 'compliment_list', 'compliment_more',
                      'compliment_note', 'compliment_photos', 'compliment_plain',
                      'compliment_profile', 'compliment_writer']

# The mean must be computed before the source columns are dropped below
merged_df['mean_compliment_score'] = merged_df[compliment_columns].mean(axis=1)

merged_df.drop(columns=merged_drop_columns + compliment_columns, inplace=True)

In [None]:
# Expand by restaurant category to investigate restaurant categories and their overall count in data.
# Each row's comma-separated category string is split into a list and exploded,
# giving one row per (review, category) pair; the original index is repeated.
# The split reads from merged_df itself (the earlier `df_yelp` name is not defined anywhere).

df_yelp_expand_by_category = merged_df.assign(
    categories=merged_df['categories'].str.split(', ')
).explode('categories')

# Number of rows per category, sorted from most to least frequent
df_yelp_category_count = df_yelp_expand_by_category['categories'].value_counts()


In [None]:
# Keep the ten most frequent categories. value_counts() orders by descending
# count; position 0 is expected to be the umbrella 'Restaurants' label, so the
# selection takes positions 1 through 10.
top_10_restaurants = df_yelp_category_count.index[1:11].tolist()


# Retain only the exploded rows whose category is one of those ten
df_yelp_top10 = df_yelp_expand_by_category[
    df_yelp_expand_by_category['categories'].isin(top_10_restaurants)
]


In [None]:
# One-hot encode the category column into sparse 'is_<category>' indicator
# columns; drop_first leaves out the first category level

df_yelp_top10_ohe = pd.get_dummies(
    df_yelp_top10,
    columns=['categories'],
    prefix='is',
    sparse=True,
    drop_first=True,
)


In [None]:
# Release the frames that are no longer needed now that the encoded table exists

del df_yelp_expand_by_category, merged_df

In [None]:
# Min-max scale the numeric columns to the [0, 1] range:
#   (x - min(x)) / (max(x) - min(x))
# Putting features on a comparable scale matters for gradient-descent based
# models such as the MLPNN. (This is min-max scaling, not z-score standardisation.)

cols_to_norm = ['review_count','useful', 'funny', 'cool', 'fans', 'average_stars', 'business_stars',
                'business_review_count', 'mean_compliment_score']


def _min_max_scale(column):
    """Return `column` rescaled to [0, 1]; a constant column becomes all zeros."""
    lowest = column.min()
    value_range = column.max() - lowest
    # A constant column has a zero range; dividing by 1 yields zeros rather
    # than NaN values that would poison training.
    return (column - lowest) / (value_range if value_range != 0 else 1)


df_yelp_top10_ohe[cols_to_norm] = df_yelp_top10_ohe[cols_to_norm].apply(_min_max_scale)

In [None]:
# Split into training and test arrays: 20% of the rows are held out, and the
# fixed random_state makes the split repeatable. The target is 'review_stars';
# every other column is a feature.

x_train, x_test, y_train, y_test = train_test_split(
    df_yelp_top10_ohe.drop(columns=['review_stars']).values,
    df_yelp_top10_ohe[['review_stars']].values,
    test_size=0.2,
    random_state=0,
)


In [None]:
# Cast both feature matrices to float (per the original note, they arrive here as object dtype)

x_train = x_train.astype(np.float64)
x_test = x_test.astype(np.float64)

In [None]:
# Map the training targets to consecutive class indices with a LabelEncoder,
# then one-hot encode those indices for the categorical cross-entropy loss

encoder = LabelEncoder().fit(y_train)
encoded_y_train = encoder.transform(y_train)

y_train_ohe = np_utils.to_categorical(encoded_y_train)

In [None]:
# Class weights to offset the imbalanced class sizes: each class index is
# weighted by the inverse of that class's share of the training targets

unique, counts = np.unique(y_train, return_counts=True)
counts = counts / counts.sum()

inv_counts = 1 / counts

# Keys are the positions 0..k-1 of the sorted unique labels
class_weights = dict(enumerate(inv_counts))

In [None]:
tf.keras.backend.clear_session()

# A small MLP with mostly default settings: two ReLU hidden layers (40 and 20
# units) followed by a 5-unit softmax output, one unit per class

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(40, input_dim=x_train.shape[1], activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(20, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(5, activation=tf.nn.softmax))


model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model and keep the History object, which records the per-epoch
# training and validation metrics. 10% of the training data is used for
# validation, and the class weights are applied to the loss.

history = model.fit(
    x=x_train,
    y=y_train_ohe,
    epochs=10,
    batch_size=1000,
    validation_split=0.1,
    class_weight=class_weights,
    verbose=1,
)

In [None]:
# Plot training curves (training vs. validation accuracy per epoch).
# Depending on the Keras/TensorFlow version, the accuracy metric is recorded
# under 'acc' or 'accuracy' (validation: 'val_acc' / 'val_accuracy'), so use
# whichever key this run actually produced instead of hard-coding one.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Prepare the test targets for performance comparison and predict ratings for the test data

y_test_encoded = encoder.transform(y_test)
y_test_ohe = np_utils.to_categorical(y_test_encoded)


# Class index with the highest softmax probability for every test row.
# This is the documented replacement for Sequential.predict_classes(), which
# is not available in newer Keras releases.
y_pred_encoded = np.argmax(model.predict(x_test), axis=-1)

# Translate the class indices back into the original rating labels through the
# fitted encoder, instead of assuming the labels are exactly index + 1.
y_pred = encoder.inverse_transform(y_pred_encoded)

y_pred_ohe = np_utils.to_categorical(y_pred_encoded)

In [None]:
# Print the classification report (precision, recall, f1-score and accuracy),
# labelling the classes '1' through '5'

target_names = [str(rating) for rating in range(1, 6)]

print(classification_report(y_true=y_test, y_pred=y_pred, target_names=target_names))

In [None]:
# Build the confusion matrix of true vs. predicted test labels and print the raw counts
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print(cm)

In [None]:
# Raw-count confusion matrix; the figure is saved as 'CM_MLPNN.png'
plot_confusion_matrix(cm=cm, classes=target_names, title='CM_MLPNN', normalize=False)

In [None]:
# Row-normalised confusion matrix; the figure is saved as 'CM_MLPNN_normalized.png'
plot_confusion_matrix(cm=cm, classes=target_names, title='CM_MLPNN_normalized', normalize=True)

### <center> END </center>