In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from keras.models import Model
from keras.layers import Activation, Dense, Flatten, Input, Embedding, Concatenate, Multiply, Dropout, LSTM, Softmax, Bidirectional, LayerNormalization, BatchNormalization, Conv1D, MaxPooling1D, LeakyReLU
from keras.callbacks import EarlyStopping
from keras.metrics import RootMeanSquaredError
from keras.optimizers import Adam
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns
import tqdm

## Load data

In [None]:
# Rating splits, read in train / valid / test order.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/valid.csv', 'data/test.csv')
)

# Side tables. Columns whose names would be ambiguous once joined onto the
# rating rows are given a user_/business_ prefix right after loading.
user_df = pd.read_csv('data/user.csv').rename(
    columns={'review_count': 'user_review_count', 'name': 'user_name'}
)
business_df = pd.read_csv('data/business.csv').rename(
    columns={
        'stars': 'business_stars',
        'review_count': 'business_review_count',
        'name': 'business_name',
    }
)

print(train_df.head())

## Data Stats

In [None]:
# Summary statistics of the training ratings.
print(train_df["stars"].describe())

# Number of training rows per rating value, ordered by rating.
stars_distribution = train_df["stars"].value_counts().sort_index()

plt.title("Stars Distribution")
plt.bar(stars_distribution.index, stars_distribution.values)
# Put each count label at the x-position of its own bar (the rating value
# itself), so labels stay aligned even if a rating value is absent or the
# scale does not start at 1.
for star, count in stars_distribution.items():
    plt.text(star, count, str(count), ha='center', va='bottom', fontsize=10)
plt.show()

## Data Preprocessing

In [None]:
# Get the unique users and businesses across ALL splits, so that every id that
# appears in validation/test also gets an integer index.
users = pd.concat([train_df['user_id'], valid_df['user_id'], test_df['user_id']]).unique()
businesses = pd.concat([train_df['business_id'], valid_df['business_id'], test_df['business_id']]).unique()

# Create user and business dictionaries for mapping to integers
user_dict = {user: i for i, user in enumerate(users)}
business_dict = {business: i for i, business in enumerate(businesses)}

###################### id inputs and targets ####################
# Map the users and businesses in each dataframe to integers
for split_df in (train_df, valid_df, test_df):
    split_df['user_id_numeric'] = split_df['user_id'].map(user_dict)
    split_df['business_id_numeric'] = split_df['business_id'].map(business_dict)

user_train_X = train_df['user_id_numeric'].values
business_train_X = train_df['business_id_numeric'].values
train_y = train_df['stars'].values

user_valid_X = valid_df['user_id_numeric'].values
business_valid_X = valid_df['business_id_numeric'].values
valid_y = valid_df['stars'].values

user_test_X = test_df['user_id_numeric'].values
business_test_X = test_df['business_id_numeric'].values
test_y = test_df['stars'].values

####################### meta_data ###############################
def _attach_metadata(ratings_df):
    """Join business and user metadata onto a rating frame, one output row per input row.

    A left join keeps every rating row in its original order, and
    `validate='many_to_one'` raises if a metadata table has duplicate keys.
    Together they guarantee the result stays row-aligned with the id arrays
    built above (an inner join would silently drop or duplicate rows).
    """
    return (
        ratings_df
        .merge(business_df, on='business_id', how='left', validate='many_to_one')
        .merge(user_df, on='user_id', how='left', validate='many_to_one')
    )


train_merged_df = _attach_metadata(train_df)
valid_merged_df = _attach_metadata(valid_df)
test_merged_df = _attach_metadata(test_df)

features = ['business_stars', 'business_review_count', 'is_open', 'user_review_count',
            'useful', 'funny', 'cool', 'fans', 'average_stars', 'compliment_hot', 'compliment_more',
            'compliment_profile', 'compliment_cute', 'compliment_list', 'compliment_note', 'compliment_plain',
            'compliment_cool', 'compliment_funny', 'compliment_writer', 'compliment_photos']

meta_train_X = train_merged_df[features].values.astype(float)
meta_valid_X = valid_merged_df[features].values.astype(float)
meta_test_X = test_merged_df[features].values.astype(float)

# Standardize every split with the TRAINING mean/std. Using each split's own
# statistics would feed the model differently scaled features at evaluation
# time than it saw during training.
meta_mean = np.nanmean(meta_train_X, axis=0)
meta_std = np.nanstd(meta_train_X, axis=0)
meta_std[meta_std == 0] = 1.0  # constant feature: avoid dividing by zero


def _standardize(raw_features):
    """Scale with the training statistics; missing values become 0 (the training mean)."""
    return np.nan_to_num((raw_features - meta_mean) / meta_std)


meta_train_X = _standardize(meta_train_X)
meta_valid_X = _standardize(meta_valid_X)
meta_test_X = _standardize(meta_test_X)

##############################################################

print(f"user: {len(users)}, business: {len(businesses)}")

## Construct Model

In [None]:
def rmse(pred, actual):
    """Root-mean-squared error over the entries whose true rating is non-zero.

    Args:
        pred: Predicted ratings; any array-like (e.g. the (N, 1) output of
            `model.predict`, an (N,) array, or a list).
        actual: True ratings with the same number of elements as `pred`.
            Entries equal to zero are ignored.

    Returns:
        The RMSE between `pred` and `actual` on the non-zero entries of `actual`.

    Raises:
        ValueError: If the inputs differ in size or `actual` has no non-zero entry.
    """
    # Flatten both sides first so (N, 1) predictions line up with (N,) targets
    # regardless of which of the two carries the extra axis.
    pred = np.asarray(pred, dtype=float).ravel()
    actual = np.asarray(actual, dtype=float).ravel()
    if pred.size != actual.size:
        raise ValueError(
            f"pred and actual must have the same number of elements, got {pred.size} and {actual.size}"
        )

    # Ignore ratings with value zero.
    rated = actual != 0
    if not rated.any():
        raise ValueError("actual contains no non-zero ratings to evaluate")

    return np.sqrt(np.mean((pred[rated] - actual[rated]) ** 2))

In [None]:
# Three inputs: an integer user index, an integer business index, and the
# standardized metadata vector for the (user, business) pair.
user_input = Input(shape=(1,))
business_input = Input(shape=(1,))
# The shape must be a tuple: `(20)` is just the int 20. Deriving the width from
# `features` also keeps the model in sync with the metadata columns.
meta_input = Input(shape=(len(features),))

############################### MLP ##################################

# MLP Embedding layers for users and business
mlp_embedding_dim = 10
user_mlp_embedding    = Embedding(input_dim=len(users), output_dim=mlp_embedding_dim)(user_input)
busines_mlp_embedding = Embedding(input_dim=len(businesses), output_dim=mlp_embedding_dim)(business_input)

# Flatten the embeddings: (batch, 1, dim) -> (batch, dim)
user_mlp_flatten    = Flatten()(user_mlp_embedding)
busines_mlp_flatten = Flatten()(busines_mlp_embedding)

mlp_concatenated = Concatenate()([user_mlp_flatten, busines_mlp_flatten])

# MLP layers
hidden_layer = Dense(64, activation='relu')(mlp_concatenated)
hidden_layer = Dense(64, activation='relu')(hidden_layer)
mlp_vector   = Dense(64, activation='relu')(hidden_layer)

############################### MF ###################################

# MF Embedding layers for users and business (separate from the MLP embeddings)
mf_embedding_dim = 10
user_mf_embedding    = Embedding(input_dim=len(users), output_dim=mf_embedding_dim)(user_input)
busines_mf_embedding = Embedding(input_dim=len(businesses), output_dim=mf_embedding_dim)(business_input)
# Flatten the embeddings
user_mf_flatten    = Flatten()(user_mf_embedding)
busines_mf_flatten = Flatten()(busines_mf_embedding)

user_mf_norm    = LayerNormalization()(user_mf_flatten)
busines_mf_norm = LayerNormalization()(busines_mf_flatten)

# Element-wise product of the normalized user and business vectors
mf_vector = Multiply()([user_mf_norm, busines_mf_norm])

############################### META ################################
meta_norm    = LayerNormalization()(meta_input)
hidden_layer = Dense(64, activation='relu')(meta_norm)
hidden_layer = Dense(64, activation='relu')(hidden_layer)
meta_vector  = Dense(64, activation='relu')(hidden_layer)

########################### Concatenate ##############################

# Fuse the three branches and regress a single rating value (linear output).
concatenated = Concatenate()([mf_vector, mlp_vector, meta_vector])
output_layer = Dense(1)(concatenated)

######################################################################

# Create the model
model = Model(inputs=[user_input, business_input, meta_input], outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=[RootMeanSquaredError()])

## Training

In [None]:
# Early stopping with a patience of 10 epochs.
# NOTE(review): with epochs=1 below, that patience can never be exhausted, so
# the callback has no effect unless the epoch count is raised.
early_stopping = EarlyStopping(patience=10)

train_inputs = [user_train_X, business_train_X, meta_train_X]
valid_inputs = [user_valid_X, business_valid_X, meta_valid_X]

history = model.fit(
    train_inputs,
    train_y,
    epochs=1,
    batch_size=64,
    validation_data=(valid_inputs, valid_y),
    callbacks=[early_stopping],
)

In [None]:
# Training vs. validation loss per epoch, drawn on one explicit Axes.
fig, ax = plt.subplots()
for curve_key in ('loss', 'val_loss'):
    ax.plot(history.history[curve_key])
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc='upper right')
fig.suptitle('Model training history')
plt.show()

## Prediction

In [None]:
# Error on the data the model was fitted on.
train_pred = model.predict([user_train_X, business_train_X, meta_train_X])
print("Train set RMSE: ", rmse(train_pred, train_y))

# Error on the held-out validation split. The result stays in `y_pred`,
# which the cell that writes the validation predictions reads.
y_pred = model.predict([user_valid_X, business_valid_X, meta_valid_X])
print("Validation set RMSE: ", rmse(y_pred, valid_y))

In [None]:
# Save the validation predictions to CSV.
# `y_pred` must still hold the validation-set predictions at this point.
assert len(y_pred) == len(valid_df), "y_pred does not match the validation set; re-run the validation prediction"

# Build the export from a derived frame instead of editing `valid_df` in place:
# the true ratings in `valid_df` stay intact and the cell can be re-run safely
# (an in-place drop raises KeyError the second time).
valid_pred_df = (
    valid_df
    .drop(columns=["user_id_numeric", "business_id_numeric"])
    .assign(stars=np.ravel(y_pred))  # flatten (N, 1) model output to one value per row
)
valid_pred_df.to_csv('data/valid_pred.csv', index=False)

In [None]:
# Predict on the test split and save the result to CSV.
y_pred = model.predict([user_test_X, business_test_X, meta_test_X])

# Export from a derived frame so `test_df` is not modified: its original
# `stars` column survives and re-running this cell does not fail on
# already-dropped columns.
test_pred_df = (
    test_df
    .drop(columns=["user_id_numeric", "business_id_numeric"])
    .assign(stars=np.ravel(y_pred))  # flatten (N, 1) model output to one value per row
)
test_pred_df.to_csv('data/test_pred.csv', index=False)

In [None]:
# Project the normalized metadata onto its first two principal components.
# The projection is fitted on the training split only and then applied
# unchanged to validation and test. A fixed random_state keeps the result
# repeatable when scikit-learn selects its randomized SVD solver.
# (PCA is imported with the other dependencies in the first cell.)
pca = PCA(n_components=2, random_state=42)
meta_train_X_pca = pca.fit_transform(meta_train_X)
meta_valid_X_pca = pca.transform(meta_valid_X)
meta_test_X_pca = pca.transform(meta_test_X)

In [None]:
# Scatter of the training metadata in PCA space, coloured by the true rating.
fig, ax = plt.subplots()
pca_points = ax.scatter(
    meta_train_X_pca[:, 0],
    meta_train_X_pca[:, 1],
    c=train_y,
    cmap='viridis',
)
ax.set_xlabel('PCA 1')
ax.set_ylabel('PCA 2')
# Fixed viewing window; points outside it are not shown.
ax.set_xlim(-2, 10)
ax.set_ylim(-7, 7)
ax.set_title('PCA on meta data')
fig.colorbar(pca_points, ax=ax)
plt.show()