In [22]:
import numpy as np
import pandas as pd

import tensorflow as tf
import keras

from sklearn.utils import shuffle
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import cosine

import os, math, pickle

In [2]:
def merge_shuffle_split(split=1.0):
    movies_df_mod = pd.read_csv('data/movies_mod.csv')
    ratings_df_removed = pd.read_csv('data/ratings_df_last_liked_movie_removed.csv')
    
    # Since ratings_df_removed is the template for merging, it will be shuffled:
    ratings_df_removed = shuffle(ratings_df_removed)
    
    # Selecting a certain range from ratings_df_removed, train + test:
    selection_range = int(len(ratings_df_removed) * (split))
    ratings_df_removed = ratings_df_removed.iloc[: selection_range, :]
    
    # Merging begins:
    ratings_df_removed = ratings_df_removed.merge(movies_df_mod, how= 'left', on= 'movieId').dropna()
    del movies_df_mod


    # Changing the columns names to differentiate between the columns of total_user_like_df and total_user_dislike_df:
    total_user_like_df = pd.read_csv('data/total_user_like_df.csv')

    like_columns = list(total_user_like_df.columns)
    like_columns_modified = []

    for column in like_columns:
        if column == 'userId':
            like_columns_modified.append('userId')
        else:
            modify_column = 'user_like_' + column
            like_columns_modified.append(modify_column)

    total_user_like_df.columns = like_columns_modified

    ratings_df_removed = ratings_df_removed.merge(total_user_like_df, how= 'left', on= 'userId').dropna()
    del total_user_like_df
    

    total_user_dislike_df = pd.read_csv('data/total_user_dislike_df.csv')    

    dislike_columns = list(total_user_dislike_df.columns)
    dislike_columns_modified = []

    for column in dislike_columns:
        if column == 'userId':
            dislike_columns_modified.append('userId')
        else:
            modify_column = 'user_dislike_' + column
            dislike_columns_modified.append(modify_column)

    total_user_dislike_df.columns = dislike_columns_modified

    # Merging all the DFs to create one final DF:
    ratings_df_removed = ratings_df_removed.merge(total_user_dislike_df, how= 'left', on= 'userId').dropna()

    # Removing loaded DFs to save on RAM space:
    del total_user_dislike_df

    movie_tags_df = pd.read_csv('data/final/movie_tags_df.csv')
    ratings_df_removed = ratings_df_removed.merge(movie_tags_df, how= 'left', on= 'movieId').dropna()
    del movie_tags_df

    like_dislike_tags = (pd.read_csv('data/final/like_dislike_tags.csv')).astype('int64')
    ratings_df_removed = ratings_df_removed.merge(like_dislike_tags, how= 'left', on= 'userId').dropna()
    del like_dislike_tags
    
    like_columns_modified.remove('userId')
    dislike_columns_modified.remove('userId')
    like_columns.remove('userId')
    
    genres_like = ratings_df_removed.loc[:, like_columns_modified]
    genres_dislike = ratings_df_removed.loc[:, dislike_columns_modified]
    genres_movie = ratings_df_removed.loc[:, like_columns]
    
    ratings = list(ratings_df_removed.rating)
    
    del ratings_df_removed
    
    return genres_like, genres_dislike, genres_movie, ratings

In [3]:
# Generating the datasets:
genres_like, genres_dislike, genres_movie, ratings = merge_shuffle_split() # Default split of the whole ratings.csv dataset is set to be 50%; already shuffled

train_split = 0.8 # This would be about 80% of dataset
split_index = int(len(ratings) * train_split)

genres_like_train = genres_like.iloc[: split_index, :]
genres_like_test = genres_like.iloc[split_index :, :]
del genres_like # Attempting to save RAM space :)

genres_dislike_train = genres_dislike.iloc[: split_index, :]
genres_dislike_test = genres_dislike.iloc[split_index :, :]
del genres_dislike

genres_movie_train = genres_movie.iloc[: split_index, :]
genres_movie_test = genres_movie.iloc[split_index :, :]
del genres_movie

ratings_scaled_train = np.array(ratings[: split_index])
ratings_scaled_test = np.array(ratings[split_index :])

In [4]:
# Obtain the quantity unique users and movies
ratings_df = pd.read_csv('../datasets/ratings.csv')
movies_df = pd.read_csv('../datasets/movies.csv')

ratings_df.replace(np.nan, '', inplace=True)
movies_df.replace(np.nan, '', inplace=True)

n_users = ratings_df['userId'].nunique()
n_movies = ratings_df['movieId'].nunique()

del ratings_df

In [94]:
max_rating = int(max(ratings_scaled_train))

def custom_activation(x):
    return keras.backend.sigmoid(x) * (max_rating + 0.5)

# User input
user_liked_genres = keras.Input(shape=(genres_like_train.shape[1],))
user_disliked_genres = keras.Input(shape=(genres_dislike_train.shape[1],))

# Movie input
movie_genres = keras.Input(shape=(genres_movie_train.shape[1],))

# Embedding layers
embedding_dim = 64
# hidden layers
hidden_l = [128, 64, 32]
# dropout values
drop=[0.3, 0.2, 0.1]

# Embedding layer for movie
movie_embedding = keras.layers.Embedding(input_dim=n_movies + 1, output_dim=embedding_dim, name='movie_embedding', trainable=True)(movie_genres)
movie_flatten = keras.layers.Flatten()(movie_embedding)

# Concatenate user liked and disliked embeddings
user_genres = keras.layers.Concatenate()([user_liked_genres, user_disliked_genres])
user_first_layer = keras.layers.Dense(64, activation='relu', name='dense_user')(user_genres)
# Embedding layer for user
user_embedding = keras.layers.Embedding(input_dim=n_users + 1, output_dim=embedding_dim, name='user_embedding', trainable=True)(user_liked_genres)
user_flatten = keras.layers.Flatten()(user_embedding)

# Concatenate two previous levels and make a deep NN with Dense layers
output_model = keras.layers.Concatenate()([movie_flatten, user_flatten])

for i in range(0, len(hidden_l)):
    output_model = keras.layers.Dense(hidden_l[i], activation='relu', kernel_initializer='uniform', name='dense__' + str(hidden_l[i]))(output_model)
    output_model = keras.layers.Dropout(drop[i])(output_model)

output_embedding = keras.layers.Dense(1)(output_model)
output_embedding = keras.layers.Activation(custom_activation)(output_embedding)

# Molding the model together
embedding_model = keras.Model(inputs=[user_liked_genres, user_disliked_genres, movie_genres], outputs=output_embedding)

# Compiling the Model:
embedding_model.compile(loss='mean_squared_error', optimizer='adam')

In [95]:
# Models Training:
if os.path.exists('models/') != True: 
    os.mkdir('models/')

batch_size = 512
epochs = 10

# scheduler for learning rate
def scheduler(epoch):
    if epoch < 2:
        return 0.001
    else:
        return 0.001 * math.exp(0.2 * (2 - epoch))

Learning_Rate_Callback = keras.callbacks.LearningRateScheduler(scheduler)

class Save_Progress_Callback(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None): # Saving and printing after each epoch
        lr = float(tf.keras.backend.get_value(self.model.optimizer.learning_rate))
        print("Epoch {}, loss is {:5.3f}, validation loss is {:5.3f}, learning rate is {}".format(epoch, logs["loss"],
                                                                                                                   logs["val_loss"], lr))
             
embedding_model.fit(x= [genres_like_train, genres_dislike_train, genres_movie_train], 
                  y= ratings_scaled_train, 
                  epochs= epochs, verbose= 0, batch_size= batch_size, validation_split= 0.1, shuffle= True,
                  callbacks=[Learning_Rate_Callback, Save_Progress_Callback()])

embedding_model.save('models/embedding_model.h5', overwrite= True, include_optimizer= True)

Epoch 0, loss is 1.119, validation loss is 1.068, learning rate is 0.0010000000474974513
Epoch 1, loss is 1.095, validation loss is 1.058, learning rate is 0.0010000000474974513
Epoch 2, loss is 1.087, validation loss is 1.048, learning rate is 0.0010000000474974513
Epoch 3, loss is 1.081, validation loss is 1.045, learning rate is 0.0008187307394109666
Epoch 4, loss is 1.077, validation loss is 1.043, learning rate is 0.0006703200633637607
Epoch 5, loss is 1.074, validation loss is 1.041, learning rate is 0.0005488116294145584
Epoch 6, loss is 1.072, validation loss is 1.040, learning rate is 0.0004493289743550122
Epoch 7, loss is 1.071, validation loss is 1.040, learning rate is 0.0003678794309962541
Epoch 8, loss is 1.070, validation loss is 1.039, learning rate is 0.0003011942026205361
Epoch 9, loss is 1.069, validation loss is 1.038, learning rate is 0.00024659695918671787


In [96]:
embedding_model.evaluate([genres_like_test, genres_dislike_test, genres_movie_test], ratings_scaled_test)



1.0756630897521973

In [97]:
def recommendation_top(user_id, top_k=10, method='scalar'):
    # obtain user and movies embeddings
    user_embeddings = embedding_model.get_layer('user_embedding').get_weights()[0]
    movie_embeddings = embedding_model.get_layer('movie_embedding').get_weights()[0]

    # acquire embedding for our user
    user_embedding = user_embeddings[user_id]

    # we can calculate similarities between embeddings by dot products of vectors due to equal shapes of these vectors
    # or by cosine distance between these vectors
    if method == 'scalar':
        # calcilate similarity between movie and user embeddings
        similarities = np.dot(movie_embeddings, user_embedding)
        # obtain indices the most relevant movies
        top_indices = similarities.argsort()[-top_k:][::-1]
    elif method == 'cosin':
        # calculate cosine distance between embeddings
        similarities = [1 - cosine(movie_embedding, user_embedding) for movie_embedding in movie_embeddings]
        top_indices = np.argsort(similarities)[::-1][:top_k]

    # print recommendations
    recommended_movies = [movies_df['title'][i] for i in top_indices]
    print("Recommended movies by '{}' method for user {}: {}\n".format(method, user_id, recommended_movies))

In [98]:
# obtain recommendations for users
recommendation_top(user_id=123, top_k=10)
recommendation_top(user_id=123, top_k=10, method='cosin')
recommendation_top(user_id=3, top_k=10)
recommendation_top(user_id=250000, top_k=10)

Recommended movies by 'scalar' method for user 123: ['Party 2, The (Boum 2, La) (1982)', 'Steep (2007)', 'At the First Breath of Wind (2003)', 'Slightly French (1949)', 'Cliente (2008)', 'Bitter Harvest (2017)', 'Meet the People (1944)', 'Wonderful Radio (2012)', 'The Issa Valley (1982)', 'Murder, She Baked: A Peach Cobbler Mystery (2016)']

Recommended movies by 'cosin' method for user 123: ['Steep (2007)', 'Cliente (2008)', 'Party 2, The (Boum 2, La) (1982)', 'At the First Breath of Wind (2003)', 'Billy Jack Goes to Washington (1977)', 'The Issa Valley (1982)', 'Public Enemy, The (1931)', 'The Candidate (2011)', 'Scream (1981)', 'Murder, She Baked: A Peach Cobbler Mystery (2016)']

Recommended movies by 'scalar' method for user 3: ['All That I Am (2013)', 'Panorama View, Street Car Motor Room (1904)', 'Canyon, The (2009)', '3x3 Eyes: Legend of the Divine Demon (1995)', 'Finishing School (1934)', 'Trophy (2017)', 'Contrato Vitalício (2016)', 'Tramps (2016)', 'Disorder (2009)', 'F.T.W.

In [99]:
def stats(predictions, true, flex_range= 0.5):
    predictions_list = []
    round_list = np.arange(0.5, 5.5, 0.5)

    for value in predictions:
        value_ori = value
        compare_diff = 99999
        value_round = 0

        for rating in round_list:
            compare_value = abs(value_ori - rating)

            if compare_value < compare_diff: # The absolute difference value that is closest to 0 is the rating the prediction will be rounded to
                compare_diff = compare_value
                value_round = rating

        predictions_list.append(value_round)

    prediction_dict = {'PREDICTION': predictions_list, 'TRUE': list(true)}
    prediction_compare_df = pd.DataFrame(prediction_dict)

    rating_accuracy = 0
    like_dislike_tp = 0  # "Positive" = Like
    like_dislike_tn = 0  # "Negative" = Dislike
    like_dislike_fp = 0
    like_dislike_fn = 0
    prediction_length = len(prediction_compare_df)

    # Making the accuracy definition more flexible by covering a larger range:
    rating_accuracy_flex = 0  # If the prediction was within +/- 0.5 of the actual
    like_dislike_tp_flex = 0  # If the prediction was 3.5+ (instead of 4+), then it is a like
    like_dislike_tn_flex = 0  # If the prediction was 3.0-, then it is a dislike
    like_dislike_fp_flex = 0
    like_dislike_fn_flex = 0

    progress_counter = 0

    for index, row in prediction_compare_df.iterrows():
        predict_like = 0
        true_like = 0

        if row.PREDICTION >= 4:
            predict_like = 1

        if row.TRUE >= 4:
            true_like = 1

        if row.PREDICTION == row.TRUE:  # This is if the exact predicted rating value is the same as the actual value
            rating_accuracy += 1

        if predict_like == true_like:
            if predict_like == 1:  # Don't need to consider true_like to also be 1 since it is assumed it is with the nested if condition
                like_dislike_tp += 1  # True Positive

            else:
                like_dislike_tn += 1  # True Negative

        else:
            if predict_like == 1:
                like_dislike_fp += 1  # False Positive

            else:
                like_dislike_fn += 1 # False Negative

        # FLEX starts:
        predict_like_flex = 0
        true_like_flex = 0

        if row.PREDICTION >= 3.5:
            predict_like_flex = 1

        if row.TRUE >= 3.5:
            true_like_flex = 1

        if row.PREDICTION >= (row.TRUE - flex_range) and row.PREDICTION <= (row.TRUE + flex_range):  
            rating_accuracy_flex += 1

        if predict_like_flex == true_like_flex:
            if predict_like_flex == 1:  
                like_dislike_tp_flex += 1 

            else:
                like_dislike_tn_flex += 1 

        else:
            if predict_like_flex == 1:
                like_dislike_fp_flex += 1 

            else:
                like_dislike_fn_flex += 1 

        progress_counter += 1
        if progress_counter % 100000 == 0:
            print(str(progress_counter / prediction_length * 100) + '%')

    rating_accuracy = rating_accuracy / prediction_length
    like_dislike_accuracy = (like_dislike_tp + like_dislike_tn) / prediction_length

    rating_accuracy_flex = rating_accuracy_flex / prediction_length
    like_dislike_accuracy_flex = (like_dislike_tp_flex + like_dislike_tn_flex) / prediction_length

    print('True Positive: {}, True Negative: {}, False Positive {}, False Negative {}'.format(like_dislike_tp, like_dislike_tn, like_dislike_fp, like_dislike_fn))
    print('Rating Accuracy: {}, Catagorical Accuracy (Like/Dislike) {}'.format(rating_accuracy, like_dislike_accuracy))
    print('------------------------------------------------------------------------------------------------------------')
    print('FLEX True Positive: {}, FLEX True Negative: {}, FLEX False Positive {}, FLEX False Negative {}'.format(like_dislike_tp_flex, like_dislike_tn_flex, like_dislike_fp_flex, like_dislike_fn_flex))
    print('FLEX Rating Accuracy: {}, FLEX Catagorical Accuracy (Like/Dislike) {}'.format(rating_accuracy_flex, like_dislike_accuracy_flex))
    return

In [102]:
model_predictions = (embedding_model.predict(x= [genres_like_test, genres_dislike_test, genres_movie_test]))
stats(model_predictions, ratings_scaled_test)

True Positive: 6539, True Negative: 29324, False Positive 3187, False Negative 25045
Rating Accuracy: 0.16804742959669242, Catagorical Accuracy (Like/Dislike) 0.5595288244012794
------------------------------------------------------------------------------------------------------------
FLEX True Positive: 36356, FLEX True Negative: 4370, FLEX False Positive 20113, FLEX False Negative 3256
FLEX Rating Accuracy: 0.573866916296123, FLEX Catagorical Accuracy (Like/Dislike) 0.6354005772681176


In [103]:
model_predictions_list = []
for prediction in model_predictions:
    model_predictions_list.append(prediction[0])
    
merged_predictions = pd.DataFrame({'genres_model': model_predictions_list, 
                                   'genres_true': list(np.array(list(ratings_scaled_test)))}, 
                                  index= list(range(len(ratings_scaled_test))))

# Using a linear regression for predictions adjustment:
X = merged_predictions.loc[:, ['genres_model']]
y = np.array(merged_predictions.loc[:, 'genres_true']) 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2)

line_reg = LinearRegression(n_jobs= -1).fit(X_train, y_train)
print('Linear Regression R2:', line_reg.score(X_test, y_test))
line_reg_predictions = line_reg.predict(X_test)

# Rounding the predictions that are out of bounds:
line_reg_predictions_rounded = []

for prediction in line_reg_predictions:
    rounded = prediction
    if rounded > 5:
        rounded = 5
    elif rounded < 0.5:
        rounded = 0.5
    
    line_reg_predictions_rounded.append(rounded)
        

stats(line_reg_predictions_rounded, y_test)

Linear Regression R2: 0.054802226201108195
True Positive: 1442, True Negative: 5772, False Positive 713, False Negative 4892
Rating Accuracy: 0.17138622357438177, Catagorical Accuracy (Like/Dislike) 0.5627584054918481
------------------------------------------------------------------------------------------------------------
FLEX True Positive: 7141, FLEX True Negative: 918, FLEX False Positive 4001, FLEX False Negative 759
FLEX Rating Accuracy: 0.5717294640767611, FLEX Catagorical Accuracy (Like/Dislike) 0.6286761837896871
