In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

import tensorflow.keras.backend as K

2024-07-07 21:37:07.226425: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-07 21:37:07.226976: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-07 21:37:07.228783: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-07 21:37:07.234703: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-07 21:37:07.246034: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registe

# Step 1: Data Collection and Preprocessing 

The first step is to build a proper dataset.
The raw dataset is composed of three dataframes:
- A list of movies with its associated genres as features encoded as a one hot vector
- The list of ratings users have given to films
- A description of the users.

For the movies, we also have the option to add the people that took part in the film as the director and encode this feature as a one hot vector. Unfortunately, This option takes a LOT of memory and the resulting one hot vector would have a dimension of approximaterly 30k. With all these problems, the developer decided it wasn't worth it, as the recommandation algorithm would solely focus on the directors, due to the dimension of the feature.

In [2]:
def clean_movie_title(movie_title: str) -> str:
    """
    This function transform the movie titles from the movielens format into a more standard format.
    The standard format is a lowercase format, where the date is stripped and the article is at its correct place.
    @param movie_title: the title in movielens format
    @returns: the title in standard format
    """
    if movie_title.split(" ")[-1].startswith("("):
        # remove year from the title, e.g. Toy Story (1995) --> Toy Story
        movie_title = (" ".join(movie_title.split(" ")[:-1])).strip()

    if movie_title.title().split(',')[-1].strip() in ['The', 'A']:
        # article + movie title, e.g. Saint, The --> The Saint
        movie_title = (movie_title.title().split(',')[-1].strip() + " " + " ".join(movie_title.title().split(',')[:-1])).strip()

    # otherwise, it was converting The Devil's Advocate to The Devil'S Advocate
    movie_title = movie_title.lower()
    return movie_title

def perf_clean_movie_genre(df: pd.DataFrame) -> pd.DataFrame:
    """
    This function transforms the encoding of genres from string to a one hot vector encoding 
    and adds it to the dataframe.
    @param df: the dataframe to process
    @returns: the processed dataframe
    """
    df['Genres'] = df['Genres'].str.split('|')
    
    df_exploded = df.explode('Genres')
    df_one_hot = pd.get_dummies(df_exploded['Genres'])
    df_one_hot_grouped = df_one_hot.groupby(df_exploded.index).sum()
    
    # Combine the one-hot encoded genres with the original dataframe (excluding the old 'genres' column)
    df_combined = pd.concat([df.drop(columns=['Genres']), df_one_hot_grouped], axis=1)

    return df_combined

def perf_clean_movie_directors(df: pd.DataFrame) -> pd.DataFrame:
    """
    This function transforms the encoding of directors from string to a one hot vector encoding 
    and adds it to the dataframe.
    @param df: The dataframe to process
    @returns the processed dataframe.
    """
    df['Directors'] = df['Directors'].str.split(',')
    
    df_exploded = df.explode('Directors')
    df_one_hot = pd.get_dummies(df_exploded['Directors'])
    df_one_hot_grouped = df_one_hot.groupby(df_exploded.index).sum()
    
    # Combine the one-hot encoded genres with the original dataframe (excluding the old 'genres' column)
    df_combined = pd.concat([df.drop(columns=['Directors']), df_one_hot_grouped], axis=1)

    return df_combined



def build_dataset(with_directors=True) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    This functions builds the three dataframes that constitute the raw dataset.
    @optional_param with_directors=True: Should the movie dataset have the directors as feature ?
    @returns: the ratings, movies, and users dataframes.
    """
    m_cols = ["MovieId", "Title", "Genres"]
    r_cols = ["UserId", "MovieId", "Rating", "Timestamp"]
    u_cols = ["UserId", "Gender", "Age", "Occupation", "Zip-code"]
    ml_ratings = pd.read_csv("./data/movielens/ratings.dat", sep="::", engine="python", encoding='latin-1', names=r_cols)
    ml_movies = pd.read_csv("./data/movielens/movies.dat", sep="::", engine="python", encoding='latin-1', names=m_cols)
    ml_users = pd.read_csv("./data/movielens/users.dat", sep="::", engine="python", encoding='latin-1', names=u_cols)

    ml_movies["Title"] = ml_movies["Title"].apply(lambda x: clean_movie_title(x))
    ml_movies = perf_clean_movie_genre(ml_movies)

    ml_ratings = ml_ratings.drop(columns=["Timestamp"])
    
    if not with_directors:
        return ml_ratings, ml_movies, ml_users
    
    imdb_names = pd.read_csv("./data/imdb/name.basics.tsv", sep="\t")
    imdb_titles = pd.read_csv("./data/imdb/title.basics.tsv", sep="\t")
    imdb_titles = imdb_titles[imdb_titles["titleType"] == "movie"]
    imdb_names_exploded = imdb_names.assign(knownForTitles=imdb_names['knownForTitles'].str.split(',')).explode('knownForTitles')
    imdb_titles = pd.merge(imdb_names_exploded, imdb_titles, left_on='knownForTitles', right_on='tconst')
    imdb_titles = imdb_titles[imdb_titles['primaryProfession'].str.contains("director")]

    # set it to lowercase because ml_movies["Title"] are also lowercase
    imdb_titles['primaryTitle_lower'] = imdb_titles['primaryTitle'].str.lower()
    directors_subset = imdb_titles[['primaryTitle_lower', 'primaryName']]
    
    merged_df = pd.merge(ml_movies, directors_subset, left_on='Title', right_on='primaryTitle_lower', how='left')
    
    # Drop the 'primaryTitle_lower' column as it is not needed anymore
    merged_df = merged_df.drop(columns=['primaryTitle_lower'])
    final_df = merged_df.groupby('MovieId').agg({
        **{col: 'first' for col in merged_df.columns if col != 'primaryName'},
        'primaryName': lambda x: ', '.join(x.dropna().unique())  # Concatenate director names
    })
    final_df = final_df.rename(columns={"primaryName": "Directors"})
    final_df = final_df.drop(columns=['MovieId']).reset_index()
    # clear memory 
    del merged_df, imdb_titles, imdb_names, ml_movies
    df_movies = perf_clean_movie_directors(final_df)
    

    return ml_ratings, df_movies, ml_users

In [3]:
df_ratings, df_movies, df_users = build_dataset(with_directors=False)

In [4]:
df_ratings

Unnamed: 0,UserId,MovieId,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [5]:
df_movies

Unnamed: 0,MovieId,Title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,toy story,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,jumanji,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,grumpier old men,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,waiting to exhale,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,father of the bride part ii,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,meet the parents,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,3949,requiem for a dream,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,3950,tigerland,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,3951,two family house,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [6]:
df_users

Unnamed: 0,UserId,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


# Step 2: Feature Engineering

From this raw dataset, we can build a more advanced and engineered dataset in order to create the features we need for our recommandation algorithm. The idea is to build a content based filtering algorithm through user features that we'd like to engineer and the film features that have been built during the first step.

Through our "per user, per film, ratings" dataset, we can compute the average rating of each user for each respective genre.
That way, we engineer $18$ unique features for each user.

Finally, we build a huge training dataset composed of three dataframes where each row corresponds to data associated to the rating $r$ of a film $m$ by a user $u$.

Finally, we normalize the user features with a standard scaler, and the ratings with a min max scaler. The item features are already normalized (i.e. data is already between $0$ and $1$).

In [7]:
def build_average_genre_ratings(df_movies, df_ratings):
    """
    This function produce a dataframe that contains the average rating per genre of all known users.
    Genres that have not been rated by users will have a 0 as a rating.
    @param df_movies: the raw movie dataframe
    @param df_ratings: the raw ratings dataframe
    @returns: the average ratings per genre dataframe
    """
    merged_df = pd.merge(df_ratings, df_movies, on='MovieId')
    genres = list(merged_df.columns.values)[4:]
    for genre in genres:
        merged_df[genre] = merged_df[genre] * merged_df['Rating']
    genre_ratings_sum = merged_df.groupby('UserId')[genres].sum()
    genre_count = merged_df.groupby('UserId')[genres].apply(lambda x: (x > 0).sum())
    average_genre_ratings = genre_ratings_sum / genre_count
    average_genre_ratings = average_genre_ratings.fillna(0)
    average_genre_ratings = average_genre_ratings.reset_index()
    return average_genre_ratings

def prepare_dataframe(
    df_users: pd.DataFrame,
    df_movies: pd.DataFrame,
    df_ratings: pd.DataFrame,
    nb_samples=10_000
    )-> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    Transform the two users and movies dataframe by merging them with the ratings dataframe into
    three different dataframes where each row correspond to data linked to a certain rating r:
    - a dataframe containing the engineered user features (i.e. the average ratings per genre dataframe)
      of the user that gave a rating.
    - a dataframe containing the item features of the film that has been rated. 
    - a dataframe containing the rating r that a user u has given to a film m.
    @param df_users: the raw user dataframe
    @param df_movies: the raw movies dataframe
    @param df_ratings: the raw ratings dataframe
    @returns: the three engineered dataframes.
    """
    truncated_df_ratings = df_ratings.sample(n=nb_samples)
    
    average_genre_ratings = build_average_genre_ratings(df_movies, truncated_df_ratings)
    user_features = truncated_df_ratings.merge(average_genre_ratings, on="UserId")

    #building item features
    item_features = truncated_df_ratings.merge(df_movies, on="MovieId")

    y = user_features[["UserId", "MovieId", "Rating"]]
    user_features = user_features.drop(columns=["Rating"])
    item_features = item_features.drop(columns=["Rating"])

    return user_features, item_features, y

In [8]:
user_features, item_features, y = prepare_dataframe(df_users, df_movies, df_ratings, nb_samples=1_000_000)
# Those offset will be used to tell our recommandation algorithm to not train on the UserIDs, MovieIDs or the movie title...
u_offset = 2
i_offset = 3
y_offset = 2

In [9]:
user_features

Unnamed: 0,UserId,MovieId,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,888,2085,3.689655,3.384615,4.133333,4.000000,3.580645,3.000000,0.000000,4.454545,2.666667,0.000000,2.666667,4.200000,0.000000,3.250000,3.866667,3.687500,3.750000,0.000000
1,1509,849,3.000000,3.000000,2.857143,2.592593,3.187500,3.095238,0.000000,3.020408,2.700000,4.333333,2.611111,2.600000,2.833333,3.100000,3.075269,2.819672,3.411765,2.000000
2,3693,333,3.381944,3.211268,3.538462,3.222222,3.350467,3.538462,4.000000,3.649038,3.526316,4.272727,3.195122,3.450000,3.451613,3.366667,3.329268,3.581197,3.777778,3.687500
3,3590,3055,4.024096,4.159091,4.447368,4.145833,4.059701,4.416667,4.000000,4.239264,3.800000,4.625000,4.147059,4.380952,4.434783,4.078125,4.113636,4.195402,4.562500,4.333333
4,5098,3088,3.083333,2.952381,4.000000,3.250000,3.226415,3.555556,0.000000,3.107692,3.142857,3.500000,2.772727,3.150000,3.666667,3.444444,3.000000,3.228571,3.368421,2.900000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,3900,497,3.680000,3.700000,3.857143,3.742857,3.469466,4.038462,1.500000,4.105882,3.923077,4.000000,3.250000,3.692308,3.555556,3.737500,3.297297,3.712121,3.823529,3.666667
999996,5077,2915,3.403846,3.705882,3.500000,4.000000,3.310078,3.800000,4.222222,3.548913,3.250000,4.294118,3.347826,4.333333,3.724138,3.549020,3.526316,3.744681,3.684211,4.250000
999997,2386,1009,4.000000,3.705882,3.580645,3.631579,3.818182,3.500000,0.000000,4.000000,3.545455,0.000000,0.000000,3.608696,0.000000,4.000000,3.666667,4.166667,4.000000,0.000000
999998,3095,2724,3.857143,3.833333,4.933333,4.733333,3.925532,3.555556,0.000000,3.833333,3.600000,4.000000,3.750000,5.000000,4.333333,3.872549,4.000000,3.812500,3.500000,3.000000


In [10]:
item_features

Unnamed: 0,UserId,MovieId,Title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,888,2085,101 dalmatians,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1509,849,escape from l.a.,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
2,3693,333,tommy boy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3590,3055,felicia's journey,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,5098,3088,harvey,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,3900,497,much ado about nothing,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
999996,5077,2915,risky business,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
999997,2386,1009,escape to witch mountain,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
999998,3095,2724,runaway bride,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [11]:
y

Unnamed: 0,UserId,MovieId,Rating
0,888,2085,4
1,1509,849,3
2,3693,333,2
3,3590,3055,3
4,5098,3088,3
...,...,...,...
999995,3900,497,4
999996,5077,2915,4
999997,2386,1009,4
999998,3095,2724,4


In [12]:
# Data normalization through the definition of two Scaler objects that will be reused during Step 4

scalerUser = StandardScaler()
scalerUser.fit(user_features.loc[:, "Action":])
user_features.loc[:, "Action":] = scalerUser.transform(user_features.loc[:, "Action":])

scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y["Rating"].to_numpy().reshape(-1, 1))
y["Rating"] = scalerTarget.transform(y["Rating"].to_numpy().reshape(-1, 1))

user_train, user_test = train_test_split(
    user_features, train_size=0.80, shuffle=True, random_state=1
)

item_train, item_test = train_test_split(
    item_features, train_size=0.80, shuffle=True, random_state=1
)

y_train, y_test = train_test_split(
    y, train_size=0.80, shuffle=True, random_state=1
)

In [13]:
item_train.loc[:, "Action":]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
771718,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
521462,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
137361,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
404985,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
910092,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491263,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
791624,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
470924,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
491755,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1


In [14]:
user_train.loc[:, "Action":]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
771718,-0.596213,-0.483007,-0.067396,-0.025749,-0.615466,-0.764311,0.875330,-0.864061,-0.839814,0.466072,-0.433722,0.030419,0.430379,-0.104374,-0.815188,0.202992,-0.102554,-0.270548
521462,-1.488476,-1.699361,-1.096014,-1.366798,-1.027074,-2.362472,-1.155574,-1.760719,-1.200679,-0.314368,-0.433722,-1.287852,-1.251525,-0.602057,-1.177745,-0.173748,-0.460742,0.376503
137361,1.529230,1.206743,0.500646,0.614405,1.365254,1.310852,0.621467,1.097885,0.998877,0.880681,0.959471,0.830798,1.122928,1.261120,1.487496,1.379753,0.788581,0.325663
404985,0.086222,0.207989,0.364930,0.071410,0.123534,0.457497,0.621467,0.299068,0.242780,0.482331,-1.027344,0.346698,-0.198025,-0.136532,-0.119616,-0.409283,0.395648,0.058755
910092,0.494200,0.487410,0.700229,0.289011,0.772802,-0.907004,-1.155574,1.118979,-0.045912,0.709959,-0.155084,0.572611,0.550515,1.494816,0.881928,0.583376,0.550614,-2.419682
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491263,0.021743,-0.327496,0.383748,0.190117,-0.436443,0.220984,0.494536,-0.156769,0.065513,0.466072,0.027903,0.181028,0.152169,-0.432144,0.214875,-0.055488,0.073029,0.561375
791624,0.241604,0.689889,1.008917,1.256126,1.680445,0.710183,1.179966,0.701641,1.325373,0.709959,0.773712,1.054560,1.186529,1.547703,0.654304,0.483341,0.764555,-0.131894
470924,-0.653665,-0.460191,0.415111,0.962768,-0.705267,0.548464,-1.155574,0.087488,0.723932,0.368517,-0.951194,0.120785,-0.067327,0.233345,-0.880420,-0.601267,-0.745187,-0.640291
491755,-1.472640,-0.869026,-0.630321,-1.395381,-1.695925,-0.227786,0.367604,-0.948093,-0.375845,0.368517,-0.011270,-0.443999,-0.327980,-1.665856,-0.609329,-1.290005,-0.056200,-0.339875


In [15]:
y_train.loc[:, "Rating":]

Unnamed: 0,Rating
771718,1.0
521462,0.5
137361,0.5
404985,0.0
910092,0.0
...,...
491263,0.5
791624,1.0
470924,-0.5
491755,-1.0


# Step 3 and 5: Model Development and Evaluation

The dataset is now ready to be used to train a recommandation algorithm.
As said already, The idea is to build a content based filtering algorithm. The technology used here is a Neural Net encoder. This choice is motivated by:
- The huge quantity of data and the high variance in the given data, although it has been preprocessed.
- The potential good performance of the neural net.

This choice has nonetheless some flaws:
- It is a black box system, meaning it is highly difficult to explain the choices made by the recommandation algorithm
- There is no formal theory on how to build the most performant neural net, and choices are often made through try and retry.

We use the same architecture as course number 8, mainly there is:
- One encoder for the user features that embeds them into a latent space of dimension 32
- One encoder for the item features that embeds them into a latent space of dimension 32
- Those two 32 dimensional latent vector are then processed into one scalar through a dot product, thus producing our predicted rating.

The training is done with a AdaDelta optimizer and a learning rate of 0.1, this has been chosen through empirical testing (try and retry) and give us the best results.

The evaluation of the model performance is done with the mean squared error loss, as we are doing a regression task. 

Convergence of the model stops at nb_epochs=10 approximately.

In [16]:
def build_model(nb_user_features, nb_item_features, output_shape=32):
    """
    This functions builds the model by connecting two feature encoder through a dot product.
    @param nb_user_features: the input size of the user features encoder
    @param nb_item_features: the input size of the item features encoder
    @optional_param output_shape=32: the latent space dimension of both encoders. 
    @returns: the model
    """
    tf.random.set_seed(1)
    user_NN = tf.keras.models.Sequential(
        [
            ### START CODE HERE ###
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(output_shape),
            tf.keras.layers.Lambda(lambda x: K.l2_normalize(x,axis=1), output_shape=(output_shape,1)),
            ### END CODE HERE ###
        ]
    )
    
    item_NN = tf.keras.models.Sequential(
        [
            ### START CODE HERE ###
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(output_shape),
            tf.keras.layers.Lambda(lambda x: K.l2_normalize(x,axis=1), output_shape=(output_shape,1)),
            ### END CODE HERE ###
        ]
    )
    
    # create the user input and point to the base network
    input_user = tf.keras.layers.Input(shape=(nb_user_features,))
    vu = user_NN(input_user)
    
    # create the item input and point to the base network
    input_item = tf.keras.layers.Input(shape=(nb_item_features,))
    vm = item_NN(input_item)
    
    # compute the dot product of the two vectors vu and vm
    output = tf.keras.layers.Dot(axes=1)([vu, vm])
    
    # specify the inputs and output of the model
    model = tf.keras.Model([input_user, input_item], output)
    
    model.summary()
    return model

def train_model(
    model,
    nb_epochs,
    user_train,
    item_train,
    y_train,
    user_test,
    item_test,
    y_test
    ):
    """
    Use this function to train the built model with the engineered dataset.
    @param model: the model to train
    @param nb_epochs: the number of epochs to train
    @param user_train: the user features for training
    @param item_train: the item features for training
    @param y_train: the ratings associated to each user and film for training
    @param user_test: the user features for validation
    @param item_test: the item features for validation
    @param y_test: the ratings associated to each user and film for validation
    """
    cost_fn = tf.keras.losses.MeanSquaredError()
    opt = keras.optimizers.Adadelta(learning_rate=0.1)
    model.compile(optimizer=opt, loss=cost_fn)
    
    model.fit(
        [user_train.loc[:, "Action":].to_numpy(), item_train.loc[:, "Action":].to_numpy()],
        y_train.loc[:, "Rating":].to_numpy(),
        epochs=nb_epochs,
        validation_data=([user_test.loc[:, "Action":].to_numpy(), item_test.loc[:, "Action":].to_numpy()], y_test.loc[:, "Rating":].to_numpy()),
    )

def use_model(
    model,
    user_features,
    item_features
    ):
    """
    Use this function to predict the rating of a users on films through the model.
    @param model: the model to use for prediction
    @param user_features: the user features to use
    @param item_features: the item features to use
    @returns: the rating prediction scaled between -1 and 1
    """
    if len(user_features.shape) == 2:
        y_p = model.predict([user_features.loc[:, "Action":].to_numpy(), item_features.loc[:, "Action":].to_numpy()])
    else:
        user_features = pd.DataFrame([user_features]* item_features.shape[0])
        y_p = model.predict([user_features.loc[:, "Action":].to_numpy(), item_features.loc[:, "Action":].to_numpy()])
    return y_p

def build_result_comparison(scaler, y_df, y_pred):
    """
    This function rescales the rating predictions and the true rating between 0 and 5
    and provides a dataframe in order to compare the prediction and verita tera.
    @param scaler: the target scaler used for ratings
    @param y_df: the dataframe containing the true ratings
    @param y_pred: a numpy array containing the predicted ratings.
    @returns: the dataframe containing the true ratings and the predicted ratings for a certain user for a certain movie.
    """
    y_pu = scaler.inverse_transform(y_pred)
    y_true_u = scaler.inverse_transform(y_df["Rating"].to_numpy().reshape(-1, 1))
    result = y_df.copy()
    result["Rating"] = y_true_u.flatten()
    result["Prediction"] = y_pu.flatten()
    return result

def build_matrix(df_rating_prediction: pd.DataFrame, value_column: str):
    """
    This function applies a pivot on a given prediction / true rating comparison dataframe.
    @param df_rating_prediction: the prediction / true rating comparison dataframe
    @param value_column: the column name on which to apply the pivot.
    @returns: the matrix to build.
    """
    return df_rating_prediction.pivot(index="UserId", columns="MovieId", values=value_column)

In [17]:
model = build_model(user_train.shape[1] - u_offset, item_train.shape[1] - i_offset)

2024-07-07 21:37:13.164447: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:134] retrieving CUDA diagnostic information for host: HPC
2024-07-07 21:37:13.164462: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:141] hostname: HPC
2024-07-07 21:37:13.164509: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:165] libcuda reported version is: NOT_FOUND: was unable to find libcuda.so DSO loaded into this program
2024-07-07 21:37:13.164532: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:169] kernel reported version is: 550.78.0


In [18]:
train_model(model, 10, user_train, item_train, y_train, user_test, item_test, y_test)

Epoch 1/10
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 666us/step - loss: 0.2484 - val_loss: 0.2369
Epoch 2/10
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 659us/step - loss: 0.2346 - val_loss: 0.2339
Epoch 3/10
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 658us/step - loss: 0.2322 - val_loss: 0.2329
Epoch 4/10
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 667us/step - loss: 0.2309 - val_loss: 0.2319
Epoch 5/10
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 659us/step - loss: 0.2298 - val_loss: 0.2313
Epoch 6/10
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 657us/step - loss: 0.2290 - val_loss: 0.2309
Epoch 7/10
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 659us/step - loss: 0.2283 - val_loss: 0.2307
Epoch 8/10
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 658us/step - loss: 0.2279 - v

Here we can visualize under different formats the resulting prediction of rating on known rating data from users.
Notice that the matrix visualisation is mostly empty, as we only have $200\,000$ ratings for the test dataset, although the total number of ratings is $3952 \times 6040 = 23\,870\,080$

In [19]:
y_pred = use_model(model, user_test, item_test)
df_rating_pred = build_result_comparison(scalerTarget, y_test, y_pred)
df_rating_pred

[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 484us/step


Unnamed: 0,UserId,MovieId,Rating,Prediction
276826,5053,3700,4.0,3.990972
849425,4150,1753,2.0,3.166739
504499,1779,316,3.0,3.323310
601054,4763,223,5.0,3.908052
980221,2736,589,3.0,3.495172
...,...,...,...,...
555867,3191,2006,4.0,3.286885
30004,346,3176,4.0,4.359455
124730,1260,1625,5.0,4.028029
195783,3953,610,3.0,3.128305


In [20]:
matrix_pred = build_matrix(df_rating_pred, "Prediction")
matrix_pred

MovieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,,,,,,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


In [21]:
matrix_true = build_matrix(df_rating_pred, "Rating")
matrix_true

MovieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,,,,,,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


# Step 4 Recommandation Algorithm

Now we have a model that has been trained and evaluated. 
The next task is now to build a list of predicted ratings for all films based on a set of users.
As a demonstration, we perform the building of this *ranking* dataframe for all users: this means we predict the potential rating of all users for all films.

In [22]:
def build_movie_combination(user_features, item_features):
    """
    Let n be the number of users and m be the number of films
    This function duplicate two dataframes such that:
    - The first user dataframe contains rows that are duplicated m times and consecutive
    - The second movie dataframe is duplicated n times.
    @param user_features: the user features to duplicate
    @param item_features: the movie features to duplicate
    @returns: the duplicated dataframes.
    """
    n, m = user_features.shape[0], item_features.shape[0]
    item_features = pd.concat([item_features] * n)
    user_features = pd.concat([user_features] * m).sort_values(by='UserId')

    return user_features, item_features

def build_total_dataset(scaler_user_features, df_movies, df_ratings):
    """
    This function build two dataframes representing all the user / movie combinations into two separate features dataframes.
    @param scaler_user_features: The scaler to use for user features
    @param df_movies: the raw movie dataset
    @param df_ratings: the raw ratings dataset
    @returns: the two user and item features dataframes.
    """
    average_genre_ratings = build_average_genre_ratings(df_movies, df_ratings)
    average_genre_ratings.loc[:, "Action":] = scaler_user_features.transform(average_genre_ratings.loc[:, "Action":])
    # Iterate over each user and each movie
    user_features_total, item_features_total = build_movie_combination(average_genre_ratings, df_movies)
    print(user_features_total.shape)
    print(item_features_total.shape)
    return user_features_total, item_features_total

def build_ranking(model, user_features, item_features):
    """
    Builds the predicted ranking based on a set of user features and movie features.
    @param model: the model to use to predict the rating per user per movie
    @param user_features: the user features to use for prediction
    @param item_features: the movie features to use for prediction
    @returns: the dataframe containing the rank for each movie.
    """
    score = use_model(model, user_features, item_features)
    print(score.shape)
    ranking = pd.DataFrame({'UserId': user_features['UserId'].values, 'MovieId': item_features['MovieId'].values, 'Score': score.flatten()})

    # Sort the rankings by UserId and Score (descending)
    ranking.sort_values(by=['UserId', 'Score'], ascending=[True, False], inplace=True)

    return ranking



In [24]:
user_features, item_features = build_total_dataset(scalerUser, df_movies.sample(n=1_000), df_ratings)

(6040000, 19)
(6040000, 20)


In [25]:
df_ranking = build_ranking(model, user_features, item_features)
df_ranking["Score"] = scalerTarget.inverse_transform(df_ranking["Score"].to_numpy().reshape(-1,1)).flatten()

[1m188750/188750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 488us/step
(6040000, 1)


In [26]:
df_ranking

Unnamed: 0,UserId,MovieId,Score
119,1,2248,4.259354
286,1,2626,4.259354
419,1,3261,4.259354
439,1,2065,4.259354
681,1,2675,4.259354
...,...,...,...
6039305,6040,2817,1.895603
6039429,6040,2815,1.895603
6039532,6040,2816,1.895603
6039729,6040,2402,1.895603


In [27]:
pred_matrix = build_matrix(df_ranking, "Score")

In [28]:
pred_matrix

MovieId,3,4,5,6,10,15,16,25,26,31,...,3908,3909,3914,3918,3926,3932,3941,3942,3943,3950
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.888718,4.161451,3.665744,2.548746,2.432085,2.908931,3.426098,4.073470,4.151712,4.151712,...,2.213942,3.888718,4.151712,2.213942,2.932570,2.154317,2.213942,2.213942,3.665744,4.151712
2,3.783127,4.085642,3.687731,2.922987,3.108650,3.262770,3.593608,3.820638,3.955624,3.955624,...,1.781421,3.783127,3.955624,1.781421,2.967306,1.858865,1.781421,1.781421,3.687731,3.955624
3,4.039932,4.210083,3.847205,3.170720,3.222610,3.608079,3.739215,4.136844,4.179150,4.179150,...,3.188869,4.039932,4.179150,3.188869,2.511202,2.709830,3.188869,3.188869,3.847205,4.179150
4,2.380970,3.380052,2.353376,3.291991,3.543450,2.830528,3.970418,3.230552,3.773340,3.773340,...,2.624833,2.380970,3.773340,2.624833,2.734985,2.624288,2.624833,2.624833,2.353376,3.773340
5,3.607537,3.388695,3.844968,2.743842,2.387386,2.937497,2.630787,3.065598,3.100065,3.100065,...,2.979842,3.607537,3.100065,2.979842,3.231830,2.900438,2.979842,2.979842,3.844968,3.100065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,3.386654,3.303588,3.017362,3.335898,2.855095,2.989473,3.267435,3.476487,3.477265,3.477265,...,2.708318,3.386654,3.477265,2.708318,3.080509,2.747773,2.708318,2.708318,3.017362,3.477265
6037,3.845151,4.186108,3.789155,3.230346,2.959249,3.143759,3.939436,4.013860,4.241399,4.241399,...,3.835738,3.845151,4.241399,3.835738,3.550686,3.708874,3.835738,3.835738,3.789155,4.241399
6038,3.899988,4.038621,3.537409,2.363217,2.305897,2.825176,3.266492,4.100365,4.005147,4.005147,...,2.431047,3.899988,4.005147,2.431047,2.662978,2.222178,2.431047,2.431047,3.537409,4.005147
6039,3.941175,4.122696,3.738987,3.063546,2.827995,2.994782,4.012868,4.124186,4.160441,4.160441,...,3.612756,3.941175,4.160441,3.612756,3.927257,3.724045,3.612756,3.612756,3.738987,4.160441


# Step 6: Let's make a demonstration !!

All raw systems are ready and needs now to be combined into a final object: The Recommander.
The Recommander takes as input the genre preferences of two users and returns the average of the movie ranking of both user.
We take the average as it computes the distance middle of both evaluation, thus providing a high score when the two users when the two have a high predicted score for a certain film (e.g. a 5 and 5), and a low score when the two users do not produce a score in agreement with each other (e.g. a 1 and a 5)

For this reason, the recommanding system accurately provide a movie that both user can like.

In [29]:
nb_user = 0

class UserPreferences:
    def __init__(
        self,
        action=0,
        adventure=0,
        animation=0,
        childrens=0,
        comedy=0,
        crime=0,
        documentary=0,
        drama=0,
        fantasy=0,
        film_noir=0,
        horror=0,
        musical=0,
        mystery=0,
        romance=0,
        sci_fi=0,
        thriller=0,
        war=0,
        western=0,
    ):
        global nb_user
        self.preferences = {
            "UserId": nb_user,
            "Action": action,
            "Adventure": adventure,
            "Animation": animation,
            "Children's": childrens,
            "Comedy": comedy,
            "Crime": crime,
            "Documentary": documentary,
            "Drama": drama,
            "Fantasy": fantasy,
            "Film-Noir": film_noir,
            "Horror": horror,
            "Musical": musical,
            "Mystery": mystery,
            "Romance": romance,
            "Sci-Fi": sci_fi,
            "Thriller": thriller,
            "War": war,
            "Western": western
        }
        nb_user += 1

    def to_df(self):
        return pd.DataFrame(self.preferences, index=[0])

In [33]:
class Recommander:
    def __init__(self, model, scaler_target, scaler_user, df_movies):
        self.model = model
        self.scaler_target = scaler_target
        self.scaler_user = scaler_user
        self.df_movies = df_movies

    def __agg_title(self, df_ranking):
        df_movies = self.df_movies[["MovieId", "Title"]]
        result = df_ranking.merge(df_movies, on="MovieId")
        return result

    def recommand_movie(self, user1: UserPreferences, user2: UserPreferences):
        """
        Returns a movie tier-list based on user preferences,
        by building the ranking for the two user and compute the average.
        @param user1: the preferences of the first user
        @param user2: the preferences of the second user
        """
        user_features = pd.concat([user1.to_df(), user2.to_df()], axis=0)
        user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
        user_features, item_features = build_movie_combination(user_features, self.df_movies)
        df_ranking = build_ranking(model, user_features, item_features)
        df_ranking["Score"] = self.scaler_target.inverse_transform(df_ranking["Score"].to_numpy().reshape(-1,1)).flatten()
        df_ranking = self.__agg_title(df_ranking)
        df_avg_ranking = df_ranking.groupby(['MovieId', 'Title'], as_index=False)['Score'].mean().sort_values(by=['Score'], ascending=[False])
        return df_avg_ranking

In [40]:
user1 = UserPreferences(sci_fi=5, horror=5)
user2 = UserPreferences(sci_fi=3, comedy=5, horror=5)

user1.to_df()

Unnamed: 0,UserId,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,6,0,0,0,0,0,0,0,0,0,0,5,0,0,0,5,0,0,0


In [41]:
reco = Recommander(model, scalerTarget, scalerUser, df_movies)

df_ranking = reco.recommand_movie(user1, user2)

[1m  1/243[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 12ms/step

  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self

[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 508us/step
(7766, 1)


In [42]:
# TADAAA
df_ranking.head(10)

Unnamed: 0,MovieId,Title,Score
2588,2657,the rocky horror picture show,3.824878
2677,2746,little shop of horrors,3.761892
3627,3696,night of the creeps,3.73737
3760,3830,psycho beach party,3.669331
2414,2483,"day of the beast, the (el día de la bestia)",3.669331
3466,3535,american psycho,3.669331
3417,3486,devil girl from mars,3.599467
3711,3780,rocketship x-m,3.599467
3710,3779,project moon base,3.599467
3285,3354,mission to mars,3.599467
