In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

import tensorflow.keras.backend as K

2024-07-07 19:02:08.757618: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-07 19:02:08.758181: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-07 19:02:08.760020: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-07 19:02:08.767503: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-07 19:02:08.778355: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registe

# Steps 1 and 2: Data Collection, Preprocessing and Feature Engineering

In [2]:
def clean_movie_title(movie_title: str) -> str:
    """Normalise a MovieLens title into a lowercase, natural-order title.

    Steps:
      1. Drop the trailing parenthesised token (the release year), e.g.
         ``"Toy Story (1995)"`` -> ``"Toy Story"``.
      2. Move a trailing article back to the front, e.g.
         ``"Saint, The"`` -> ``"The Saint"``. Only the text after the *last*
         comma is inspected, so commas inside the title are preserved
         (``"Good, The Bad and The Ugly, The"`` keeps its inner comma).
         The articles "The", "A" and "An" are recognised, case-insensitively.
      3. Lowercase the result so it can be joined against other lowercase
         title columns.

    Args:
        movie_title: Raw title string.

    Returns:
        The cleaned, lowercase title.
    """
    tokens = movie_title.split(" ")
    if tokens[-1].startswith("("):
        # remove year from the title, e.g. Toy Story (1995) --> Toy Story
        movie_title = " ".join(tokens[:-1]).strip()

    # rpartition splits on the LAST comma only; `comma` is "" when there is none.
    head, comma, tail = movie_title.rpartition(",")
    article = tail.strip()
    if comma and article.lower() in ("the", "a", "an"):
        # article + movie title, e.g. Saint, The --> The Saint
        movie_title = f"{article} {head.strip()}".strip()

    # Lowercase directly: str.title() would turn "Devil's" into "Devil'S".
    return movie_title.lower()

def perf_clean_movie_genre(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the pipe-separated ``Genres`` column.

    The input frame is left untouched; a new frame is returned in which
    ``Genres`` is replaced by one integer column per distinct genre
    (columns in sorted order, as produced by ``pd.get_dummies``).

    Args:
        df: Frame with a string column ``Genres`` such as ``"Action|Comedy"``.

    Returns:
        ``df`` without ``Genres`` plus one count column per genre.
    """
    # Split into lists on a separate Series so the caller's column is not overwritten.
    genres_per_row = df['Genres'].str.split('|').explode()

    # One row per (movie, genre) -> collapse back to one row per original index label.
    genre_flags = pd.get_dummies(genres_per_row).groupby(level=0).sum()

    # Combine the one-hot encoded genres with the original dataframe (excluding the old 'Genres' column)
    return pd.concat([df.drop(columns=['Genres']), genre_flags], axis=1)

def perf_clean_movie_directors(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the comma-separated ``Directors`` column.

    The input frame is left untouched. Names are stripped of surrounding
    whitespace after splitting, so ``"Ann Lee, Bob Ray"`` yields the columns
    ``"Ann Lee"`` and ``"Bob Ray"`` rather than ``"Ann Lee"`` and ``" Bob Ray"``.
    Rows whose ``Directors`` value is empty get zeros everywhere instead of a
    dedicated column named ``""``.

    Args:
        df: Frame with a string column ``Directors``.

    Returns:
        ``df`` without ``Directors`` plus one count column per director name.
    """
    # Work on a separate Series so the caller's column is not overwritten.
    names_per_row = df['Directors'].str.split(',').explode().str.strip()

    name_flags = pd.get_dummies(names_per_row)
    if '' in name_flags.columns:
        # An empty name only means "no director listed"; it is not a feature.
        name_flags = name_flags.drop(columns=[''])
    name_flags = name_flags.groupby(level=0).sum()

    # Combine the one-hot encoded directors with the original dataframe (excluding the old 'Directors' column)
    return pd.concat([df.drop(columns=['Directors']), name_flags], axis=1)



def build_dataset(with_directors=True, data_dir="./data") -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """Load the MovieLens tables and optionally enrich movies with IMDB directors.

    Args:
        with_directors: When False, return right after the MovieLens files are
            prepared; no IMDB file is read.
        data_dir: Root folder containing ``movielens/`` (``ratings.dat``,
            ``movies.dat``, ``users.dat``) and ``imdb/`` (``name.basics.tsv``,
            ``title.basics.tsv``).

    Returns:
        ``(ratings, movies, users)``:
          * ratings: ``UserId``, ``MovieId``, ``Rating`` (``Timestamp`` dropped);
          * movies: ``MovieId``, cleaned ``Title``, one-hot genre columns and,
            when ``with_directors`` is True, one-hot director columns;
          * users: the raw users table.

    Notes:
        Directors are matched on lowercase title only, and a "director" is any
        person whose ``primaryProfession`` contains "director" and who lists
        the title in ``knownForTitles``; both are approximations.
    """
    m_cols = ["MovieId", "Title", "Genres"]
    r_cols = ["UserId", "MovieId", "Rating", "Timestamp"]
    u_cols = ["UserId", "Gender", "Age", "Occupation", "Zip-code"]
    ml_ratings = pd.read_csv(f"{data_dir}/movielens/ratings.dat", sep="::", engine="python", encoding='latin-1', names=r_cols)
    ml_movies = pd.read_csv(f"{data_dir}/movielens/movies.dat", sep="::", engine="python", encoding='latin-1', names=m_cols)
    ml_users = pd.read_csv(f"{data_dir}/movielens/users.dat", sep="::", engine="python", encoding='latin-1', names=u_cols)

    ml_movies["Title"] = ml_movies["Title"].apply(clean_movie_title)
    ml_movies = perf_clean_movie_genre(ml_movies)

    ml_ratings = ml_ratings.drop(columns=["Timestamp"])

    if not with_directors:
        return ml_ratings, ml_movies, ml_users

    # Only the columns used below are loaded; the full tables are large.
    imdb_names = pd.read_csv(
        f"{data_dir}/imdb/name.basics.tsv",
        sep="\t",
        usecols=["primaryName", "primaryProfession", "knownForTitles"],
    )
    imdb_titles = pd.read_csv(
        f"{data_dir}/imdb/title.basics.tsv",
        sep="\t",
        usecols=["tconst", "titleType", "primaryTitle"],
    )
    imdb_titles = imdb_titles[imdb_titles["titleType"] == "movie"]

    # na=False: a missing profession must count as "not a director"; otherwise
    # the mask contains NaN and boolean indexing fails. Filtering before the
    # explode/merge also keeps the intermediate frames small.
    imdb_names = imdb_names[imdb_names["primaryProfession"].str.contains("director", na=False)]
    imdb_names_exploded = imdb_names.assign(
        knownForTitles=imdb_names['knownForTitles'].str.split(',')
    ).explode('knownForTitles')
    imdb_titles = pd.merge(imdb_names_exploded, imdb_titles, left_on='knownForTitles', right_on='tconst')

    # set it to lowercase because ml_movies["Title"] are also lowercase
    directors_subset = imdb_titles.assign(
        primaryTitle_lower=imdb_titles['primaryTitle'].str.lower()
    )[['primaryTitle_lower', 'primaryName']]

    merged_df = pd.merge(ml_movies, directors_subset, left_on='Title', right_on='primaryTitle_lower', how='left')

    # Drop the 'primaryTitle_lower' column as it is not needed anymore
    merged_df = merged_df.drop(columns=['primaryTitle_lower'])

    # One row per movie: keep the first value of every movie column and
    # concatenate the distinct director names. The group key itself is not
    # aggregated; reset_index() brings it back as the first column.
    aggregations = {
        col: 'first' for col in merged_df.columns if col not in ('MovieId', 'primaryName')
    }
    aggregations['primaryName'] = lambda x: ', '.join(x.dropna().unique())  # Concatenate director names
    final_df = (
        merged_df.groupby('MovieId')
        .agg(aggregations)
        .rename(columns={"primaryName": "Directors"})
        .reset_index()
    )
    # clear memory
    del merged_df, imdb_titles, imdb_names, imdb_names_exploded, directors_subset, ml_movies
    df_movies = perf_clean_movie_directors(final_df)

    return ml_ratings, df_movies, ml_users

In [3]:
# with_directors=False: build_dataset returns before any IMDB file is read,
# so df_movies only carries MovieId, Title and the one-hot genre columns.
df_ratings, df_movies, df_users = build_dataset(with_directors=False)

In [4]:
df_ratings

Unnamed: 0,UserId,MovieId,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [5]:
df_movies

Unnamed: 0,MovieId,Title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,toy story,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,jumanji,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,grumpier old men,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,waiting to exhale,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,father of the bride part ii,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,meet the parents,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,3949,requiem for a dream,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,3950,tigerland,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,3951,two family house,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [6]:
df_users

Unnamed: 0,UserId,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


# Steps 3 and 5: Model Development and Evaluation

In [7]:
def build_average_genre_ratings(df_movies, df_ratings):
    """Compute each user's mean rating per genre.

    Every column of ``df_movies`` other than ``MovieId`` and ``Title`` is
    treated as a genre indicator. For each user and genre the result is the
    sum of the ratings given to movies of that genre divided by the number of
    such ratings; genres the user never rated are reported as 0.

    Args:
        df_movies: One row per movie with ``MovieId``, ``Title`` and indicator
            columns.
        df_ratings: Ratings with at least ``UserId``, ``MovieId`` and ``Rating``.

    Returns:
        DataFrame with ``UserId`` followed by one float column per genre,
        one row per user, ordered by ``UserId``.
    """
    merged_df = pd.merge(df_ratings, df_movies, on='MovieId')

    # Select genre columns by name rather than by position, so extra columns
    # in df_ratings (e.g. a timestamp) cannot shift the selection.
    genres = [col for col in df_movies.columns if col not in ('MovieId', 'Title')]

    # indicator * rating: the rating where the movie has the genre, else 0.
    weighted = merged_df[genres].mul(merged_df['Rating'], axis=0)
    user_ids = merged_df['UserId']

    genre_ratings_sum = weighted.groupby(user_ids).sum()
    # Number of ratings that actually contributed to each (user, genre) cell.
    genre_count = (weighted > 0).groupby(user_ids).sum()

    # 0 / 0 yields NaN for genres a user never rated; report those as 0.
    average_genre_ratings = (genre_ratings_sum / genre_count).fillna(0)
    return average_genre_ratings.reset_index()

def prepare_dataframe(
    df_users: pd.DataFrame,
    df_movies: pd.DataFrame,
    df_ratings: pd.DataFrame,
    nb_samples=10_000,
    random_state=None
    )-> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """Sample ratings and build row-aligned user / item feature frames.

    Args:
        df_users: Users table. Currently unused; kept for interface stability.
        df_movies: Movies with ``MovieId``, ``Title`` and genre indicators.
        df_ratings: Ratings with ``UserId``, ``MovieId`` and ``Rating``.
        nb_samples: Number of ratings to draw. Capped at ``len(df_ratings)``.
        random_state: Forwarded to ``DataFrame.sample``; pass a value for a
            reproducible sample. The default keeps the draw unseeded.

    Returns:
        ``(user_features, item_features, y)``. All three come from merges
        whose left side is the same sampled ratings frame:
          * user_features: ``UserId``, ``MovieId`` + the user's average
            rating per genre;
          * item_features: ``UserId``, ``MovieId``, ``Title`` + genre
            indicators of the rated movie;
          * y: independent copy holding ``UserId``, ``MovieId``, ``Rating``.

    Note:
        The per-user averages are computed from the same sampled ratings that
        become the targets.
    """
    # Never ask for more rows than exist: sample() without replacement would raise.
    sample_size = min(nb_samples, len(df_ratings))
    truncated_df_ratings = df_ratings.sample(n=sample_size, random_state=random_state)

    average_genre_ratings = build_average_genre_ratings(df_movies, truncated_df_ratings)
    user_features = truncated_df_ratings.merge(average_genre_ratings, on="UserId")

    #building item features
    item_features = truncated_df_ratings.merge(df_movies, on="MovieId")

    # .copy(): callers assign into y; without it pandas treats y as a slice
    # of user_features and emits SettingWithCopyWarning.
    y = user_features[["UserId", "MovieId", "Rating"]].copy()
    user_features = user_features.drop(columns=["Rating"])
    item_features = item_features.drop(columns=["Rating"])

    return user_features, item_features, y

def split_df_into_user_item_rating(df: pd.DataFrame):
    """Split a flat frame into float64 user, item and rating arrays.

    ``Gender`` and ``Age`` form the user array, ``Rating`` the target, and
    every remaining column the item array. The target's dtype and shape are
    printed.

    Returns:
        ``(user, item, y)`` as 2-D ``np.float64`` arrays.
    """
    user_columns = ["Gender", "Age"]
    features = df.drop(columns=["Rating"])

    def _as_float_array(frame):
        return frame.to_numpy().astype(np.float64)

    user = _as_float_array(features[user_columns])
    item = _as_float_array(features.drop(columns=user_columns))
    y = _as_float_array(df[["Rating"]])

    print("y type:", y.dtype)
    print("y shape:", y.shape)

    return user, item, y

In [8]:
# Draw 1,000,000 ratings and build the aligned user / item / target frames.
user_features, item_features, y = prepare_dataframe(df_users, df_movies, df_ratings, nb_samples=1_000_000)
# Number of leading non-feature columns in each frame (see the displays below):
# user_features starts with UserId, MovieId.
u_offset = 2
# item_features starts with UserId, MovieId, Title.
i_offset = 3
# y starts with UserId, MovieId. NOTE(review): y_offset is not referenced in the visible cells.
y_offset = 2

In [9]:
user_features

Unnamed: 0,UserId,MovieId,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,5954,2613,2.953586,3.027586,3.566038,3.247619,3.217687,3.946429,5.000000,3.551839,3.500000,4.111111,3.046512,3.491228,3.720930,2.946429,3.398649,3.419355,3.730159,3.521739
1,5424,1220,3.454545,3.400000,3.500000,4.000000,4.067797,4.222222,0.000000,3.719298,3.750000,3.000000,4.000000,5.000000,4.666667,3.640000,3.400000,3.840000,3.444444,4.000000
2,5790,919,4.000000,3.933333,5.000000,3.000000,3.709677,3.750000,4.000000,3.530612,3.333333,3.000000,3.800000,4.000000,3.000000,3.555556,3.666667,3.823529,4.000000,4.666667
3,3414,474,4.160221,4.384615,3.833333,3.958333,4.118644,4.032258,4.000000,4.268657,4.322581,4.333333,2.812500,4.333333,3.800000,4.189189,4.157303,3.900990,4.277778,4.142857
4,4186,3418,2.576642,2.684685,2.762712,2.649485,2.769874,3.195122,3.000000,3.186170,2.617647,3.300000,2.867925,2.828571,3.285714,2.986842,2.550336,2.987342,3.027027,3.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,4786,2702,3.728814,3.562500,3.791667,3.510638,3.612245,3.838710,0.000000,4.019737,3.785714,4.666667,3.416667,3.875000,3.850000,3.734375,3.816901,3.779070,4.166667,3.857143
999996,1748,968,3.343023,3.384615,3.500000,3.533333,3.430894,3.531915,3.333333,3.527027,3.666667,3.904762,3.296296,3.500000,3.695652,3.564516,3.269231,3.578313,3.681818,3.500000
999997,2907,848,3.509615,3.529412,3.250000,3.300000,3.226891,3.172414,0.000000,3.097087,3.678571,3.000000,3.526882,3.000000,3.633333,3.245614,3.607143,3.497238,3.171429,3.615385
999998,5184,454,3.510638,3.000000,3.750000,3.714286,3.810345,4.384615,4.636364,4.007194,0.000000,5.000000,3.923077,4.500000,4.272727,3.904762,3.375000,3.983051,3.250000,4.000000


In [10]:
item_features

Unnamed: 0,UserId,MovieId,Title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,5954,2613,night of the comet,1,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,5424,1220,the blues brothers,1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,5790,919,the wizard of oz,0,1,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,3414,474,in the line of fire,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,4186,3418,thelma & louise,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,4786,2702,summer of sam,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
999996,1748,968,night of the living dead,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
999997,2907,848,the spitfire grill,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
999998,5184,454,the firm,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [11]:
y

Unnamed: 0,UserId,MovieId,Rating
0,5954,2613,5
1,5424,1220,5
2,5790,919,5
3,3414,474,5
4,4186,3418,1
...,...,...,...
999995,4786,2702,4
999996,1748,968,2
999997,2907,848,4
999998,5184,454,5


In [12]:
# Standardise the per-user genre averages (every column from "Action" onwards).
# NOTE: both scalers are fitted on the full sample, i.e. before the train/test split.
scalerUser = StandardScaler()
user_features.loc[:, "Action":] = scalerUser.fit_transform(user_features.loc[:, "Action":])

# Assign into an explicit copy so pandas does not treat y as a slice of
# another frame (SettingWithCopyWarning).
y = y.copy()
# Rescale ratings to [-1, 1], the range of a dot product of two unit vectors.
scalerTarget = MinMaxScaler((-1, 1))
y["Rating"] = scalerTarget.fit_transform(y["Rating"].to_numpy().reshape(-1, 1)).ravel()

# One call splits all three frames with the same row permutation, which keeps
# user rows, item rows and targets aligned (and checks they have equal length).
user_train, user_test, item_train, item_test, y_train, y_test = train_test_split(
    user_features, item_features, y, train_size=0.80, shuffle=True, random_state=1
)

In [13]:
item_train.loc[:, "Action":]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
771718,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0
521462,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
137361,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
404985,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
910092,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491263,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
791624,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
470924,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
491755,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0


In [14]:
user_train.loc[:, "Action":]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
771718,0.906639,0.396418,0.156817,-0.119769,-0.623743,0.806479,0.494418,0.767954,-0.035716,0.616842,0.809263,0.035354,0.904373,-0.028801,0.784273,1.283371,0.714390,0.816536
521462,-0.445766,-1.332914,-1.413630,-1.647057,-1.688766,-1.050969,-0.394037,-4.022222,-0.833974,-1.490477,-1.318130,-1.817053,-1.738539,-2.622943,-1.691550,-1.167402,-2.844965,-0.622244
137361,-1.323583,-0.951284,-0.437732,-0.486721,-1.771308,-1.520688,-1.155569,-1.554853,-0.238295,-0.314379,-1.226803,-0.391248,-0.628496,-1.160545,-1.060172,-1.768109,-1.252518,-0.401227
404985,-0.103965,-0.094471,-0.097333,-0.108917,-0.267840,0.140202,0.841338,0.100647,-0.024462,0.368516,0.163337,0.175058,-0.221315,-0.234128,-0.181353,-0.268652,-0.057394,0.278331
910092,-1.443685,-0.838782,0.540533,0.901501,0.296306,1.275257,0.875184,0.588871,0.483392,0.482332,0.030651,0.572677,0.430799,-0.188745,0.362179,0.440182,-0.135223,1.392531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491263,1.494034,1.443968,-2.967727,1.820062,1.388424,0.548312,-1.155569,0.096394,1.686205,-2.363065,0.196858,0.120838,-0.710400,0.608807,1.225943,-0.033627,0.297913,-2.419205
791624,-0.131959,0.255104,0.279784,0.301884,-0.328671,-0.244720,-1.155569,0.485192,0.467355,0.823780,-0.102035,0.099321,0.059500,-0.204385,0.320229,-0.132338,0.647754,-0.036870
470924,0.378156,0.942876,0.540533,0.813356,1.104971,0.790627,1.382872,0.975941,0.655223,-2.363065,0.959453,0.647984,0.010357,0.417659,-0.040910,0.821862,0.297913,0.630184
491755,-0.565558,-0.094737,0.203200,0.059486,-0.045709,-0.306919,0.976722,-0.512363,0.031135,0.231937,0.344522,0.443580,-0.280718,-0.283221,-0.491263,-1.048370,-0.270578,0.820771


In [15]:
y_train.loc[:, "Rating":]

Unnamed: 0,Rating
771718,0.5
521462,0.0
137361,-1.0
404985,0.5
910092,0.5
...,...
491263,0.5
791624,1.0
470924,-0.5
491755,0.0


In [20]:
def build_model(nb_user_features, nb_item_features, output_shape=32):
    """Build a two-tower model whose output is the dot product of two embeddings.

    A user tower and an item tower with the same architecture (separate
    weights) each map their input to an L2-normalised vector of length
    ``output_shape``; the model output is the dot product of the two vectors.
    The model summary is printed before returning.

    Args:
        nb_user_features: Width of the user input.
        nb_item_features: Width of the item input.
        output_shape: Embedding size produced by each tower.

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``[user_input, item_input]``.
    """
    tf.random.set_seed(1)

    def _build_tower():
        """Dense stack ending in an L2-normalised embedding of size `output_shape`."""
        hidden = [
            tf.keras.layers.Dense(units, activation="relu")
            for units in (256, 128, 128, 64, 64)
        ]
        return tf.keras.models.Sequential(
            hidden
            + [
                tf.keras.layers.Dense(output_shape),
                # Normalisation keeps the per-sample shape, so the declared
                # output shape is (output_shape,), not (output_shape, 1).
                tf.keras.layers.Lambda(
                    lambda x: K.l2_normalize(x, axis=1),
                    output_shape=(output_shape,),
                ),
            ]
        )

    # Built in this order so weight initialisation follows the seed consistently.
    user_NN = _build_tower()
    item_NN = _build_tower()

    # create the user input and point to the base network
    input_user = tf.keras.layers.Input(shape=(nb_user_features,))
    vu = user_NN(input_user)

    # create the item input and point to the base network
    input_item = tf.keras.layers.Input(shape=(nb_item_features,))
    vm = item_NN(input_item)

    # compute the dot product of the two vectors vu and vm
    output = tf.keras.layers.Dot(axes=1)([vu, vm])

    # specify the inputs and output of the model
    model = tf.keras.Model([input_user, input_item], output)

    model.summary()
    return model

def train_model(
    model,
    nb_epochs,
    user_train,
    item_train,
    y_train,
    user_test,
    item_test,
    y_test,
    learning_rate=0.1
    ):
    """Compile `model` with MSE loss and Adadelta, then fit it.

    Features are taken from every column from ``"Action"`` onwards in the
    user / item frames; the target is the ``"Rating"`` column of the y frames.
    The test frames are used as validation data during fitting.

    Args:
        model: Keras model taking ``[user_features, item_features]``.
        nb_epochs: Number of training epochs.
        user_train, item_train, y_train: Row-aligned training frames.
        user_test, item_test, y_test: Row-aligned validation frames.
        learning_rate: Adadelta learning rate.

    Returns:
        The object returned by ``model.fit`` (previously discarded), so the
        caller can inspect the per-epoch losses. End the calling cell with
        ``;`` to suppress its repr in a notebook.
    """
    def _features(frame):
        # Leading id/title columns come before "Action" and are not model inputs.
        return frame.loc[:, "Action":].to_numpy()

    def _target(frame):
        return frame.loc[:, "Rating":].to_numpy()

    cost_fn = tf.keras.losses.MeanSquaredError()
    opt = keras.optimizers.Adadelta(learning_rate=learning_rate)
    model.compile(optimizer=opt, loss=cost_fn)

    return model.fit(
        [_features(user_train), _features(item_train)],
        _target(y_train),
        epochs=nb_epochs,
        validation_data=([_features(user_test), _features(item_test)], _target(y_test)),
    )

def use_model(
    model,
    user_features,
    item_features
    ):
    """Run `model.predict` on user/item features taken from "Action" onwards.

    `user_features` may be 2-D (one row per item row) or 1-D (a single user),
    in which case it is repeated once per row of `item_features`.

    Returns:
        Whatever ``model.predict`` returns for ``[user_matrix, item_matrix]``.
    """
    single_user = len(user_features.shape) != 2
    if single_user:
        # Broadcast the lone user against every candidate item.
        user_features = pd.DataFrame([user_features] * item_features.shape[0])

    user_matrix = user_features.loc[:, "Action":].to_numpy()
    item_matrix = item_features.loc[:, "Action":].to_numpy()
    return model.predict([user_matrix, item_matrix])

def build_result_comparison(scaler, y_df, y_pred):
    """Return a copy of `y_df` with true and predicted ratings un-scaled.

    `scaler.inverse_transform` is applied to `y_pred` and to the "Rating"
    column of `y_df`. The result has "Rating" replaced by the inverse-
    transformed values and a new "Prediction" column; `y_df` is not modified.
    """
    predicted = scaler.inverse_transform(y_pred)
    scaled_truth = y_df["Rating"].to_numpy().reshape(-1, 1)
    actual = scaler.inverse_transform(scaled_truth)
    return y_df.assign(Rating=actual.flatten(), Prediction=predicted.flatten())

def build_matrix(df_rating_prediction, value_column):
    """Pivot a long frame into a UserId x MovieId matrix of `value_column`.

    (UserId, MovieId) pairs absent from the input appear as NaN.
    """
    layout = {"index": "UserId", "columns": "MovieId", "values": value_column}
    return df_rating_prediction.pivot(**layout)

In [21]:
# Input widths = total columns minus the leading non-feature columns
# counted by u_offset / i_offset.
model = build_model(user_train.shape[1] - u_offset, item_train.shape[1] - i_offset)

In [22]:
train_model(model, 10, user_train, item_train, y_train, user_test, item_test, y_test)

Epoch 1/10
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 684us/step - loss: 0.2490 - val_loss: 0.2351
Epoch 2/10
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 667us/step - loss: 0.2349 - val_loss: 0.2322
Epoch 3/10
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 671us/step - loss: 0.2324 - val_loss: 0.2310
Epoch 4/10
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 672us/step - loss: 0.2310 - val_loss: 0.2303
Epoch 5/10
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 666us/step - loss: 0.2300 - val_loss: 0.2299
Epoch 6/10
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 666us/step - loss: 0.2292 - val_loss: 0.2295
Epoch 7/10
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 666us/step - loss: 0.2286 - val_loss: 0.2291
Epoch 8/10
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 666us/step - loss: 0.2281 - v

In [23]:
model.summary()

In [25]:
# Predict on the held-out split, then map predictions and true ratings back
# through scalerTarget so both columns are comparable on the same scale.
y_pred = use_model(model, user_test, item_test)
df_rating_pred = build_result_comparison(scalerTarget, y_test, y_pred)
df_rating_pred

[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 497us/step


Unnamed: 0,UserId,MovieId,Rating,Prediction
276826,881,520,4.0,2.785180
849425,1932,535,4.0,3.953895
504499,530,3703,4.0,3.609518
601054,1146,2710,5.0,3.014936
980221,3201,1100,2.0,3.543831
...,...,...,...,...
555867,26,317,2.0,2.264020
30004,4448,552,4.0,3.660513
124730,2781,3505,4.0,4.194561
195783,2825,1224,4.0,4.190689


In [26]:
matrix_pred = build_matrix(df_rating_pred, "Prediction")
matrix_pred

MovieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,,,,,,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


In [27]:
matrix_true = build_matrix(df_rating_pred, "Rating")
matrix_true

MovieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,,,,,,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


# Step 4: Recommendation Algorithm

In [28]:
def build_total_dataset(user_features, item_features):
    """Pair every user row with every item row (cross product).

    The two returned frames are row-aligned and each has
    ``len(user_features) * len(item_features)`` rows:
      * users are ordered by ``UserId`` and each user row is repeated once
        per item, in consecutive rows;
      * the item frame is the whole of ``item_features`` repeated once per user.
    Original index labels are kept, so they repeat in the outputs.

    Rows are selected positionally instead of concatenating many copies and
    sorting the result; empty inputs yield empty frames instead of raising.

    Args:
        user_features: One row per user; must contain ``UserId``.
        item_features: One row per item.

    Returns:
        ``(user_features_total, item_features_total)``.
    """
    n, m = user_features.shape[0], item_features.shape[0]

    # Sort the n users once (stable, hence deterministic) rather than n*m rows.
    users_sorted = user_features.sort_values(by='UserId', kind='stable')

    user_positions = np.repeat(np.arange(n), m)   # 0,0,..,0,1,1,..,1,...
    item_positions = np.tile(np.arange(m), n)     # 0,1,..,m-1,0,1,..,m-1,...

    return users_sorted.iloc[user_positions], item_features.iloc[item_positions]

def build_movie_combination(scaler_user_features, df_movies, df_ratings):
    """Build row-aligned user/item frames covering every user x movie pair.

    User features are the per-user average genre ratings computed from
    `df_ratings`, transformed with `scaler_user_features` from the "Action"
    column onwards. The shapes of both resulting frames are printed.

    Returns:
        ``(user_features_total, item_features_total)``.
    """
    user_profiles = build_average_genre_ratings(df_movies, df_ratings)
    scaled_genres = scaler_user_features.transform(user_profiles.loc[:, "Action":])
    user_profiles.loc[:, "Action":] = scaled_genres

    # Expand to the full cross product of users and movies.
    users_total, items_total = build_total_dataset(user_profiles, df_movies)
    for frame in (users_total, items_total):
        print(frame.shape)
    return users_total, items_total

def build_ranking(model, user_features, item_features):
    """Score row-aligned user/item pairs and rank them per user.

    Prints the shape of the raw score array.

    Returns:
        DataFrame with ``UserId``, ``MovieId`` and ``Score``, ordered by
        ``UserId`` ascending and, within a user, ``Score`` descending.
    """
    score = use_model(model, user_features, item_features)
    print(score.shape)

    columns = {
        'UserId': user_features['UserId'].values,
        'MovieId': item_features['MovieId'].values,
        'Score': score.flatten(),
    }
    # Best-scored movies first within each user.
    return pd.DataFrame(columns).sort_values(
        by=['UserId', 'Score'], ascending=[True, False]
    )



In [29]:
# Cross every user with a random sample of 1,000 movies. The sample has no
# random_state, so it differs between runs. This rebinds user_features /
# item_features, which held the training frames up to this point.
user_features, item_features = build_movie_combination(scalerUser, df_movies.sample(n=1_000), df_ratings)

(6038000, 19)
(6038000, 20)


In [30]:
df_ranking = build_ranking(model, user_features, item_features)

[1m188688/188688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 503us/step
(6038000, 1)


In [32]:
df_ranking

Unnamed: 0,UserId,MovieId,Score
341,1,1760,0.718758
539,1,2217,0.718758
854,1,901,0.718758
11,1,2589,0.717519
49,1,3307,0.717519
...,...,...,...
6037633,6040,2735,-0.512855
6037926,6040,3898,-0.551872
6037934,6040,3841,-0.551872
6037969,6040,2457,-0.551872


In [33]:
# Map the model scores back through scalerTarget to the original rating scale.
# The column is overwritten in place, so re-running this cell applies the
# inverse transform a second time.
df_ranking["Score"] = scalerTarget.inverse_transform(df_ranking["Score"].to_numpy().reshape(-1,1)).flatten()

In [34]:
df_ranking

Unnamed: 0,UserId,MovieId,Score
341,1,1760,4.437517
539,1,2217,4.437517
854,1,901,4.437517
11,1,2589,4.435038
49,1,3307,4.435038
...,...,...,...
6037633,6040,2735,1.974291
6037926,6040,3898,1.896255
6037934,6040,3841,1.896255
6037969,6040,2457,1.896255


In [35]:
pred_matrix = build_matrix(df_ranking, "Score")

In [36]:
pred_matrix

MovieId,2,4,12,14,16,19,23,24,31,32,...,3905,3912,3917,3923,3926,3930,3935,3936,3944,3952
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.822382,4.296399,3.245743,4.309218,3.448276,3.845892,3.134791,4.000993,4.309218,4.000993,...,3.845892,4.296399,2.721248,2.831626,3.697791,2.721248,2.721248,3.448276,4.296399,3.448276
2,2.564592,3.814744,2.466671,3.741421,4.254338,3.499077,4.246708,3.353764,3.741421,3.353764,...,3.499077,3.814744,2.211351,2.138833,2.776285,2.211351,2.211351,4.254338,3.814744,4.254338
3,3.648588,3.327338,2.367943,3.487121,3.858065,2.989969,3.821195,3.264520,3.487121,3.264520,...,2.989969,3.327338,2.197452,2.230734,3.274913,2.197452,2.197452,3.858065,3.327338,3.858065
4,3.025158,3.135421,2.315396,3.598292,3.248912,2.209540,2.790928,3.670662,3.598292,3.670662,...,2.209540,3.135421,2.500396,2.683414,3.109889,2.500396,2.500396,3.248912,3.135421,3.248912
5,3.167109,3.204061,3.109635,2.980213,2.987025,3.406974,3.121060,2.834567,2.980213,2.834567,...,3.406974,3.204061,2.504070,2.410522,2.922306,2.504070,2.504070,2.987025,3.204061,2.987025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,2.717702,3.372952,3.289941,3.526967,3.368019,3.014465,3.203602,3.091520,3.526967,3.091520,...,3.014465,3.372952,3.254536,2.852619,2.474385,3.254536,3.254536,3.368019,3.372952,3.368019
6037,4.374063,3.481380,3.545835,3.823262,3.920846,3.222252,3.791687,4.131511,3.823262,4.131511,...,3.222252,3.481380,3.566599,3.756773,4.008386,3.566599,3.566599,3.920846,3.481380,3.920846
6038,2.845770,4.012827,2.941695,3.976601,3.017487,3.543491,2.692497,3.171797,3.976601,3.171797,...,3.543491,4.012827,2.486566,2.235071,2.703461,2.486566,2.486566,3.017487,4.012827,3.017487
6039,3.729540,4.137392,2.516440,4.347345,4.227087,3.511107,3.906387,4.327466,4.347345,4.327466,...,3.511107,4.137392,2.086436,2.445729,3.806448,2.086436,2.086436,4.227087,4.137392,4.227087


In [37]:
# Module-level counter: the UserId handed to the next UserPreferences instance.
nb_user = 0

class UserPreferences:
    """Hand-written per-genre preference profile for one synthetic user.

    Each keyword argument is the user's score for that genre (default 0).
    Every instance takes the current value of the global ``nb_user`` as its
    ``UserId`` and then increments the counter.
    """

    def __init__(
        self,
        action=0,
        adventure=0,
        animation=0,
        childrens=0,
        comedy=0,
        crime=0,
        documentary=0,
        drama=0,
        fantasy=0,
        film_noir=0,
        horror=0,
        musical=0,
        mystery=0,
        romance=0,
        sci_fi=0,
        thriller=0,
        war=0,
        western=0,
    ):
        global nb_user
        # Keys and their order mirror the genre columns of the movie table.
        genre_scores = {
            "Action": action,
            "Adventure": adventure,
            "Animation": animation,
            "Children's": childrens,
            "Comedy": comedy,
            "Crime": crime,
            "Documentary": documentary,
            "Drama": drama,
            "Fantasy": fantasy,
            "Film-Noir": film_noir,
            "Horror": horror,
            "Musical": musical,
            "Mystery": mystery,
            "Romance": romance,
            "Sci-Fi": sci_fi,
            "Thriller": thriller,
            "War": war,
            "Western": western,
        }
        self.preferences = {"UserId": nb_user, **genre_scores}
        nb_user += 1

    def to_df(self):
        """Return the profile as a one-row DataFrame (index label 0)."""
        profile = self.preferences
        return pd.DataFrame(profile, index=[0])

In [42]:
class Recommander:
    """Recommend movies for a pair of users from a trained two-tower model.

    Attributes are set from the constructor arguments:
      model          -- trained model, scored through ``build_ranking``
      scaler_target  -- scaler whose ``inverse_transform`` maps scores back
                        to the rating scale
      scaler_user    -- scaler applied to the users' genre preferences
      df_movies      -- candidate movies (``MovieId``, ``Title``, genre columns)
    """

    def __init__(self, model, scaler_target, scaler_user, df_movies):
        self.model = model
        self.scaler_target = scaler_target
        self.scaler_user = scaler_user
        self.df_movies = df_movies

    def __agg_title(self, df_ranking):
        """Attach the movie ``Title`` to each ranking row via ``MovieId``."""
        df_movies = self.df_movies[["MovieId", "Title"]]
        result = df_ranking.merge(df_movies, on="MovieId")
        return result

    def recommand_movie(self, user1: UserPreferences, user2: UserPreferences):
        """Rank all candidate movies by the mean predicted rating of two users.

        Returns:
            DataFrame with ``MovieId``, ``Title`` and ``Score`` (mean of both
            users' predicted ratings), sorted by ``Score`` descending.
        """
        user_features = pd.concat([user1.to_df(), user2.to_df()], axis=0, ignore_index=True)

        # Preferences are built from ints; cast to float first so writing the
        # scaled values does not trigger pandas' incompatible-dtype warnings.
        genre_columns = user_features.loc[:, "Action":].columns
        user_features = user_features.astype({col: "float64" for col in genre_columns})
        user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])

        user_features, item_features = build_total_dataset(user_features, self.df_movies)
        # Use the model given to this instance, not a notebook-level global.
        df_ranking = build_ranking(self.model, user_features, item_features)
        df_ranking["Score"] = self.scaler_target.inverse_transform(df_ranking["Score"].to_numpy().reshape(-1,1)).flatten()
        df_ranking = self.__agg_title(df_ranking)
        df_avg_ranking = df_ranking.groupby(['MovieId', 'Title'], as_index=False)['Score'].mean().sort_values(by=['Score'], ascending=[False])
        return df_avg_ranking

In [55]:
# Two hand-written taste profiles; genres that are not passed default to 0.
# Each instance takes its UserId from the global nb_user counter, so the ids
# keep growing every time this cell is re-run.
user1 = UserPreferences(sci_fi=5)
user2 = UserPreferences(sci_fi=3, comedy=5, horror=5)

user1.to_df()

Unnamed: 0,UserId,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0


In [56]:
# Score every movie in df_movies for both users and rank by the mean score.
reco = Recommander(model, scalerTarget, scalerUser, df_movies)

# Rebinds df_ranking, which previously held the per-user ranking built above.
df_ranking = reco.recommand_movie(user1, user2)

  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self.scaler_user.transform(user_features.loc[:, "Action":])
  user_features.loc[:, "Action":] = self

[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 509us/step
(7766, 1)


In [58]:
df_ranking.head(10)

Unnamed: 0,MovieId,Title,Score
2588,2657,the rocky horror picture show,3.291896
3711,3780,rocketship x-m,3.267457
2457,2526,meteor,3.267457
3306,3375,destination moon,3.267457
2509,2578,the sticky fingers of time,3.267457
3808,3878,x: the unknown,3.267457
1281,1301,forbidden planet,3.267457
1181,1199,brazil,3.267457
1188,1206,a clockwork orange,3.267457
1530,1570,tetsuo ii: body hammer,3.267457
