In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

import tensorflow.keras.backend as K

2024-07-07 16:10:56.489391: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-07 16:10:56.532758: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-07 16:10:56.581540: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-07 16:10:56.630830: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-07 16:10:56.631240: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-07 16:10:56.698825: I tensorflow/core/platform/cpu_feature_guard.cc:

# Step 1 and 2: Data Collection, Preprocessing and Feature Engineering

In [2]:
def clean_movie_title(movie_title: str) -> str:
    if movie_title.split(" ")[-1].startswith("("):
        # remove year from the title, e.g. Toy Story (1995) --> Toy Story
        movie_title = (" ".join(movie_title.split(" ")[:-1])).strip()

    if movie_title.title().split(',')[-1].strip() in ['The', 'A']:
        # article + movie title, e.g. Saint, The --> The Saint
        movie_title = (movie_title.title().split(',')[-1].strip() + " " + " ".join(movie_title.title().split(',')[:-1])).strip()

    # otherwise, it was converting The Devil's Advocate to The Devil'S Advocate
    movie_title = movie_title.lower()
    return movie_title

def perf_clean_movie_genre(df: pd.DataFrame) -> pd.DataFrame:
    # Split the 'genres' column into separate rows
    df['Genres'] = df['Genres'].str.split('|')
    
    # Explode the 'genres' column
    df_exploded = df.explode('Genres')
    
    # One-hot encode the 'genres' column
    df_one_hot = pd.get_dummies(df_exploded['Genres'])
    
    # Group by the original index and sum up the one-hot encoded values
    df_one_hot_grouped = df_one_hot.groupby(df_exploded.index).sum()
    
    # Combine the one-hot encoded genres with the original dataframe (excluding the old 'genres' column)
    df_combined = pd.concat([df.drop(columns=['Genres']), df_one_hot_grouped], axis=1)

    return df_combined

def perf_clean_movie_directors(df: pd.DataFrame) -> pd.DataFrame:
    # Split the 'genres' column into separate rows
    df['Directors'] = df['Directors'].str.split(',')
    
    # Explode the 'genres' column
    df_exploded = df.explode('Directors')
    
    # One-hot encode the 'genres' column
    df_one_hot = pd.get_dummies(df_exploded['Directors'])
    
    # Group by the original index and sum up the one-hot encoded values
    df_one_hot_grouped = df_one_hot.groupby(df_exploded.index).sum()
    
    # Combine the one-hot encoded genres with the original dataframe (excluding the old 'genres' column)
    df_combined = pd.concat([df.drop(columns=['Directors']), df_one_hot_grouped], axis=1)

    return df_combined



def build_dataset(with_directors=True) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    m_cols = ["MovieId", "Title", "Genres"]
    r_cols = ["UserId", "MovieId", "Rating", "Timestamp"]
    u_cols = ["UserId", "Gender", "Age", "Occupation", "Zip-code"]
    ml_ratings = pd.read_csv("./data/movielens/ratings.dat", sep="::", engine="python", encoding='latin-1', names=r_cols)
    ml_movies = pd.read_csv("./data/movielens/movies.dat", sep="::", engine="python", encoding='latin-1', names=m_cols)
    ml_users = pd.read_csv("./data/movielens/users.dat", sep="::", engine="python", encoding='latin-1', names=u_cols)

    ml_movies["Title"] = ml_movies["Title"].apply(lambda x: clean_movie_title(x))
    ml_movies = perf_clean_movie_genre(ml_movies)

    ml_ratings = ml_ratings.drop(columns=["Timestamp"])
    
    if not with_directors:
        return ml_ratings, ml_movies, ml_users
    
    imdb_names = pd.read_csv("./data/imdb/name.basics.tsv", sep="\t")
    imdb_titles = pd.read_csv("./data/imdb/title.basics.tsv", sep="\t")
    imdb_titles = imdb_titles[imdb_titles["titleType"] == "movie"]
    imdb_names_exploded = imdb_names.assign(knownForTitles=imdb_names['knownForTitles'].str.split(',')).explode('knownForTitles')
    imdb_titles = pd.merge(imdb_names_exploded, imdb_titles, left_on='knownForTitles', right_on='tconst')
    imdb_titles = imdb_titles[imdb_titles['primaryProfession'].str.contains("director")]

    # set it to lowercase because ml_movies["Title"] are also lowercase
    imdb_titles['primaryTitle_lower'] = imdb_titles['primaryTitle'].str.lower()
    directors_subset = imdb_titles[['primaryTitle_lower', 'primaryName']]
    
    merged_df = pd.merge(ml_movies, directors_subset, left_on='Title', right_on='primaryTitle_lower', how='left')
    
    # Drop the 'primaryTitle_lower' column as it is not needed anymore
    merged_df = merged_df.drop(columns=['primaryTitle_lower'])
    final_df = merged_df.groupby('MovieId').agg({
        **{col: 'first' for col in merged_df.columns if col != 'primaryName'},
        'primaryName': lambda x: ', '.join(x.dropna().unique())  # Concatenate director names
    })
    final_df = final_df.rename(columns={"primaryName": "Directors"})
    final_df = final_df.drop(columns=['MovieId']).reset_index()
    # clear memory 
    del merged_df, imdb_titles, imdb_names, ml_movies
    df_movies = perf_clean_movie_directors(final_df)
    

    return ml_ratings, df_movies, ml_users

In [3]:
df_ratings, df_movies, df_users = build_dataset(with_directors=False)

In [4]:
df_ratings

Unnamed: 0,UserId,MovieId,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [5]:
df_movies

Unnamed: 0,MovieId,Title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,toy story,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,jumanji,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,grumpier old men,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,waiting to exhale,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,father of the bride part ii,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,meet the parents,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,3949,requiem for a dream,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,3950,tigerland,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,3951,two family house,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [6]:
df_users

Unnamed: 0,UserId,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


# Step 3: Model Development

In [7]:
def build_average_genre_ratings(df_movies, df_ratings):
    merged_df = pd.merge(df_ratings, df_movies, on='MovieId')
    genres = list(merged_df.columns.values)[4:]
    for genre in genres:
        merged_df[genre] = merged_df[genre] * merged_df['Rating']
    genre_ratings_sum = merged_df.groupby('UserId')[genres].sum()
    genre_count = merged_df.groupby('UserId')[genres].apply(lambda x: (x > 0).sum())
    average_genre_ratings = genre_ratings_sum / genre_count
    average_genre_ratings = average_genre_ratings.fillna(0)
    average_genre_ratings = average_genre_ratings.reset_index()
    return average_genre_ratings

def prepare_dataframe(
    df_users: pd.DataFrame,
    df_movies: pd.DataFrame,
    df_ratings: pd.DataFrame,
    nb_samples=10_000
    )-> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    truncated_df_ratings = df_ratings.sample(n=nb_samples)
    
    average_genre_ratings = build_average_genre_ratings(df_movies, truncated_df_ratings)
    user_features = truncated_df_ratings.merge(average_genre_ratings, on="UserId")

    #building item features
    item_features = truncated_df_ratings.merge(df_movies, on="MovieId")

    y = user_features[["UserId", "MovieId", "Rating"]]
    user_features = user_features.drop(columns=["Rating"])
    item_features = item_features.drop(columns=["Rating"])

    return user_features, item_features, y

def split_df_into_user_item_rating(df: pd.DataFrame):
    y = df[["Rating"]]
    X = df.drop(columns=["Rating"])

    user = X[["Gender", "Age"]]
    item = X.drop(columns=["Gender", "Age"])

    user = user.to_numpy().astype(np.float64)
    item = item.to_numpy().astype(np.float64)
    y = y.to_numpy().astype(np.float64)
    print("y type:", y.dtype)
    print("y shape:", y.shape)

    return user, item, y

In [8]:
user_features, item_features, y = prepare_dataframe(df_users, df_movies, df_ratings, nb_samples=1_000_000)
u_offset = 2
i_offset = 3
y_offset = 2

In [9]:
user_features

Unnamed: 0,UserId,MovieId,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,3227,1454,3.355932,3.178571,3.111111,3.227273,3.430851,3.812500,3.333333,4.264706,3.285714,4.333333,3.166667,3.750000,3.875000,3.892473,3.300000,3.911111,3.933333,4.000000
1,4408,1395,3.472727,3.487179,3.714286,3.534884,3.411348,3.892857,4.125000,4.116608,3.375000,3.777778,3.130435,3.744681,3.777778,4.000000,3.111111,3.629032,4.400000,4.181818
2,5615,2795,2.969231,2.880952,3.500000,3.320000,3.581250,3.400000,5.000000,3.585859,3.666667,3.600000,3.272727,3.500000,3.312500,3.375000,3.235294,3.222222,3.833333,4.333333
3,3690,800,2.851485,2.757576,2.727273,2.318182,2.931818,3.333333,0.000000,3.073770,2.000000,3.000000,2.550000,2.500000,3.000000,3.016949,2.681818,2.839695,3.200000,2.666667
4,4725,3197,3.168675,3.198795,3.492537,3.274194,3.132626,3.245614,3.000000,3.205761,3.288889,3.250000,2.986842,3.411765,3.115385,3.214815,3.250000,3.081522,3.288889,3.285714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,4169,1042,3.207831,3.287356,4.357143,3.633333,3.436834,3.644737,3.857143,3.772176,3.428571,4.309524,3.122905,4.035714,3.606742,3.643077,3.248705,3.461326,3.919355,3.381818
999996,764,1282,4.300000,4.414634,4.962963,4.820513,4.458333,3.500000,0.000000,4.785714,4.750000,5.000000,4.166667,4.900000,0.000000,5.000000,4.100000,5.000000,4.777778,3.500000
999997,4482,157,3.691589,3.698413,4.000000,3.789474,3.707031,3.793103,4.500000,3.936782,3.277778,4.250000,3.333333,3.948718,3.857143,3.811321,3.725490,3.625000,4.068182,3.944444
999998,3471,368,3.178082,3.061224,3.333333,2.965517,3.070064,3.543210,3.000000,3.327586,3.166667,3.571429,2.822222,2.923077,3.333333,3.147826,3.252427,3.211957,3.705882,3.250000


In [10]:
item_features

Unnamed: 0,UserId,MovieId,Title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,3227,1454,suburbia,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4408,1395,tin men,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5615,2795,vacation,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3690,800,lone star,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,4725,3197,the presidio,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,4169,1042,that thing you do!,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
999996,764,1282,fantasia,0,0,1,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
999997,4482,157,canadian bacon,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
999998,3471,368,maverick,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [11]:
y

Unnamed: 0,UserId,MovieId,Rating
0,3227,1454,4
1,4408,1395,3
2,5615,2795,3
3,3690,800,2
4,4725,3197,3
...,...,...,...
999995,4169,1042,3
999996,764,1282,5
999997,4482,157,4
999998,3471,368,3


In [12]:
user_train, user_test = train_test_split(
    user_features, train_size=0.80, shuffle=True, random_state=1
)

item_train, item_test = train_test_split(
    item_features, train_size=0.80, shuffle=True, random_state=1
)

y_train, y_test = train_test_split(
    y, train_size=0.80, shuffle=True, random_state=1
)

In [13]:
item_train.loc[:, "Action":]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
771718,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
521462,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
137361,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
404985,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
910092,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491263,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
791624,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
470924,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
491755,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [14]:
user_train.loc[:, "Action":]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
771718,3.645349,3.750000,4.292683,3.837209,3.865574,3.975000,4.000000,4.053381,3.891892,4.285714,3.692308,4.095238,3.714286,4.031746,3.681818,3.858491,4.313725,3.714286
521462,3.470588,3.500000,4.304348,4.000000,3.902174,3.500000,5.000000,4.215686,3.200000,3.000000,4.000000,4.416667,4.000000,4.171429,3.444444,4.000000,4.333333,3.500000
137361,3.396135,3.321101,3.423077,2.984375,2.949843,3.462963,3.333333,3.621262,3.173913,3.833333,2.937500,3.250000,3.517241,3.445205,3.319149,3.463415,4.170213,3.210526
404985,3.000000,2.000000,4.500000,4.000000,3.222222,4.250000,3.750000,3.666667,0.000000,0.000000,0.000000,3.500000,4.500000,3.333333,3.000000,3.636364,4.000000,4.000000
910092,3.500000,3.304348,4.500000,3.500000,3.598039,3.560000,0.000000,3.757576,3.375000,4.000000,2.615385,3.000000,4.000000,3.596154,3.388889,3.310345,4.076923,3.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491263,3.532258,3.542857,3.333333,3.333333,3.664948,3.578947,4.000000,3.787234,3.500000,3.250000,1.609756,3.363636,3.500000,3.750000,2.827586,3.293103,3.909091,4.000000
791624,4.363636,4.052632,4.300000,4.466667,4.152318,4.791667,4.636364,4.313725,4.500000,4.619048,4.555556,4.166667,4.545455,4.410714,4.350000,4.477273,4.433333,4.285714
470924,3.269663,3.215686,3.125000,3.206897,2.971910,3.108108,3.000000,3.063725,3.277778,4.125000,3.000000,3.517241,3.176471,2.919192,3.200000,3.109589,3.240000,3.666667
491755,3.696000,3.682927,3.500000,3.333333,3.926471,4.000000,4.450000,4.280488,3.625000,4.666667,4.428571,3.428571,4.000000,3.892857,3.459459,3.561644,4.565217,4.250000


In [15]:
scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y_train["Rating"].to_numpy().reshape(-1, 1))
y_train["Rating"] = scalerTarget.transform(y_train["Rating"].to_numpy().reshape(-1, 1))
y_test["Rating"] = scalerTarget.transform(y_test["Rating"].to_numpy().reshape(-1, 1))

In [16]:
y_train.loc[:, "Rating":]

Unnamed: 0,Rating
771718,-0.5
521462,0.5
137361,0.5
404985,0.0
910092,0.5
...,...
491263,1.0
791624,1.0
470924,0.0
491755,1.0


In [17]:
def build_model(nb_user_features, nb_item_features, output_shape=32):
    tf.random.set_seed(1)
    user_NN = tf.keras.models.Sequential(
        [
            ### START CODE HERE ###
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(output_shape),
            tf.keras.layers.Lambda(lambda x: K.l2_normalize(x,axis=1), output_shape=(output_shape,1)),
            ### END CODE HERE ###
        ]
    )
    
    item_NN = tf.keras.models.Sequential(
        [
            ### START CODE HERE ###
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(output_shape),
            tf.keras.layers.Lambda(lambda x: K.l2_normalize(x,axis=1), output_shape=(output_shape,1)),
            ### END CODE HERE ###
        ]
    )
    
    # create the user input and point to the base network
    input_user = tf.keras.layers.Input(shape=(nb_user_features,))
    vu = user_NN(input_user)
    
    # create the item input and point to the base network
    input_item = tf.keras.layers.Input(shape=(nb_item_features,))
    vm = item_NN(input_item)
    
    # compute the dot product of the two vectors vu and vm
    output = tf.keras.layers.Dot(axes=1)([vu, vm])
    
    # specify the inputs and output of the model
    model = tf.keras.Model([input_user, input_item], output)
    
    model.summary()
    return model

def train_model(
    model,
    nb_epochs,
    user_train,
    item_train,
    y_train,
    user_test,
    item_test,
    y_test
    ):
    cost_fn = tf.keras.losses.MeanSquaredError()
    opt = keras.optimizers.Adam(learning_rate=0.01)
    model.compile(optimizer=opt, loss=cost_fn)
    
    model.fit(
        [user_train.loc[:, "Action":].to_numpy(), item_train.loc[:, "Action":].to_numpy()],
        y_train.loc[:, "Rating":].to_numpy(),
        epochs=5,
        validation_data=([user_test.loc[:, "Action":].to_numpy(), item_test.loc[:, "Action":].to_numpy()], y_test.loc[:, "Rating":].to_numpy()),
    )

def use_model(
    model,
    user_features,
    item_features
    ):
    if len(user_features.shape) == 2:
        y_p = model.predict([user_features.loc[:, "Action":].to_numpy(), item_features.loc[:, "Action":].to_numpy()])
    else:
        user_features = pd.DataFrame([user_features]* item_features.shape[0])
        y_p = model.predict([user_features.loc[:, "Action":].to_numpy(), item_features.loc[:, "Action":].to_numpy()])
    return y_p

def build_result_comparison(scaler, y_df, y_pred):
    y_pu = scaler.inverse_transform(y_pred)
    y_true_u = scaler.inverse_transform(y_df["Rating"].to_numpy().reshape(-1, 1))
    result = y_df.copy()
    result["Rating"] = y_true_u.flatten()
    result["Prediction"] = y_pu.flatten()
    return result

def build_matrix(df_rating_prediction, value_column):
    return df_rating_prediction.pivot(index="UserId", columns="MovieId", values=value_column)

In [18]:
model = build_model(user_train.shape[1] - u_offset, item_train.shape[1] - i_offset)

In [19]:
train_model(model, 5, user_train, item_train, y_train, user_test, item_test, y_test)

Epoch 1/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 936us/step - loss: 0.2992 - val_loss: 0.2886
Epoch 2/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 989us/step - loss: 0.2742 - val_loss: 0.2487
Epoch 3/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 937us/step - loss: 0.2519 - val_loss: 0.2482
Epoch 4/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 945us/step - loss: 0.2510 - val_loss: 0.2479
Epoch 5/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 952us/step - loss: 0.2503 - val_loss: 0.2475


In [20]:
y_p = model.predict([user_train.loc[:, "Action":].to_numpy(), item_train.loc[:, "Action":].to_numpy()])

[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 620us/step


In [21]:
y_pred = use_model(model, user_test, item_test)
df_rating_pred = build_result_comparison(scalerTarget, y_test, y_pred)
df_rating_pred

[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 629us/step


Unnamed: 0,UserId,MovieId,Rating,Prediction
276826,2078,524,5.0,3.862855
849425,4274,2858,1.0,3.063962
504499,533,1213,5.0,3.833035
601054,5026,2088,1.0,2.064482
980221,5164,1242,5.0,4.271376
...,...,...,...,...
555867,4437,2676,3.0,3.201156
30004,5518,1446,3.0,3.791054
124730,5271,2949,4.0,3.468438
195783,3946,2174,5.0,3.678175


In [22]:
matrix_pred = build_matrix(df_rating_pred, "Prediction")
matrix_pred

MovieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,,,,,,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


In [23]:
matrix_true = build_matrix(df_rating_pred, "Rating")
matrix_true

MovieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,,,,,,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


# Step 4 Recommandation Algorithm

In [24]:
def build_total_dataset(user_features, item_features):
    n, m = user_features.shape[0], item_features.shape[0]
    item_features = pd.concat([item_features] * n)
    user_features = pd.concat([user_features] * m).sort_values(by='UserId')

    return user_features, item_features

def build_movie_combination(model, df_movies, df_ratings):
    average_genre_ratings = build_average_genre_ratings(df_movies, df_ratings)

    # Iterate over each user and each movie
    user_features_total, item_features_total = build_total_dataset(average_genre_ratings, df_movies)
    print(user_features_total.shape)
    print(item_features_total.shape)

    score = use_model(model, user_features_total, item_features_total)
    print(score.shape)
    ranking = pd.DataFrame({'UserId': user_features_total['UserId'], 'MovieId': item_features_total['MovieId'], 'Score': score.flatten()})

    # Sort the rankings by UserId and Score (descending)
    ranking.sort_values(by=['UserId', 'Score'], ascending=[True, False], inplace=True)

    return ranking



In [25]:
df_ranking = build_movie_combination(model, df_movies.sample(n=1_000), df_ratings)

[1m188750/188750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 641us/step


ValueError: array length 6040000 does not match index length 6040