In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

import tensorflow.keras.backend as K

2024-07-07 15:07:00.407285: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-07 15:07:00.410171: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-07 15:07:00.419567: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-07 15:07:00.436018: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-07 15:07:00.436080: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-07 15:07:00.448217: I tensorflow/core/platform/cpu_feature_guard.cc:

# Steps 1 and 2: Data Collection, Preprocessing, and Feature Engineering

In [2]:
def clean_movie_title(movie_title: str) -> str:
    """Normalise a MovieLens title so it can be matched against other catalogues.

    The transformation is applied in three steps:

    1. If the last space-separated token starts with "(", drop it
       (e.g. the release year in ``"Toy Story (1995)"``).
    2. If the text after the *last* comma is an English article
       ("The", "A", "An", compared case-insensitively), move it to the front
       (``"Saint, The"`` -> ``"The Saint"``). Commas that belong to the title
       itself are kept.
    3. Lower-case the result.

    Args:
        movie_title: Raw title string.

    Returns:
        The cleaned, lower-cased title.
    """
    articles = ("the", "a", "an")

    words = movie_title.split(" ")
    if words[-1].startswith("("):
        # remove year from the title, e.g. Toy Story (1995) --> Toy Story
        movie_title = " ".join(words[:-1]).strip()

    # Split once from the right: only the text after the last comma can be the
    # displaced article, and everything before it (including any other commas)
    # must be preserved verbatim.
    head, comma, tail = movie_title.rpartition(",")
    if comma and tail.strip().lower() in articles:
        # article + movie title, e.g. Saint, The --> The Saint
        movie_title = (tail.strip() + " " + head.strip()).strip()

    # Lower-case directly instead of going through str.title(), which would
    # turn "The Devil's Advocate" into "The Devil'S Advocate".
    return movie_title.lower()

def _one_hot_multivalue(df: pd.DataFrame, column: str, sep: str) -> pd.DataFrame:
    """Replace a delimited multi-value string column by one count column per value.

    Args:
        df: Input frame; it is not modified.
        column: Name of the column holding ``sep``-separated values.
        sep: Separator between the values of one cell.

    Returns:
        A new frame with ``column`` removed and one integer column per distinct
        value appended (number of occurrences of that value in the row).
    """
    # Work on a derived Series so the caller's frame is never mutated.
    # Stripping makes "a, b" and "a,b" produce the same value names.
    values = df[column].str.split(sep).explode().str.strip()
    # Empty / missing cells must not become a column of their own.
    values = values[values.notna() & (values != "")]

    # explode() repeats the original row label for every value, so grouping on
    # the index folds the per-value dummies back to one row per input row.
    indicators = pd.get_dummies(values).groupby(level=0).sum()
    # Rows that had no value at all produced no dummy row: restore them as zeros.
    indicators = indicators.reindex(df.index, fill_value=0)

    return pd.concat([df.drop(columns=[column]), indicators], axis=1)


def perf_clean_movie_genre(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the '|'-separated 'Genres' column.

    Args:
        df: Frame with a 'Genres' string column; left unchanged.

    Returns:
        A new frame without 'Genres' and with one column per genre.
    """
    return _one_hot_multivalue(df, "Genres", "|")


def perf_clean_movie_directors(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the comma-separated 'Directors' column.

    Names are stripped of surrounding whitespace, so both "A,B" and "A, B"
    yield the columns "A" and "B"; rows with an empty string get all zeros.

    Args:
        df: Frame with a 'Directors' string column; left unchanged.

    Returns:
        A new frame without 'Directors' and with one column per director name.
    """
    return _one_hot_multivalue(df, "Directors", ",")



def build_dataset(with_directors=True, data_dir="./data") -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]":
    """Load the MovieLens tables and optionally enrich movies with IMDb directors.

    Args:
        with_directors: When False, return right after the MovieLens tables
            are loaded and the genres are one-hot encoded; no IMDb file is read.
        data_dir: Root folder containing ``movielens/{ratings,movies,users}.dat``
            and, when ``with_directors`` is True, ``imdb/name.basics.tsv`` and
            ``imdb/title.basics.tsv``.

    Returns:
        ``(ratings, movies, users)``. ``movies`` has a cleaned lower-case
        'Title', one column per genre and, with ``with_directors``, one column
        per director name.

    Notes:
        The director link is a heuristic: a person counts as director of a
        movie when their ``primaryProfession`` contains "director" and the movie
        is listed in their ``knownForTitles``. IMDb and MovieLens movies are
        matched on the lower-cased title only, so different movies sharing a
        title receive the same director columns.
    """
    m_cols = ["MovieId", "Title", "Genres"]
    r_cols = ["UserId", "MovieId", "Rating", "Timestamp"]
    u_cols = ["UserId", "Gender", "Age", "Occupation", "Zip-code"]

    # The .dat files are "::"-separated without a header row; a multi-character
    # separator requires the python parsing engine.
    ml_read_options = dict(sep="::", engine="python", encoding="latin-1")
    ml_ratings = pd.read_csv(f"{data_dir}/movielens/ratings.dat", names=r_cols, **ml_read_options)
    ml_movies = pd.read_csv(f"{data_dir}/movielens/movies.dat", names=m_cols, **ml_read_options)
    ml_users = pd.read_csv(f"{data_dir}/movielens/users.dat", names=u_cols, **ml_read_options)

    ml_movies["Title"] = ml_movies["Title"].apply(clean_movie_title)
    ml_movies = perf_clean_movie_genre(ml_movies)

    if not with_directors:
        return ml_ratings, ml_movies, ml_users

    # Only load the columns that are used below; the full dumps are large.
    imdb_names = pd.read_csv(
        f"{data_dir}/imdb/name.basics.tsv",
        sep="\t",
        usecols=["primaryName", "primaryProfession", "knownForTitles"],
    )
    imdb_titles = pd.read_csv(
        f"{data_dir}/imdb/title.basics.tsv",
        sep="\t",
        usecols=["tconst", "titleType", "primaryTitle"],
    )

    # na=False: a missing profession would otherwise yield NaN in the mask and
    # make the boolean indexing raise. Filtering before the explode/merge keeps
    # the intermediate frames small and gives the same rows.
    directors = imdb_names[imdb_names["primaryProfession"].str.contains("director", na=False)]
    directors = directors.assign(
        knownForTitles=directors["knownForTitles"].str.split(",")
    ).explode("knownForTitles")
    movies_only = imdb_titles[imdb_titles["titleType"] == "movie"]
    credited = pd.merge(directors, movies_only, left_on="knownForTitles", right_on="tconst")

    # set it to lowercase because ml_movies["Title"] are also lowercase
    directors_subset = (
        credited.assign(primaryTitle_lower=credited["primaryTitle"].str.lower())
        [["primaryTitle_lower", "primaryName"]]
        .drop_duplicates()
    )

    # Left join: movies without an IMDb match are kept with a missing director.
    merged_df = pd.merge(
        ml_movies, directors_subset, left_on="Title", right_on="primaryTitle_lower", how="left"
    ).drop(columns=["primaryTitle_lower"])

    # Collapse back to one row per movie; the movie attributes are identical
    # across the duplicated rows, the director names are concatenated.
    movie_cols = [col for col in merged_df.columns if col not in ("MovieId", "primaryName")]
    final_df = (
        merged_df.groupby("MovieId")
        .agg({
            **{col: "first" for col in movie_cols},
            "primaryName": lambda names: ", ".join(names.dropna().unique()),
        })
        .rename(columns={"primaryName": "Directors"})
        .reset_index()
    )

    # clear memory before the (wide) one-hot encoding of the directors
    del merged_df, credited, directors, directors_subset, movies_only, imdb_titles, imdb_names, ml_movies
    df_movies = perf_clean_movie_directors(final_df)

    return ml_ratings, df_movies, ml_users

In [3]:
# Load the MovieLens tables only: with_directors=False makes build_dataset
# return before any IMDb file is read.
df_ratings, df_movies, df_users = build_dataset(with_directors=False)

In [4]:
df_ratings

Unnamed: 0,UserId,MovieId,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [5]:
df_movies

Unnamed: 0,MovieId,Title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,toy story,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,jumanji,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,grumpier old men,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,waiting to exhale,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,father of the bride part ii,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,meet the parents,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,3949,requiem for a dream,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,3950,tigerland,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,3951,two family house,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [6]:
df_users

Unnamed: 0,UserId,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


# Step 3: Model Development

In [7]:
def prepare_dataframe(
    df_users: pd.DataFrame,
    df_movies: pd.DataFrame,
    df_ratings: pd.DataFrame,
    nb_samples=10_000,
    random_state=None,
    )-> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]":
    """Build row-aligned user features, item features and targets from a rating sample.

    Args:
        df_users: Currently unused; kept for interface compatibility.
        df_movies: One row per movie with 'MovieId', 'Title' and one numeric
            indicator column per feature (genres, optionally directors).
        df_ratings: Ratings with 'UserId', 'MovieId', 'Rating', 'Timestamp'.
        nb_samples: Number of ratings to sample; capped at ``len(df_ratings)``.
        random_state: Seed forwarded to ``DataFrame.sample``; None (default)
            draws a different sample on every call.

    Returns:
        ``(user_features, item_features, y)`` with identical row order:
        * user_features: 'UserId', 'MovieId' + the user's mean rating per
          feature column (0 when the user rated no movie with that feature);
        * item_features: 'UserId', 'MovieId', 'Title' + the movie's indicator columns;
        * y: 'UserId', 'MovieId', 'Rating'.

    Notes:
        The per-user means are computed from every sampled rating, including
        the row's own target rating, so the user features leak target
        information into any split made afterwards.
    """
    n_rows = min(nb_samples, len(df_ratings))
    sampled_ratings = df_ratings.drop(columns=["Timestamp"]).sample(n=n_rows, random_state=random_state)

    # building item features
    # This merge is the single base for all three outputs, so a rating whose
    # movie is absent from df_movies is dropped everywhere and rows stay aligned.
    rated_movies = sampled_ratings.merge(df_movies, on="MovieId")

    # building user features
    feature_cols = [col for col in df_movies.columns if col not in ("MovieId", "Title")]
    # indicator * rating: the rating where the movie has the feature, else 0
    weighted = rated_movies[feature_cols].mul(rated_movies["Rating"], axis=0)
    per_user = rated_movies["UserId"]
    rating_sum = weighted.groupby(per_user).sum()
    rating_count = (weighted > 0).groupby(per_user).sum()
    # 0 / 0 (feature never rated by the user) gives NaN -> encode as 0
    user_profile = (rating_sum / rating_count).fillna(0)

    # Every UserId of rated_movies exists in user_profile, so this inner merge
    # keeps all rows in the order of the left frame.
    user_features = rated_movies[["UserId", "MovieId", "Rating"]].merge(
        user_profile.reset_index(), on="UserId"
    )

    # Independent copy so callers can overwrite y['Rating'] safely.
    y = user_features[["UserId", "MovieId", "Rating"]].copy()
    user_features = user_features.drop(columns=["Rating"])
    item_features = rated_movies.drop(columns=["Rating"])

    return user_features, item_features, y

def split_df_into_user_item_rating(df: pd.DataFrame):
    """Split a combined frame into float64 user, item and rating arrays.

    The 'Gender' and 'Age' columns form the user array, 'Rating' forms the
    target array, and every remaining column goes into the item array. The
    dtype and shape of the target array are printed.

    Returns:
        ``(user, item, y)`` as 2-D ``np.float64`` arrays.
    """
    user_columns = ["Gender", "Age"]
    features = df.drop(columns=["Rating"])

    user = features[user_columns].to_numpy().astype(np.float64)
    item = features.drop(columns=user_columns).to_numpy().astype(np.float64)
    y = df[["Rating"]].to_numpy().astype(np.float64)

    print("y type:", y.dtype)
    print("y shape:", y.shape)

    return user, item, y

In [8]:
user_features, item_features, y = prepare_dataframe(df_users, df_movies, df_ratings, nb_samples=1_000_000)
# Number of leading non-feature columns in each frame. Derived from the position
# of "Action" (the first feature column, and the column the model inputs are
# sliced from) instead of hard-coding 2 and 3.
u_offset = user_features.columns.get_loc("Action")
i_offset = item_features.columns.get_loc("Action")
# NOTE(review): y_offset is not referenced in the visible cells.
y_offset = 2

In [9]:
user_features

Unnamed: 0,UserId,MovieId,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,5896,520,1.857143,2.000000,3.000000,2.578947,2.547264,2.571429,0.00,2.727273,2.142857,0.000000,3.166667,2.571429,4.000000,2.736842,2.454545,3.111111,2.000000,4.000000
1,5759,1267,3.475177,3.342466,3.931034,3.612903,3.618785,3.978261,1.00,3.653992,4.000000,4.727273,3.560000,3.880000,3.678571,3.628378,3.512500,3.713115,4.000000,3.375000
2,4637,10,3.538462,3.565217,3.605263,3.397436,3.405594,3.692308,5.00,3.930233,3.285714,3.666667,2.125000,3.892857,4.076923,4.076923,3.292683,3.655738,4.000000,3.750000
3,4024,1343,2.850000,3.000000,4.000000,3.533333,3.309278,3.000000,0.00,3.435294,3.666667,4.666667,2.578947,3.625000,3.625000,3.000000,3.074074,2.653061,4.750000,2.500000
4,710,1079,3.439130,3.510638,3.545455,3.142857,3.310769,3.682353,3.75,3.658730,3.000000,4.500000,3.215909,3.423077,3.428571,3.372093,3.472222,3.433884,3.911765,3.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,3726,3740,3.253165,3.513158,3.730769,3.607143,3.622222,3.321429,3.00,3.509804,3.833333,4.500000,3.666667,3.600000,3.142857,3.705882,3.413462,3.240000,3.736842,3.428571
999996,5359,2193,3.440415,3.387755,3.653061,3.548387,3.744000,3.786885,4.00,3.977273,3.583333,4.000000,3.344828,3.636364,3.904762,3.792453,3.402299,3.644628,3.875000,4.000000
999997,1482,3745,3.333333,3.000000,3.400000,3.200000,3.166667,4.000000,0.00,3.260870,2.000000,4.000000,3.500000,0.000000,3.666667,2.833333,2.571429,3.562500,2.500000,2.000000
999998,1472,2115,3.491429,3.514286,4.300000,4.058824,3.545455,3.409091,0.00,3.706897,3.600000,0.000000,3.250000,4.411765,3.800000,3.657143,3.462963,3.631579,3.809524,3.200000


In [10]:
item_features

Unnamed: 0,UserId,MovieId,Title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,5896,520,robin hood: men in tights,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5759,1267,the manchurian candidate,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
2,4637,10,goldeneye,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4024,1343,cape fear,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,710,1079,a fish called wanda,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,3726,3740,big trouble in little china,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
999996,5359,2193,willow,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
999997,1482,3745,titan a.e.,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
999998,1472,2115,indiana jones and the temple of doom,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
y

Unnamed: 0,UserId,MovieId,Rating
0,5896,520,4
1,5759,1267,5
2,4637,10,3
3,4024,1343,4
4,710,1079,4
...,...,...,...
999995,3726,3740,4
999996,5359,2193,4
999997,1482,3745,3
999998,1472,2115,3


In [12]:
# Split the three row-aligned frames in ONE call: they are guaranteed to share
# the same shuffled indices, and scikit-learn raises if their lengths differ
# instead of silently producing misaligned splits.
user_train, user_test, item_train, item_test, y_train, y_test = train_test_split(
    user_features, item_features, y, train_size=0.80, shuffle=True, random_state=1
)

In [13]:
item_train.loc[:, "Action":]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
771718,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0
521462,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0
137361,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
404985,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
910092,0,0,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491263,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
791624,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
470924,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
491755,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0


In [14]:
user_train.loc[:, "Action":]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
771718,3.251799,3.236559,3.788462,3.273585,3.486567,4.108108,3.571429,4.102679,3.270270,4.916667,3.519231,3.800000,4.357143,3.885417,3.406250,3.785124,3.880000,3.142857
521462,4.367347,4.380952,4.428571,4.333333,4.419192,4.240000,5.000000,4.358586,4.272727,4.750000,4.300000,4.411765,4.571429,4.250000,4.351852,4.507692,4.629630,4.307692
137361,3.257143,3.081081,4.142857,2.750000,2.692982,2.800000,0.000000,3.222222,2.888889,0.000000,2.750000,3.000000,4.000000,2.571429,3.277778,3.500000,4.000000,2.000000
404985,3.533333,3.571429,4.000000,3.625000,3.357724,4.000000,4.200000,3.603352,3.647059,4.250000,3.633333,3.600000,4.000000,3.402597,3.644444,3.734694,3.636364,3.000000
910092,3.213740,3.148148,3.766667,3.359375,3.177474,3.272727,2.000000,3.772727,2.888889,2.333333,2.833333,3.724138,3.777778,3.683453,3.090909,3.305556,3.393939,3.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491263,2.770701,2.962264,3.777778,3.500000,3.076923,2.680851,0.000000,2.977011,3.333333,4.000000,2.125000,4.000000,2.400000,3.261905,2.759259,2.909091,3.000000,2.200000
791624,3.711111,3.685714,3.800000,4.000000,3.803030,4.000000,4.000000,4.098901,3.666667,3.857143,3.769231,4.055556,3.937500,3.865385,3.644444,3.843137,4.333333,4.000000
470924,4.089286,3.892857,4.250000,3.833333,3.863946,4.083333,4.500000,4.166667,4.142857,0.000000,4.061224,3.166667,3.909091,4.063830,3.857143,4.125000,4.500000,3.875000
491755,3.818182,4.037037,4.000000,3.142857,3.818182,3.461538,0.000000,3.736842,3.285714,5.000000,3.623188,4.250000,2.000000,3.461538,3.857143,3.321429,4.363636,4.000000


In [15]:
# Scale the target to [-1, 1], the range of the model output (a dot product of
# two unit vectors).
scalerTarget = MinMaxScaler((-1, 1))

# Always read the raw ratings from the unsplit `y`, selected by the split's row
# labels. Re-running this cell therefore gives the same result; fitting on the
# already-scaled y_train would turn the scaler into the identity and break
# inverse_transform later on.
raw_train_rating = y.loc[y_train.index, ["Rating"]].to_numpy()
raw_test_rating = y.loc[y_test.index, ["Rating"]].to_numpy()

# Fit on the training targets only, then apply the same mapping to the test targets.
scalerTarget.fit(raw_train_rating)
y_train = y_train.assign(Rating=scalerTarget.transform(raw_train_rating).ravel())
y_test = y_test.assign(Rating=scalerTarget.transform(raw_test_rating).ravel())

In [16]:
y_train.loc[:, "Rating":]

Unnamed: 0,Rating
771718,0.0
521462,0.5
137361,-1.0
404985,0.5
910092,0.0
...,...
491263,0.5
791624,0.5
470924,0.5
491755,1.0


In [24]:
def _build_tower(output_shape):
    """Return one encoder tower: a ReLU MLP followed by an L2-normalised embedding.

    Args:
        output_shape: Size of the embedding vector produced by the tower.
    """
    return tf.keras.models.Sequential(
        [
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(output_shape),
            # Unit-length embeddings; the declared per-sample shape is the
            # rank-1 vector (output_shape,), not (output_shape, 1).
            tf.keras.layers.Lambda(lambda x: K.l2_normalize(x, axis=1), output_shape=(output_shape,)),
        ]
    )


def build_model(nb_user_features, nb_item_features, output_shape=32):
    """Build the two-tower model scoring a (user, item) pair.

    Each input goes through its own tower (same architecture, separate
    weights); the model output is the dot product of the two L2-normalised
    embeddings, i.e. their cosine similarity.

    Args:
        nb_user_features: Width of the user input vector.
        nb_item_features: Width of the item input vector.
        output_shape: Size of both embeddings.

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``[user_input, item_input]``.
        The model summary is printed as a side effect.
    """
    # Fixed seed so repeated builds start from the same initial weights.
    tf.random.set_seed(1)
    user_NN = _build_tower(output_shape)
    item_NN = _build_tower(output_shape)

    # create the user input and point to the base network
    input_user = tf.keras.layers.Input(shape=(nb_user_features,))
    vu = user_NN(input_user)

    # create the item input and point to the base network
    input_item = tf.keras.layers.Input(shape=(nb_item_features,))
    vm = item_NN(input_item)

    # compute the dot product of the two vectors vu and vm
    output = tf.keras.layers.Dot(axes=1)([vu, vm])

    # specify the inputs and output of the model
    model = tf.keras.Model([input_user, input_item], output)

    model.summary()
    return model

def train_model(
    model,
    nb_epochs,
    user_train,
    item_train,
    y_train,
    user_test,
    item_test,
    y_test,
    learning_rate=0.01
    ):
    """Compile `model` with Adam + mean squared error and fit it.

    Feature matrices are taken from the "Action" column onwards in the
    user/item frames; the target is the 'Rating' column of the y frames.

    Args:
        model: Keras model taking ``[user_features, item_features]``.
        nb_epochs: Number of training epochs.
        user_train, item_train, y_train: Training frames (row-aligned).
        user_test, item_test, y_test: Frames used as validation data.
        learning_rate: Adam learning rate.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    cost_fn = tf.keras.losses.MeanSquaredError()
    opt = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=opt, loss=cost_fn)

    train_inputs = [user_train.loc[:, "Action":].to_numpy(), item_train.loc[:, "Action":].to_numpy()]
    validation_inputs = [user_test.loc[:, "Action":].to_numpy(), item_test.loc[:, "Action":].to_numpy()]

    return model.fit(
        train_inputs,
        y_train.loc[:, "Rating":].to_numpy(),
        # honour the caller's epoch count (it used to be hard-coded to 5)
        epochs=nb_epochs,
        validation_data=(validation_inputs, y_test.loc[:, "Rating":].to_numpy()),
    )

def use_model(
    model,
    user_features,
    item_features
    ):
    """Return ``model.predict`` for the given user/item frames.

    Both frames are reduced to the columns from "Action" onwards and converted
    to NumPy arrays before being passed to the model as ``[user, item]``.
    """
    user_matrix = user_features.loc[:, "Action":].to_numpy()
    item_matrix = item_features.loc[:, "Action":].to_numpy()
    return model.predict([user_matrix, item_matrix])

def build_result_comparison(scaler, y_df, y_pred):
    """Return a copy of `y_df` with true and predicted ratings on the original scale.

    Args:
        scaler: Fitted scaler exposing ``inverse_transform``.
        y_df: Frame whose 'Rating' column holds the scaled targets; not modified.
        y_pred: Scaled predictions, one per row of `y_df`.

    Returns:
        A new frame where 'Rating' is inverse-transformed and a 'Prediction'
        column with the inverse-transformed predictions is appended.
    """
    scaled_truth = y_df["Rating"].to_numpy().reshape(-1, 1)
    true_ratings = scaler.inverse_transform(scaled_truth).flatten()
    predicted_ratings = scaler.inverse_transform(y_pred).flatten()
    return y_df.assign(Rating=true_ratings, Prediction=predicted_ratings)

def build_matrix(df_rating_prediction, value_column):
    """Pivot the long frame into a UserId x MovieId matrix of `value_column`.

    Cells for (user, movie) pairs absent from the input are NaN.
    """
    pivot_spec = {"index": "UserId", "columns": "MovieId", "values": value_column}
    return df_rating_prediction.pivot(**pivot_spec)

In [18]:
# Input widths = number of columns minus the leading non-feature columns
# (u_offset for the user frame, i_offset for the item frame).
model = build_model(user_train.shape[1] - u_offset, item_train.shape[1] - i_offset)

In [19]:
# Second positional argument is nb_epochs; the test split is passed as validation data.
train_model(model, 5, user_train, item_train, y_train, user_test, item_test, y_test)

Epoch 1/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 925us/step - loss: 0.2989 - val_loss: 0.2904
Epoch 2/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 936us/step - loss: 0.2893 - val_loss: 0.2889
Epoch 3/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 946us/step - loss: 0.2883 - val_loss: 0.2885
Epoch 4/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 953us/step - loss: 0.2880 - val_loss: 0.2884
Epoch 5/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 957us/step - loss: 0.2878 - val_loss: 0.2883


In [20]:
# Raw model output on the training split (still on the scaled target range).
# NOTE(review): y_p is not referenced again in the visible cells.
y_p = model.predict([user_train.loc[:, "Action":].to_numpy(), item_train.loc[:, "Action":].to_numpy()])

[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 624us/step


In [21]:
# Predict on the test split, then map both the predictions and the scaled
# targets back to the original rating scale with the fitted target scaler.
y_pred = use_model(model, user_test, item_test)
df_rating_pred = build_result_comparison(scalerTarget, y_test, y_pred)
df_rating_pred

[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 651us/step


Unnamed: 0,UserId,MovieId,Rating,Prediction
276826,5158,3508,5.0,3.878688
849425,419,2394,5.0,3.705214
504499,1835,175,4.0,3.812285
601054,5550,2537,2.0,3.818166
980221,4783,1252,5.0,3.962602
...,...,...,...,...
555867,4117,1722,4.0,3.314896
30004,5184,21,5.0,3.641390
124730,1937,1029,4.0,3.739053
195783,5488,497,5.0,3.579370


In [25]:
# UserId x MovieId matrix of predicted ratings; pairs that are not in the test
# split stay NaN.
matrix_pred = build_matrix(df_rating_pred, "Prediction")
matrix_pred

MovieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,,,3.757513,,3.567794,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


In [26]:
# Same layout as the prediction matrix, filled with the true ratings of the test split.
matrix_true = build_matrix(df_rating_pred, "Rating")
matrix_true

MovieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,,,2.0,,3.0,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,
