In [15]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

import tensorflow.keras.backend as K

# Step 1 and 2: Data Collection, Preprocessing and Feature Engineering

In [2]:
def clean_movie_title(movie_title: str) -> str:
    if movie_title.split(" ")[-1].startswith("("):
        # remove year from the title, e.g. Toy Story (1995) --> Toy Story
        movie_title = (" ".join(movie_title.split(" ")[:-1])).strip()

    if movie_title.title().split(',')[-1].strip() in ['The', 'A']:
        # article + movie title, e.g. Saint, The --> The Saint
        movie_title = (movie_title.title().split(',')[-1].strip() + " " + " ".join(movie_title.title().split(',')[:-1])).strip()

    # otherwise, it was converting The Devil's Advocate to The Devil'S Advocate
    movie_title = movie_title.lower()
    return movie_title

def perf_clean_movie_genre(df: pd.DataFrame) -> pd.DataFrame:
    # Split the 'genres' column into separate rows
    df['Genres'] = df['Genres'].str.split('|')
    
    # Explode the 'genres' column
    df_exploded = df.explode('Genres')
    
    # One-hot encode the 'genres' column
    df_one_hot = pd.get_dummies(df_exploded['Genres'])
    
    # Group by the original index and sum up the one-hot encoded values
    df_one_hot_grouped = df_one_hot.groupby(df_exploded.index).sum()
    
    # Combine the one-hot encoded genres with the original dataframe (excluding the old 'genres' column)
    df_combined = pd.concat([df.drop(columns=['Genres']), df_one_hot_grouped], axis=1)

    return df_combined

def perf_clean_movie_directors(df: pd.DataFrame) -> pd.DataFrame:
    # Split the 'genres' column into separate rows
    df['Directors'] = df['Directors'].str.split(',')
    
    # Explode the 'genres' column
    df_exploded = df.explode('Directors')
    
    # One-hot encode the 'genres' column
    df_one_hot = pd.get_dummies(df_exploded['Directors'])
    
    # Group by the original index and sum up the one-hot encoded values
    df_one_hot_grouped = df_one_hot.groupby(df_exploded.index).sum()
    
    # Combine the one-hot encoded genres with the original dataframe (excluding the old 'genres' column)
    df_combined = pd.concat([df.drop(columns=['Directors']), df_one_hot_grouped], axis=1)

    return df_combined



def build_dataset() -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    m_cols = ["MovieId", "Title", "Genres"]
    r_cols = ["UserId", "MovieId", "Rating", "Timestamp"]
    u_cols = ["UserId", "Gender", "Age", "Occupation", "Zip-code"]
    ml_ratings = pd.read_csv("./data/movielens/ratings.dat", sep="::", engine="python", encoding='latin-1', names=r_cols)
    ml_movies = pd.read_csv("./data/movielens/movies.dat", sep="::", engine="python", encoding='latin-1', names=m_cols)
    ml_users = pd.read_csv("./data/movielens/users.dat", sep="::", engine="python", encoding='latin-1', names=u_cols)

    ml_movies["Title"] = ml_movies["Title"].apply(lambda x: clean_movie_title(x))
    ml_movies = perf_clean_movie_genre(ml_movies)

    imdb_names = pd.read_csv("./data/imdb/name.basics.tsv", sep="\t")
    imdb_titles = pd.read_csv("./data/imdb/title.basics.tsv", sep="\t")
    imdb_titles = imdb_titles[imdb_titles["titleType"] == "movie"]
    imdb_names_exploded = imdb_names.assign(knownForTitles=imdb_names['knownForTitles'].str.split(',')).explode('knownForTitles')
    imdb_titles = pd.merge(imdb_names_exploded, imdb_titles, left_on='knownForTitles', right_on='tconst')
    imdb_titles = imdb_titles[imdb_titles['primaryProfession'].str.contains("director")]

    # set it to lowercase because ml_movies["Title"] are also lowercase
    imdb_titles['primaryTitle_lower'] = imdb_titles['primaryTitle'].str.lower()
    directors_subset = imdb_titles[['primaryTitle_lower', 'primaryName']]
    
    merged_df = pd.merge(ml_movies, directors_subset, left_on='Title', right_on='primaryTitle_lower', how='left')
    
    # Drop the 'primaryTitle_lower' column as it is not needed anymore
    merged_df = merged_df.drop(columns=['primaryTitle_lower'])
    final_df = merged_df.groupby('MovieId').agg({
        **{col: 'first' for col in merged_df.columns if col != 'primaryName'},
        'primaryName': lambda x: ', '.join(x.dropna().unique())  # Concatenate director names
    })
    final_df = final_df.rename(columns={"primaryName": "Directors"})
    final_df = final_df.drop(columns=['MovieId']).reset_index()
    # clear memory 
    del merged_df, imdb_titles, imdb_names, ml_movies
    df_movies = perf_clean_movie_directors(final_df)
    

    return ml_ratings, df_movies, ml_users

In [3]:
df_ratings, df_movies, df_users = build_dataset()

  imdb_titles = pd.read_csv("./data/imdb/title.basics.tsv", sep="\t")


In [4]:
df_ratings

Unnamed: 0,UserId,MovieId,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [5]:
df_movies

Unnamed: 0,MovieId,Title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Yvonne Rainer,Zachary Gamburg,Zane Buzby,Zdenek Fiala,Zelda Barron,Zeljko Antovic,Zerlina Hughes,Zoe R. Cassavetes,Zvika Aloni,Étienne Chatiliez
0,1,toy story,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,jumanji,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,grumpier old men,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,waiting to exhale,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,father of the bride part ii,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,meet the parents,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3879,3949,requiem for a dream,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3880,3950,tigerland,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3881,3951,two family house,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
df_users

Unnamed: 0,UserId,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


# Step 3: Model Development

In [7]:
def prepare_dataframe(df_users: pd.DataFrame, df_movies: pd.DataFrame, df_ratings: pd.DataFrame, nb_samples=10_000) -> pd.DataFrame:
    truncated_df_ratings = df_ratings.drop(columns=["Timestamp"]).sample(n=nb_samples)
    df_movies_ratings = pd.merge(df_movies, truncated_df_ratings, left_on="MovieId", right_on="MovieId", how="left")
    df_movies_ratings = df_movies_ratings[df_movies_ratings['Rating'].notna()]
    df_movies_ratings_users = pd.merge(df_movies_ratings, df_users, left_on="UserId", right_on="UserId", how="left")
    df_final = df_movies_ratings_users.drop(columns=["MovieId", "Title", "UserId", "Zip-code", "Occupation"])
    df_final = df_final.astype({'Rating': 'int32', 'Age': 'int32'})
    df_final["Gender"] = df_final["Gender"].apply(lambda x: 1 if x == "M" else 0)

    return df_final.dropna()

def split_df_into_user_item_rating(df: pd.DataFrame):
    y = df[["Rating"]]
    X = df.drop(columns=["Rating"])

    user = X[["Gender", "Age"]]
    item = X.drop(columns=["Gender", "Age"])

    user = user.to_numpy().astype(np.float64)
    item = item.to_numpy().astype(np.float64)
    y = y.to_numpy().astype(np.float64)
    print("y type:", y.dtype)
    print("y shape:", y.shape)

    return user, item, y

In [8]:
dataset = prepare_dataframe(df_users, df_movies, df_ratings)

In [9]:
dataset

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,Zdenek Fiala,Zelda Barron,Zeljko Antovic,Zerlina Hughes,Zoe R. Cassavetes,Zvika Aloni,Étienne Chatiliez,Rating,Gender,Age
0,0,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,4,1,18
1,0,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,5,1,25
2,0,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,5,0,45
3,0,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,5,1,35
4,0,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,5,1,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,5,1,25
9996,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,4,1,56
9997,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,3,1,18
9998,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,4,0,25


In [10]:
for t in dataset.dtypes.items():
    if t[1] != "int64":
        print(t)

('Rating', dtype('int32'))
('Age', dtype('int32'))


In [31]:
df_train, df_test = train_test_split(
    dataset, train_size=0.80, shuffle=True, random_state=1
)

user_train, item_train, y_train = split_df_into_user_item_rating(df_train)
user_test, item_test, y_test = split_df_into_user_item_rating(df_test)

y type: float64
y shape: (8000, 1)
y type: float64
y shape: (2000, 1)


In [32]:
scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y_train.reshape(-1, 1))
y_train = scalerTarget.transform(y_train.reshape(-1, 1))
y_test = scalerTarget.transform(y_test.reshape(-1, 1))

In [27]:
def build_model(nb_user_features, nb_item_features, output_shape=32):
    tf.random.set_seed(1)
    user_NN = tf.keras.models.Sequential(
        [
            ### START CODE HERE ###
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(output_shape),
            tf.keras.layers.Lambda(lambda x: K.l2_normalize(x,axis=1), output_shape=(output_shape,1)),
            ### END CODE HERE ###
        ]
    )
    
    item_NN = tf.keras.models.Sequential(
        [
            ### START CODE HERE ###
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(output_shape),
            tf.keras.layers.Lambda(lambda x: K.l2_normalize(x,axis=1), output_shape=(output_shape,1)),
            ### END CODE HERE ###
        ]
    )
    
    # create the user input and point to the base network
    input_user = tf.keras.layers.Input(shape=(nb_user_features,))
    vu = user_NN(input_user)
    
    # create the item input and point to the base network
    input_item = tf.keras.layers.Input(shape=(nb_item_features,))
    vm = item_NN(input_item)
    
    # compute the dot product of the two vectors vu and vm
    output = tf.keras.layers.Dot(axes=1)([vu, vm])
    
    # specify the inputs and output of the model
    model = tf.keras.Model([input_user, input_item], output)
    
    model.summary()
    return model

In [38]:
model = build_model(user_train.shape[1], item_train.shape[1])

cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt, loss=cost_fn)

model.fit(
    [user_train, item_train],
    y_train,
    epochs=5,
    validation_data=([user_test, item_test], y_test),
)

Epoch 1/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 67ms/step - loss: 0.2902 - val_loss: 0.2948
Epoch 2/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 59ms/step - loss: 0.2170 - val_loss: 0.3075
Epoch 3/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 59ms/step - loss: 0.1853 - val_loss: 0.3286
Epoch 4/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 60ms/step - loss: 0.1659 - val_loss: 0.3391
Epoch 5/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 60ms/step - loss: 0.1566 - val_loss: 0.3493


<keras.src.callbacks.history.History at 0x7fc6ea488590>

In [39]:
y_test

array([[ 1. ],
       [ 0. ],
       [-0.5],
       ...,
       [ 0.5],
       [ 0.5],
       [ 1. ]])

In [40]:
y_train

array([[0. ],
       [0.5],
       [0. ],
       ...,
       [0.5],
       [0. ],
       [1. ]])

In [41]:
y_p = model.predict([user_train, item_train])

[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


In [42]:
y_p

array([[ 0.21532148],
       [ 0.15777045],
       [ 0.2162808 ],
       ...,
       [ 0.4554498 ],
       [-0.18566775],
       [ 0.4608903 ]], dtype=float32)

In [43]:
y_pu = scalerTarget.inverse_transform(y_p)
y_train_u = scalerTarget.inverse_transform(y_train)

In [44]:
y_pu

array([[3.430643 ],
       [3.3155408],
       [3.4325616],
       ...,
       [3.9108996],
       [2.6286645],
       [3.9217806]], dtype=float32)

In [45]:
y_train_u

array([[3.],
       [4.],
       [3.],
       ...,
       [4.],
       [3.],
       [5.]])