# Recommender System - Training Notebook

This notebook contains the steps required to train a Recommendation System with Neural Networks. 

*Disclaimer: This notebook is developed with content from* **DeepLearning.AI** *'s ML Specialization Course.*

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Dot
from keras.losses import MeanSquaredError
from keras.optimizers import Adam
from keras.ops import normalize

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## Training Data Preparation

In [2]:
file_path = "Data/training_data/"

users_data = pd.read_csv(file_path+"x_users_data.csv")
users_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 20 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   War          100836 non-null  float64
 1   Thriller     100836 non-null  float64
 2   Sci-Fi       100836 non-null  float64
 3   Documentary  100836 non-null  float64
 4   Romance      100836 non-null  float64
 5   Horror       100836 non-null  float64
 6   Western      100836 non-null  float64
 7   Film-Noir    100836 non-null  float64
 8   Adventure    100836 non-null  float64
 9   Action       100836 non-null  float64
 10  Musical      100836 non-null  float64
 11  Mystery      100836 non-null  float64
 12  Crime        100836 non-null  float64
 13  Fantasy      100836 non-null  float64
 14  Unknown      100836 non-null  float64
 15  IMAX         100836 non-null  float64
 16  Comedy       100836 non-null  float64
 17  Animation    100836 non-null  float64
 18  Children     100836 non-

In [3]:
x_movies_data = pd.read_csv(file_path+"x_movies_data.csv")
movies_data = x_movies_data.drop(["movieID"], axis=1)
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 22 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   year            100836 non-null  float64
 1   average_rating  100836 non-null  float64
 2   War             100836 non-null  float64
 3   Thriller        100836 non-null  float64
 4   Sci-Fi          100836 non-null  float64
 5   Documentary     100836 non-null  float64
 6   Romance         100836 non-null  float64
 7   Horror          100836 non-null  float64
 8   Western         100836 non-null  float64
 9   Film-Noir       100836 non-null  float64
 10  Adventure       100836 non-null  float64
 11  Action          100836 non-null  float64
 12  Musical         100836 non-null  float64
 13  Mystery         100836 non-null  float64
 14  Crime           100836 non-null  float64
 15  Fantasy         100836 non-null  float64
 16  Unknown         100836 non-null  float64
 17  IMAX      

In [4]:
ratings_data = pd.read_csv(file_path+"y_data.csv")
ratings_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 1 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   Ratings  100836 non-null  float64
dtypes: float64(1)
memory usage: 787.9 KB


### Feature Scaling

In [5]:
## Scaling User Data

users_scaler = StandardScaler()
users_scaler.fit(users_data)

users_data_scaled = users_scaler.transform(users_data)
users_data_scaled

array([[1.08215047, 1.33022989, 1.53248009, ..., 1.35184514, 1.52724654,
        1.93760734],
       [1.08215047, 1.33022989, 1.53248009, ..., 1.35184514, 1.52724654,
        1.93760734],
       [1.08215047, 1.33022989, 1.53248009, ..., 1.35184514, 1.52724654,
        1.93760734],
       ...,
       [0.14698461, 0.2056934 , 0.48698599, ..., 0.51791421, 0.43636113,
        0.5253731 ],
       [0.14698461, 0.2056934 , 0.48698599, ..., 0.51791421, 0.43636113,
        0.5253731 ],
       [0.14698461, 0.2056934 , 0.48698599, ..., 0.51791421, 0.43636113,
        0.5253731 ]])

In [6]:
## Scaling Movies Data

movies_scaler = StandardScaler()
movies_scaler.fit(movies_data)

movies_data_scaled = movies_scaler.transform(movies_data)
movies_data_scaled

array([[ 0.03009026,  0.74046737, -0.22500381, ...,  3.66468004,
         3.15450666, -0.8436551 ],
       [ 0.03009026, -0.42764262, -0.22500381, ..., -0.27287512,
        -0.31700678, -0.8436551 ],
       [ 0.03009026,  0.79356328, -0.22500381, ..., -0.27287512,
        -0.31700678, -0.8436551 ],
       ...,
       [ 0.77281057,  0.22720692, -0.22500381, ..., -0.27287512,
        -0.31700678, -0.8436551 ],
       [ 0.77281057,  1.37761827, -0.22500381, ..., -0.27287512,
        -0.31700678, -0.8436551 ],
       [ 0.77281057, -2.07361579, -0.22500381, ..., -0.27287512,
        -0.31700678,  1.18531851]])

In [7]:
## Scaling Ratings

ratings_scaler = MinMaxScaler()
ratings_scaler.fit(ratings_data)

ratings_data_scaled = ratings_scaler.transform(ratings_data)
ratings_data_scaled

array([[0.77777778],
       [0.77777778],
       [0.77777778],
       ...,
       [1.        ],
       [1.        ],
       [0.55555556]])

In [8]:
## Conversion Check

print(np.allclose(users_data, users_scaler.inverse_transform(users_data_scaled)))
print(np.allclose(movies_data, movies_scaler.inverse_transform(movies_data_scaled)))
print(np.allclose(ratings_data, ratings_scaler.inverse_transform(ratings_data_scaled)))

True
True
True


## Training-Test Data Split

In [9]:
users_train, users_test = train_test_split(users_data_scaled, test_size=0.2, random_state=10, shuffle=True)
movies_train, movies_test = train_test_split(movies_data_scaled, test_size=0.2, random_state=10, shuffle=True)
ratings_train, ratings_test = train_test_split(ratings_data_scaled, test_size=0.2, random_state=10, shuffle=True)

In [10]:
print("Training Data Shape [Users, Movies, Ratings]:", users_train.shape, movies_train.shape, ratings_train.shape)
print("Testing Data Shape [Users, Movies, Ratings]:", users_test.shape, movies_test.shape, ratings_test.shape)

Training Data Shape [Users, Movies, Ratings]: (80668, 20) (80668, 22) (80668, 1)
Testing Data Shape [Users, Movies, Ratings]: (20168, 20) (20168, 22) (20168, 1)


## Model Creation

In [11]:
users_input_shape = users_data.shape[1]
movies_input_shape = movies_data.shape[1]
output_size = 32

In [12]:
tf.random.set_seed(10)

users_nn_layers = Sequential([
    Dense(256, activation="relu"),
    Dense(128, activation="relu"),
    Dense(output_size, activation="linear")
])

movies_nn_layers = Sequential([
    Dense(256, activation="relu"),
    Dense(128, activation="relu"),
    Dense(output_size, activation="linear")
])

user_input_layer = Input(shape=(users_input_shape, ))
users_model = users_nn_layers(user_input_layer)
users_model = normalize(users_model, axis=1)

movie_input_layer = Input(shape=(movies_input_shape, ))
movies_model = movies_nn_layers(movie_input_layer)
movies_model = normalize(movies_model, axis=1)

output_layer = Dot(axes=1)([users_model, movies_model])

model_recsys = Model([user_input_layer, movie_input_layer], output_layer)

model_recsys.summary()

In [13]:
cost_fn = MeanSquaredError()
optimizer = Adam(learning_rate=0.01)

model_recsys.compile(optimizer=optimizer, loss = cost_fn)
model_recsys.summary()

In [14]:
model_recsys.fit([users_train, movies_train], ratings_train, epochs=30)

Epoch 1/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 5ms/step - loss: 0.0374
Epoch 2/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 0.0325
Epoch 3/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 0.0319
Epoch 4/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 0.0313
Epoch 5/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.0308
Epoch 6/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.0304
Epoch 7/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.0300
Epoch 8/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 0.0297
Epoch 9/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 0.0294
Epoch 10/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

<keras.src.callbacks.history.History at 0x2f103b19c10>

In [15]:
model_recsys.evaluate([users_test, movies_test], ratings_test)

[1m631/631[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.0294


0.029710063710808754

In [16]:
movies_inf_layer = Input(shape=(movies_input_shape, ))
me_model = movies_nn_layers(movies_inf_layer)
me_model = normalize(me_model, axis=1)
movies_encoder = Model(movies_inf_layer, me_model)
movies_encoder.summary()

In [17]:
users_inf_layer = Input(shape=(users_input_shape, ))
ue_model = users_nn_layers(users_inf_layer)
ue_model = normalize(ue_model, axis=1)
users_encoder = Model(users_inf_layer, ue_model)
users_encoder.summary()

In [18]:
## Saving the model

model_recsys.save("models/main_model.keras")
movies_encoder.save("models/movies_encoder.keras")
users_encoder.save("models/users_encoder.keras")

In [19]:
## Saving the Scaling Objects

import pickle

scalers = {
    "users": users_scaler,
    "movies": movies_scaler,
    "ratings": ratings_scaler
}

with open("models/scalers.pickle", "wb") as file:
    pickle.dump(scalers, file)

## Inference

### New User

In [20]:
users_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 20 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   War          100836 non-null  float64
 1   Thriller     100836 non-null  float64
 2   Sci-Fi       100836 non-null  float64
 3   Documentary  100836 non-null  float64
 4   Romance      100836 non-null  float64
 5   Horror       100836 non-null  float64
 6   Western      100836 non-null  float64
 7   Film-Noir    100836 non-null  float64
 8   Adventure    100836 non-null  float64
 9   Action       100836 non-null  float64
 10  Musical      100836 non-null  float64
 11  Mystery      100836 non-null  float64
 12  Crime        100836 non-null  float64
 13  Fantasy      100836 non-null  float64
 14  Unknown      100836 non-null  float64
 15  IMAX         100836 non-null  float64
 16  Comedy       100836 non-null  float64
 17  Animation    100836 non-null  float64
 18  Children     100836 non-

In [21]:
genres = list(users_data.columns)
genres

['War',
 'Thriller',
 'Sci-Fi',
 'Documentary',
 'Romance',
 'Horror',
 'Western',
 'Film-Noir',
 'Adventure',
 'Action',
 'Musical',
 'Mystery',
 'Crime',
 'Fantasy',
 'Unknown',
 'IMAX',
 'Comedy',
 'Animation',
 'Children',
 'Drama']

In [22]:
movies = pd.read_csv("Data/MovieLens/ml-latest-small/movies.csv")
ratings = pd.read_csv("Data/MovieLens/ml-latest-small/ratings.csv")

In [23]:
def fetch_movie_row(movie, user_ratings, ratings_row, ratings_count):
    movie_id = movie["movieId"]
    movie_genres = movie["genres"].split("|")
    rating_row = user_ratings.loc[user_ratings["movieId"] == movie_id]
    curr_rating = rating_row["rating"]
    for genre in movie_genres:
        idx = genres.index(genre)
        ratings_row[idx] += curr_rating
        ratings_count[idx] += 1

def calculate_user_vector(user_ratings = None, uid = None):
    if(user_ratings is None):
        user_ratings = ratings.loc[ratings["userId"] == uid]
    rated_movies = movies.loc[movies["movieId"].isin(user_ratings["movieId"])]
    ratings_arr_row = np.zeros(users_data.shape[1])
    ratings_count_arr = np.zeros(users_data.shape[1])
    rated_movies.apply(lambda x: fetch_movie_row(x, user_ratings, ratings_arr_row, ratings_count_arr), axis=1)
    # print("Ratings Row:", ratings_arr_row)
    # print("Ratings Count:", ratings_count_arr)
    result = np.divide(ratings_arr_row, ratings_count_arr, out=np.zeros(users_data.shape[1]), where=ratings_count_arr!=0)
    # print("Calculation:", result)
    return np.round(result, 2)

In [48]:
import time

new_user_ratings = [
    [-1, 89745, 4.5, int(time.time() * 1000)],
    [-1, 122892, 3.5, int(time.time() * 1000)],
    [-1, 122912, 4.0, int(time.time() * 1000)],
    [-1, 136864, 4.0, int(time.time() * 1000)],
    [-1, 59315, 4.0, int(time.time() * 1000)],
    [-1, 102125, 3.5, int(time.time() * 1000)],
    [-1, 122904, 4.5, int(time.time() * 1000)],
    [-1, 99114, 4.0, int(time.time() * 1000)],
    [-1, 68157, 4.5, int(time.time() * 1000)],
    [-1, 162082, 4.0, int(time.time() * 1000)],
    [-1, 157340, 4.0, int(time.time() * 1000)],
    [-1, 149406, 4.0, int(time.time() * 1000)],
]

new_user_ratings = pd.DataFrame(new_user_ratings, columns=ratings.columns)
new_user_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,-1,89745,4.5,1742033128947
1,-1,122892,3.5,1742033128947
2,-1,122912,4.0,1742033128947
3,-1,136864,4.0,1742033128947
4,-1,59315,4.0,1742033128947
5,-1,102125,3.5,1742033128947
6,-1,122904,4.5,1742033128947
7,-1,99114,4.0,1742033128947
8,-1,68157,4.5,1742033128947
9,-1,162082,4.0,1742033128947


In [49]:
new_user_vector = calculate_user_vector(new_user_ratings)

  ratings_row[idx] += curr_rating


In [26]:
def load_movies_data():
    movies_data = pd.read_csv("Data/training_data/movies_encoded_data.csv")
    movie_ids = movies_data["movieID"].copy(deep=True)
    movies_data = movies_data.drop(["movieID"], axis=1)
    return movies_data, movie_ids

all_movies_data, movie_ids = load_movies_data()
all_movies_data.info()
movie_ids.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   year            9742 non-null   float64
 1   average_rating  9742 non-null   float64
 2   War             9742 non-null   float64
 3   Thriller        9742 non-null   float64
 4   Sci-Fi          9742 non-null   float64
 5   Documentary     9742 non-null   float64
 6   Romance         9742 non-null   float64
 7   Horror          9742 non-null   float64
 8   Western         9742 non-null   float64
 9   Film-Noir       9742 non-null   float64
 10  Adventure       9742 non-null   float64
 11  Action          9742 non-null   float64
 12  Musical         9742 non-null   float64
 13  Mystery         9742 non-null   float64
 14  Crime           9742 non-null   float64
 15  Fantasy         9742 non-null   float64
 16  Unknown         9742 non-null   float64
 17  IMAX            9742 non-null   f

In [50]:
def recommend_movies(user_vector):
    ## Get Data
    all_movies_data, movie_ids = load_movies_data()
    model_user_data = np.tile(new_user_vector, (all_movies_data.shape[0], 1))

    ##Scale Data
    model_movies_data = movies_scaler.transform(all_movies_data)
    model_user_data = users_scaler.transform(model_user_data)

    ## Pass through Model
    result = model_recsys.predict([model_user_data, all_movies_data])

    ## inverse transform result
    pred_ratings = ratings_scaler.inverse_transform(result)
    indexes = np.argsort(pred_ratings, axis=0).reshape(-1)[::-1]

    ## Return Movie Indexes
    return movie_ids[indexes]    

pred_movie_ids = recommend_movies(new_user_ratings)

[1m 26/305[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 2ms/step 



[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [52]:
pred_movie_ids[:10]

9542    172637.0
3759      5244.0
8551    115727.0
5448     26073.0
5503     26401.0
3081      4135.0
7097     70451.0
7525     84512.0
5466     26169.0
3110      4180.0
Name: movieID, dtype: float64

In [57]:
movies.loc[movies["movieId"].isin(pred_movie_ids[:20])]

Unnamed: 0,movieId,title,genres
1540,2075,Mephisto (1981),Drama|War
2740,3678,"Man with the Golden Arm, The (1955)",Drama
3081,4135,"Monster Squad, The (1987)",Adventure|Comedy|Horror
3110,4180,Reform School Girls (1986),Action|Drama
3759,5244,Shogun Assassin (1980),Action|Adventure
3908,5490,The Big Bus (1976),Action|Comedy
4474,6611,Umberto D. (1952),Drama
4595,6835,Alien Contamination (1980),Action|Horror|Sci-Fi
4783,7122,King of Hearts (1966),Comedy|Drama|War
5426,25887,Tales of Manhattan (1942),Comedy|Drama


In [None]:
def retrieve_ordered_ids(df, id_list, id_column='id'):
    """
    Retrieves rows from a DataFrame based on a list of IDs, maintaining the order.

    Args:
        df (pd.DataFrame): The DataFrame to search.
        id_list (list): A list of IDs to retrieve.
        id_column (str): The name of the column containing the IDs.

    Returns:
        pd.DataFrame: A DataFrame containing the retrieved rows in the specified order.
    """

    # Filter the DataFrame using isin()
    filtered_df = df[df[id_column].isin(id_list)]

    # Set the ID column as the index for efficient reindexing
    filtered_df = filtered_df.set_index(id_column)

    # Reindex the DataFrame using the provided ID list
    result_df = filtered_df.reindex(id_list)

    # Reset the index to restore the original DataFrame structure
    result_df = result_df.reset_index()

    return result_df

# Example Usage:
data = {'id': [3, 1, 4, 2, 5],
        'value': ['a', 'b', 'c', 'd', 'e']}
df = pd.DataFrame(data)

ids_to_find = [2, 5, 1]
ordered_result = retrieve_ordered_ids(df, ids_to_find)

print(ordered_result)

   id value
0   2     d
1   5     e
2   1     b


In [62]:
ordered_df = retrieve_ordered_ids(movies, pred_movie_ids, "movieId")

In [64]:
ordered_df.loc[:20]

Unnamed: 0,movieID,title,genres
0,172637.0,Priklyucheniya Kapitana Vrungelya (1979),Action|Adventure|Animation|Comedy
1,5244.0,Shogun Assassin (1980),Action|Adventure
2,115727.0,Crippled Avengers (Can que) (Return of the 5 D...,Action|Adventure
3,26073.0,"Human Condition III, The (Ningen no joken III)...",Drama|War
4,26401.0,Last Hurrah for Chivalry (Hao xia) (1979),Action|Drama
5,4135.0,"Monster Squad, The (1987)",Adventure|Comedy|Horror
6,70451.0,Max Manus (2008),Action|Drama|War
7,84512.0,Girls About Town (1931),Comedy
8,26169.0,Branded to Kill (Koroshi no rakuin) (1967),Action|Crime|Drama
9,4180.0,Reform School Girls (1986),Action|Drama
