# Import Libraries

In [15]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from keras.models import Model
from keras.layers import Embedding, Input, Flatten, Dot, Dense, Dropout, Concatenate
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
import re

# Load Datasets

In [2]:
movies = pd.read_csv('/content/drive/MyDrive/Recommender Dataset/Project/ml-latest-small/movies.csv')
ratings = pd.read_csv('/content/drive/MyDrive/Recommender Dataset/Project/ml-latest-small/ratings.csv')

# Data Preprocessing

In [3]:
def correct_movie_titles(movies):
    def correct_title(title):
        return re.sub(r'(.+), The$', r'The \1', title)

    movies['title'] = movies['title'].apply(correct_title)
    return movies

In [4]:
movies = correct_movie_titles(movies)

In [22]:
has_duplicates = movies.duplicated().any()
has_duplicates

False

In [5]:
# Get the maximum user and movie IDs
max_userid = ratings['userId'].drop_duplicates().max()
max_movieid = ratings['movieId'].drop_duplicates().max()

In [6]:
# Shuffle the data to ensure randomness
shuffled_ratings = ratings.sample(frac=1., random_state=42)

In [11]:
# Extract users, movies, and ratings arrays
Users = shuffled_ratings['userId'].values
Movies = shuffled_ratings['movieId'].values
Ratings = shuffled_ratings['rating'].values

In [12]:
# Define constants
K_FACTORS = 100  # The number of latent factors for embeddings
TEST_USER = 1

# Model

In [8]:
class CFModel(Model):
    def __init__(self, num_users, num_items, embedding_size):
        # User embedding
        user_input = Input(shape=(1,))
        user_embedding = Embedding(num_users, embedding_size, input_length=1)(user_input)
        user_vec = Flatten()(user_embedding)

        # Item embedding
        item_input = Input(shape=(1,))
        item_embedding = Embedding(num_items, embedding_size, input_length=1)(item_input)
        item_vec = Flatten()(item_embedding)

        # Dot product of user and item embeddings to predict ratings
        dot = Dot(axes=1)([user_vec, item_vec])

        super().__init__(inputs=[user_input, item_input], outputs=dot)
        self.compile(optimizer='adam', loss='mse')

In [9]:
# Instantiate the model
num_users = max_userid + 1
num_items = max_movieid + 1
cf_model = CFModel(num_users, num_items, K_FACTORS)

In [10]:
# Compile the model using MSE as the loss function and the Adam optimizer
cf_model.compile(loss='mse', optimizer='adam')

In [11]:
# Train the model
cf_model.fit([Users, Movies], Ratings, epochs=5, batch_size=64, validation_split=0.2,
             callbacks=[EarlyStopping(patience=2), ModelCheckpoint('model.h5', save_best_only=True)])

Epoch 1/5

  saving_api.save_model(


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x78105b942a10>

In [12]:
# Function to get top-N recommendations for a user
def get_user_based_recommendations(user_id, num_recommendations=10):
    all_movie_ids = np.array([movie_id for movie_id in range(1, max_movieid+1)])
    predicted_ratings = cf_model.predict([np.array([user_id] * len(all_movie_ids)), all_movie_ids])
    top_indices = np.argsort(predicted_ratings.flatten())[::-1][:num_recommendations]
    recommended_movie_ids = all_movie_ids[top_indices]
    recommended_movies = movies[movies['movieId'].isin(recommended_movie_ids)]
    return recommended_movies

In [13]:
# For test
user_id = 3
recommendations = get_user_based_recommendations(user_id, num_recommendations=10)
print(recommendations)

      movieId                                              title  \
974      1275                                  Highlander (1986)   
1701     2288                                  Thing, The (1982)   
2765     3703               Road Warrior, The (Mad Max 2) (1981)   
3734     5181                                   Hangar 18 (1980)   
4045     5746                    Galaxy of Terror (Quest) (1981)   
4050     5764                                      Looker (1981)   
4122     5919                                     Android (1982)   
5052     7899  Master of the Flying Guillotine (Du bi quan wa...   
5504    26409                          Clonus Horror, The (1979)   
7114    70946                                     Troll 2 (1990)   

                                genres  
974           Action|Adventure|Fantasy  
1701     Action|Horror|Sci-Fi|Thriller  
2765  Action|Adventure|Sci-Fi|Thriller  
3734            Action|Sci-Fi|Thriller  
4045      Action|Horror|Mystery|Sci-Fi  
4050 

# Evaluation

In [14]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

# Prepare the test data for evaluation
test_users = test_data['userId'].values
test_movies = test_data['movieId'].values
test_ratings = test_data['rating'].values

In [15]:
# Predict ratings for the test set
predicted_test_ratings = cf_model.predict([test_users, test_movies])

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(test_ratings, predicted_test_ratings))
rmse



0.6713010405425375

# Deeper Model

In [14]:
# Instantiate the deeper model
num_users = max_userid + 1
num_items = max_movieid + 1

In [16]:
class DeepCFModel(Model):
    def __init__(self, num_users, num_items, embedding_size):
        # User embedding
        user_input = Input(shape=(1,))
        user_embedding = Embedding(num_users, embedding_size, input_length=1)(user_input)
        user_vec = Flatten()(user_embedding)

        # Item embedding
        item_input = Input(shape=(1,))
        item_embedding = Embedding(num_items, embedding_size, input_length=1)(item_input)
        item_vec = Flatten()(item_embedding)

        # Concatenate embeddings
        concat = Concatenate()([user_vec, item_vec])

        # Add dense layers with dropout
        dense = Dense(128, activation='relu')(concat)
        dropout = Dropout(0.5)(dense)
        dense = Dense(64, activation='relu')(dropout)
        dropout = Dropout(0.5)(dense)
        dense = Dense(32, activation='relu')(dropout)
        dropout = Dropout(0.5)(dense)

        # Output layer
        output = Dense(1)(dropout)

        super().__init__(inputs=[user_input, item_input], outputs=output)
        self.compile(optimizer='adam', loss='mse')


In [17]:
deep_cf_model = DeepCFModel(num_users, num_items, K_FACTORS)

# Compile the model using MSE as the loss function and the Adam optimizer
deep_cf_model.compile(loss='mse', optimizer='adam')

# Train the model
deep_cf_model.fit([Users, Movies], Ratings, epochs=10, batch_size=64, validation_split=0.2,
                  callbacks=[EarlyStopping(patience=2), ModelCheckpoint('deep_model.h5', save_best_only=True)])


Epoch 1/10

  saving_api.save_model(


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


<keras.src.callbacks.History at 0x7da50c0e9210>

In [19]:
def get_user_based_recommendations(user_id, num_recommendations=10):
    all_movie_ids = np.array([movie_id for movie_id in range(1, max_movieid+1)])
    predicted_ratings = deep_cf_model.predict([np.array([user_id] * len(all_movie_ids)), all_movie_ids])
    top_indices = np.argsort(predicted_ratings.flatten())[::-1][:num_recommendations]
    recommended_movie_ids = all_movie_ids[top_indices]
    recommended_movies = movies[movies['movieId'].isin(recommended_movie_ids)]
    return recommended_movies

In [21]:
recommendations = get_user_based_recommendations(user_id, num_recommendations=10)
print(recommendations)

NameError: name 'recommendations' is not defined

In [22]:
# Define constants
K_FACTORS = 100  # The number of latent factors for embeddings
TEST_USER = 1  # A random test user

# Instantiate the deeper model
num_users = max_userid + 1
num_items = max_movieid + 1

# Define the DeepCFModel class
class DeepCFModel(Model):
    def __init__(self, num_users, num_items, embedding_size):
        # User embedding
        user_input = Input(shape=(1,))
        user_embedding = Embedding(num_users, embedding_size, input_length=1)(user_input)
        user_vec = Flatten()(user_embedding)

        # Item embedding
        item_input = Input(shape=(1,))
        item_embedding = Embedding(num_items, embedding_size, input_length=1)(item_input)
        item_vec = Flatten()(item_embedding)

        # Concatenate embeddings
        concat = Concatenate()([user_vec, item_vec])

        # Add dense layers with dropout
        dense = Dense(128, activation='relu')(concat)
        dropout = Dropout(0.5)(dense)
        dense = Dense(64, activation='relu')(dropout)
        dropout = Dropout(0.5)(dense)
        dense = Dense(32, activation='relu')(dropout)
        dropout = Dropout(0.5)(dense)

        # Output layer
        output = Dense(1)(dropout)

        super().__init__(inputs=[user_input, item_input], outputs=output)
        self.compile(optimizer='adam', loss='mse')

deep_cf_model = DeepCFModel(num_users, num_items, K_FACTORS)

# Compile the model using MSE as the loss function and the Adam optimizer
deep_cf_model.compile(loss='mse', optimizer='adam')

# Train the model
deep_cf_model.fit([Users, Movies], Ratings, epochs=10, batch_size=64, validation_split=0.2,
                  callbacks=[EarlyStopping(patience=2), ModelCheckpoint('deep_model.h5', save_best_only=True)])

# Function to get top-N recommendations for a user
def get_user_based_recommendations(user_id, num_recommendations=10):
    all_movie_ids = np.array([movie_id for movie_id in range(1, max_movieid+1)])
    predicted_ratings = deep_cf_model.predict([np.array([user_id] * len(all_movie_ids)), all_movie_ids])
    top_indices = np.argsort(predicted_ratings.flatten())[::-1][:num_recommendations]
    recommended_movie_ids = all_movie_ids[top_indices]
    recommended_movies = movies[movies['movieId'].isin(recommended_movie_ids)]
    return recommended_movies

# Example usage
user_id = 1  # Change this to the desired user ID
recommendations = get_user_based_recommendations(user_id, num_recommendations=10)
print(recommendations)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
      movieId                                              title  \
1072     1392                                Citizen Ruth (1996)   
1649     2202                                    Lifeboat (1944)   
2479     3302                            Beautiful People (1999)   
2582     3451                Guess Who's Coming to Dinner (1967)   
2597     3473  Jonah Who Will Be 25 in the Year 2000 (Jonas q...   
4396     6460                     Trial, The (Procès, Le) (1962)   
4782     7121                                  Adam's Rib (1949)   
6048    40412                            Dead Man's Shoes (2004)   
7364    78836                              Enter the Void (2009)   
7593    86377                       Louis C.K.: Shameless (2007)   

              genres  
1072    Comedy|Drama  
1649       Drama|War  
2479          Comedy  
2582           Drama  
2597          Comedy  
4396           Drama  
4782  Comedy|

In [24]:
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

In [25]:
# Prepare the test data for evaluation
test_users = test_data['userId'].values
test_movies = test_data['movieId'].values
test_ratings = test_data['rating'].values

# Predict ratings for the test set
predicted_test_ratings = deep_cf_model.predict([test_users, test_movies])

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(test_ratings, predicted_test_ratings))
print(f'RMSE: {rmse}')

RMSE: 0.7545061377427207


In [26]:
deep_cf_model.save('user_based_recommendation_model.h5')

  saving_api.save_model(
