<a href="https://colab.research.google.com/github/IoanRoume/RecommenderSystem/blob/main/Recommender_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("parasharmanas/movie-recommendation-system")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/parasharmanas/movie-recommendation-system/versions/1


In [None]:
!ls /root/.cache/kagglehub/datasets/parasharmanas/movie-recommendation-system/versions/1

movies.csv  ratings.csv


# **Collaborative filtering**

In [2]:
import pandas as pd

#movies = pd.read_csv("/root/.cache/kagglehub/datasets/parasharmanas/movie-recommendation-system/versions/1/movies.csv")
ratings = pd.read_csv("/root/.cache/kagglehub/datasets/parasharmanas/movie-recommendation-system/versions/1/ratings.csv")
ratings = ratings[:100000]

In [None]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [3]:
import torch
from torch.utils.data import random_split, DataLoader, Dataset
user_id_mapping = {id: idx for idx, id in enumerate(ratings['userId'].unique())}
movie_id_mapping = {id:idx for idx, id in enumerate(ratings['movieId'].unique())}

ratings['userId'] = ratings['userId'].map(user_id_mapping)
ratings['movieId'] = ratings['movieId'].map(movie_id_mapping)


class CustomDataset(Dataset):
  def __init__(self, users, movies, ratings):
    self.users = users
    self.movies = movies
    self.ratings = ratings

  def __getitem__(self, index):
    return self.users[index], self.movies[index], self.ratings[index]

  def __len__(self):
    return len(self.ratings)

users_list, movies_list,ratings_list = ratings['userId'].tolist(), ratings['movieId'].tolist(), ratings['rating'].tolist()

dataset = CustomDataset(users_list,movies_list,ratings_list)

full_size = len(dataset)

train_size = int(0.8 * full_size)
test_size = full_size - train_size

train_dataset, test_dataset = random_split(dataset, [train_size,test_size])


batch_size = 32
train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size= batch_size, shuffle = False)


In [4]:
import torch.nn as nn
import torch.nn.functional as F

class MatrixFactorization(nn.Module):
  def __init__(self,num_users, num_movies, embed_dim):
    super(MatrixFactorization,self).__init__()
    self.user_embed = nn.Embedding(num_users, embed_dim)
    self.movie_embed = nn.Embedding(num_movies, embed_dim)

    self.user_bias = nn.Embedding(num_users, 1)
    self.movie_bias = nn.Embedding(num_movies, 1)


    nn.init.normal_(self.user_embed.weight, std=0.01)
    nn.init.normal_(self.movie_embed.weight, std=0.01)
    nn.init.zeros_(self.user_bias.weight)
    nn.init.zeros_(self.movie_bias.weight)

  def forward(self, user_ids, movie_ids):
    user_embed = self.user_embed(user_ids)
    movie_embed = self.movie_embed(movie_ids)

    interaction = (user_embed * movie_embed).sum(dim=1)

    user_b = self.user_bias(user_ids).squeeze()

    movie_b = self.movie_bias(movie_ids).squeeze()
    prediction = interaction + user_b + movie_b
    return prediction

In [5]:
num_users = len(user_id_mapping)
num_movies = len(movie_id_mapping)
embedding_dim = 50
device = "cuda" if torch.cuda.is_available() else "cpu"
model = MatrixFactorization(num_users, num_movies, embedding_dim).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [6]:
epochs = 5

for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for users, movies, ratings in train_loader:
        users = users.to(device)
        movies = movies.to(device)
        ratings = ratings.to(device)

        optimizer.zero_grad()

        predictions = model(users, movies)

        loss = criterion(predictions.squeeze(), ratings.float())

        loss.backward()

        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

Epoch 1/5, Loss: 3.3123
Epoch 2/5, Loss: 0.7991
Epoch 3/5, Loss: 0.4931
Epoch 4/5, Loss: 0.4238
Epoch 5/5, Loss: 0.3826


In [7]:
model.eval()
test_loss = 0.0

with torch.no_grad():
    for users, movies, ratings in test_loader:
        users = users.to(device)
        movies = movies.to(device)
        ratings = ratings.to(device)
        predictions = model(users, movies)
        loss = criterion(predictions.squeeze(), ratings)
        test_loss += loss.item()

avg_test_loss = test_loss / len(test_loader)
print(f"Test Loss: {avg_test_loss:.4f}")

Test Loss: 1.1273


# **Content-Based Filtering**

In [2]:
import pandas as pd

movies = pd.read_csv("/root/.cache/kagglehub/datasets/parasharmanas/movie-recommendation-system/versions/1/movies.csv")
movies = movies[:10000]
movies['genres'] = movies['genres'].str.split('|')

## One hot Encoding

In [3]:
all_genres = set(g for sublist in movies['genres'] for g in sublist)
for genre in all_genres:
  movies[genre] = movies['genres'].apply(lambda x: 1 if genre in x else 0)

# movies = movies.drop(columns=['genres'])
print(movies.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                              genres  Musical  Documentary  \
0  [Adventure, Animation, Children, Comedy, Fantasy]        0            0   
1                     [Adventure, Children, Fantasy]        0            0   
2                                  [Comedy, Romance]        0            0   
3                           [Comedy, Drama, Romance]        0            0   
4                                           [Comedy]        0            0   

   Western  Drama  Mystery  Sci-Fi  Romance  ...  Horror  Fantasy  Thriller  \
0        0      0        0       0        0  ...       0        1         0   
1        0      0        0       0        0  ...       0        1       

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

genre_columns = movies.columns[3:]

similarity_matrix = cosine_similarity(movies[genre_columns])

similarity_df = pd.DataFrame(similarity_matrix, index=movies['title'], columns=movies['title'])
print(similarity_df.head())

title                               Toy Story (1995)  Jumanji (1995)  \
title                                                                  
Toy Story (1995)                            1.000000        0.774597   
Jumanji (1995)                              0.774597        1.000000   
Grumpier Old Men (1995)                     0.316228        0.000000   
Waiting to Exhale (1995)                    0.258199        0.000000   
Father of the Bride Part II (1995)          0.447214        0.000000   

title                               Grumpier Old Men (1995)  \
title                                                         
Toy Story (1995)                                   0.316228   
Jumanji (1995)                                     0.000000   
Grumpier Old Men (1995)                            1.000000   
Waiting to Exhale (1995)                           0.816497   
Father of the Bride Part II (1995)                 0.707107   

title                               Waiting to Exhale

In [5]:
def recommend_movies(movie_title, num_recommendations=5):
    if movie_title not in similarity_df:
        return "Movie not found in the dataset!"

    similar_movies = similarity_df[movie_title].sort_values(ascending=False)

    recommended_movies = similar_movies.iloc[1:num_recommendations+1]

    return recommended_movies

print(recommend_movies("Toy Story (1995)", 5))

title
Antz (1998)                                                1.0
DuckTales: The Movie - Treasure of the Lost Lamp (1990)    1.0
Emperor's New Groove, The (2000)                           1.0
Toy Story 2 (1999)                                         1.0
Adventures of Rocky and Bullwinkle, The (2000)             1.0
Name: Toy Story (1995), dtype: float64


## TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

movies['genres'] = movies['genres'].apply(lambda x: ' '.join(x))


vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(movies['genres'])


tfidf_similarity_matrix = cosine_similarity(tfidf_matrix)


tfidf_similarity_df = pd.DataFrame(tfidf_similarity_matrix, index=movies['title'], columns=movies['title'])


print(tfidf_similarity_df.head())

title                               Toy Story (1995)  Jumanji (1995)  \
title                                                                  
Toy Story (1995)                            1.000000        0.789758   
Jumanji (1995)                              0.789758        1.000000   
Grumpier Old Men (1995)                     0.146751        0.000000   
Waiting to Exhale (1995)                    0.131975        0.000000   
Father of the Bride Part II (1995)          0.246375        0.000000   

title                               Grumpier Old Men (1995)  \
title                                                         
Toy Story (1995)                                   0.146751   
Jumanji (1995)                                     0.000000   
Grumpier Old Men (1995)                            1.000000   
Waiting to Exhale (1995)                           0.899312   
Father of the Bride Part II (1995)                 0.595643   

title                               Waiting to Exhale

In [7]:
def recommend_movies_tfidf(movie_title, num_recommendations=5):
    if movie_title not in tfidf_similarity_df:
        return "Movie not found in the dataset!"

    similar_movies = tfidf_similarity_df[movie_title].sort_values(ascending=False)

    recommended_movies = similar_movies.iloc[1:num_recommendations+1]

    return recommended_movies

print(recommend_movies_tfidf("Toy Story (1995)", 5))

title
Emperor's New Groove, The (2000)                           1.0
Antz (1998)                                                1.0
DuckTales: The Movie - Treasure of the Lost Lamp (1990)    1.0
Toy Story 2 (1999)                                         1.0
Monsters, Inc. (2001)                                      1.0
Name: Toy Story (1995), dtype: float64
