<a href="https://colab.research.google.com/github/John-Wassef/Movie-Recommendation-System/blob/main/Movie_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
# Download the MovieLens dataset using curl
! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0  1846k      0 --:--:-- --:--:-- --:--:-- 1844k


In [19]:
# Import the zipfile module to handle zip files
import zipfile

# Unpack every member of the downloaded archive beneath the 'data' directory;
# the context manager closes the archive once extraction is finished.
with zipfile.ZipFile('ml-latest-small.zip', mode='r') as zip_ref:
    zip_ref.extractall(path='data')


In [20]:
# Import pandas for data manipulation
import pandas as pd

# Read the two CSV files from the extracted archive into DataFrames
# (movies first, ratings second).
movies_df, ratings_df = map(
    pd.read_csv,
    ('data/ml-latest-small/movies.csv', 'data/ml-latest-small/ratings.csv'),
)


In [21]:
# Report the (rows, columns) shape of each DataFrame, one per output line
print(
    'The dimensions of movies dataframe are:', movies_df.shape,
    '\nThe dimensions of ratings dataframe are:', ratings_df.shape,
)


The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [22]:
# Preview the first five rows of the movies DataFrame
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [23]:
# Preview the first five rows of the ratings DataFrame
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [24]:
# Lookup table from movieId to title, used later to print readable names
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Distinct users and distinct movies that appear in the ratings table
n_users = ratings_df['userId'].unique().size
n_items = ratings_df['movieId'].unique().size

# --- Size of the dense user x movie rating matrix ---
print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_users * n_items, 'elements.')

print('----------')

# --- How much of that matrix is actually observed ---
print("Number of ratings:", len(ratings_df))
print("Therefore:", len(ratings_df) / (n_users * n_items) * 100, '% of the matrix is filled.')

# --- Commentary on sparsity and why matrix factorization is used ---
print("We have an incredibly sparse matrix to work with here.")
print("And... as you can imagine, as the number of users and products grow, the number of elements will increase by n^2")
print("You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.")
print("One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data")


Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore: 1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n^2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [25]:
# Import necessary libraries
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Matrix-factorization rating model built from two embedding tables.

    A predicted rating is the dot product between the embedding row of a
    user and the embedding row of an item.

    Args:
        n_users: Number of rows in the user embedding table.
        n_items: Number of rows in the item embedding table.
        n_factors: Width of each embedding vector (default 20).
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # Lookup tables: one n_factors-wide vector per user / per item
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Replace the default initialisation with small uniform values in [0, 0.05)
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Return one predicted rating per row of ``data``.

        Args:
            data: Integer tensor with two columns; column 0 holds user
                indices and column 1 holds item indices.

        Returns:
            1-D tensor with the row-wise dot product of the selected user
            and item embeddings.
        """
        users, items = data[:, 0], data[:, 1]
        # Element-wise product summed over the factor axis == per-row dot product
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Predict ratings for the given user / item indices.

        ``user`` and ``item`` may be Python ints, sequences or tensors of
        indices. They are flattened, broadcast against each other (so a
        single user can be scored against many items) and packed into the
        two-column layout that ``forward`` expects.

        Returns:
            1-D tensor of predicted ratings.
        """
        # Build the index tensors on the same device as the embedding tables
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        items = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        users, items = torch.broadcast_tensors(users, items)
        # forward() takes a single (batch, 2) tensor, not two separate arguments
        return self.forward(torch.stack((users, items), dim=1))


In [33]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import torch
import pandas as pd

class Loader(Dataset):
    """Torch ``Dataset`` over a ratings table with contiguous user/movie indices.

    The raw ``userId`` / ``movieId`` values are re-indexed to ``0..n-1`` in
    order of first appearance so they can be used directly as embedding
    indices. The DataFrame passed in is not modified.

    Args:
        ratings: DataFrame with at least the columns ``userId``,
            ``movieId`` and ``rating``.

    Attributes:
        ratings: Private copy of the input whose ``userId`` / ``movieId``
            columns hold the contiguous indices.
        user_id_to_index / movie_id_to_index: original ID -> index.
        index_to_user_id / index_to_movie_id: index -> original ID.
        features_tensor: ``long`` tensor with columns (user index, movie index).
        targets_tensor: ``float32`` tensor with the ratings.
    """

    def __init__(self, ratings: pd.DataFrame):
        # Work on a copy: re-indexing the caller's frame in place would corrupt
        # it for later cells and make re-running this constructor non-idempotent.
        self.ratings = ratings.copy()

        # Original IDs in order of first appearance
        unique_user_ids = ratings.userId.unique()
        unique_movie_ids = ratings.movieId.unique()

        # Forward maps: original ID -> contiguous index
        self.user_id_to_index = {user_id: idx for idx, user_id in enumerate(unique_user_ids)}
        self.movie_id_to_index = {movie_id: idx for idx, movie_id in enumerate(unique_movie_ids)}

        # Reverse maps: contiguous index -> original ID
        self.index_to_user_id = {idx: user_id for user_id, idx in self.user_id_to_index.items()}
        self.index_to_movie_id = {idx: movie_id for movie_id, idx in self.movie_id_to_index.items()}

        # Replace the IDs in the private copy with their contiguous indices
        self.ratings['movieId'] = ratings['movieId'].map(self.movie_id_to_index)
        self.ratings['userId'] = ratings['userId'].map(self.user_id_to_index)

        # Select the feature columns explicitly so the (user, movie) order never
        # depends on the column layout of the input frame or on extra columns.
        self.features = self.ratings[['userId', 'movieId']].values
        self.targets = self.ratings['rating'].values

        # Tensors ready for the model: integer indices and float ratings
        self.features_tensor = torch.tensor(self.features, dtype=torch.long)
        self.targets_tensor = torch.tensor(self.targets, dtype=torch.float32)

    def __getitem__(self, index: int):
        """Return ``(features, target)`` for the rating at ``index``."""
        return (self.features_tensor[index], self.targets_tensor[index])

    def __len__(self) -> int:
        """Return the number of ratings in the dataset."""
        return len(self.ratings)


In [27]:
# Number of epochs for training
num_epochs = 128

# Detect whether a CUDA device is available and report it
cuda = torch.cuda.is_available()
print("Is running on GPU:", cuda)

# Build the matrix factorization model with 8 latent factors per user/item
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)

# Show the freshly initialised parameters (still on the CPU at this point)
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

# Move the model to the GPU when one is available
if cuda:
    model = model.cuda()

# Mean squared error between predicted and observed ratings
loss_fn = torch.nn.MSELoss()

# Adam optimizer with learning rate 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Build the training dataset and dataloader.
# Loader requires the ratings DataFrame as its argument. A copy is passed so
# that ratings_df keeps its original userId/movieId values for later cells,
# regardless of how Loader re-indexes the frame it receives.
train_set = Loader(ratings_df.copy())
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)


Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0391, 0.0485, 0.0406,  ..., 0.0147, 0.0467, 0.0231],
        [0.0138, 0.0431, 0.0471,  ..., 0.0108, 0.0355, 0.0481],
        [0.0313, 0.0481, 0.0058,  ..., 0.0454, 0.0389, 0.0360],
        ...,
        [0.0459, 0.0228, 0.0359,  ..., 0.0236, 0.0251, 0.0188],
        [0.0460, 0.0448, 0.0048,  ..., 0.0206, 0.0200, 0.0087],
        [0.0422, 0.0171, 0.0498,  ..., 0.0094, 0.0144, 0.0013]])
item_factors.weight tensor([[0.0119, 0.0077, 0.0416,  ..., 0.0267, 0.0023, 0.0202],
        [0.0180, 0.0152, 0.0425,  ..., 0.0288, 0.0108, 0.0160],
        [0.0037, 0.0255, 0.0461,  ..., 0.0099, 0.0039, 0.0464],
        ...,
        [0.0258, 0.0175, 0.0236,  ..., 0.0397, 0.0041, 0.0045],
        [0.0319, 0.0185, 0.0480,  ..., 0.0046, 0.0064, 0.0072],
        [0.0385, 0.0204, 0.0379,  ..., 0.0374, 0.0258, 0.0038]])


In [28]:
# c flags whether the first trainable parameter has been seen yet;
# uw / iw will hold the user and item embedding weights respectively.
c, uw, iw = 0, 0, 0

# Walk the model's parameters in registration order
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    # Show the parameter's name and current values
    print(name, param.data)
    if c != 0:
        # Every trainable parameter after the first is stored as the item weights
        iw = param.data
        continue
    # First trainable parameter: store it as the user weights
    uw = param.data
    c += 1


user_factors.weight tensor([[0.0391, 0.0485, 0.0406,  ..., 0.0147, 0.0467, 0.0231],
        [0.0138, 0.0431, 0.0471,  ..., 0.0108, 0.0355, 0.0481],
        [0.0313, 0.0481, 0.0058,  ..., 0.0454, 0.0389, 0.0360],
        ...,
        [0.0459, 0.0228, 0.0359,  ..., 0.0236, 0.0251, 0.0188],
        [0.0460, 0.0448, 0.0048,  ..., 0.0206, 0.0200, 0.0087],
        [0.0422, 0.0171, 0.0498,  ..., 0.0094, 0.0144, 0.0013]],
       device='cuda:0')
item_factors.weight tensor([[0.0119, 0.0077, 0.0416,  ..., 0.0267, 0.0023, 0.0202],
        [0.0180, 0.0152, 0.0425,  ..., 0.0288, 0.0108, 0.0160],
        [0.0037, 0.0255, 0.0461,  ..., 0.0099, 0.0039, 0.0464],
        ...,
        [0.0258, 0.0175, 0.0236,  ..., 0.0397, 0.0041, 0.0045],
        [0.0319, 0.0185, 0.0480,  ..., 0.0046, 0.0064, 0.0072],
        [0.0385, 0.0204, 0.0379,  ..., 0.0374, 0.0258, 0.0038]],
       device='cuda:0')


In [29]:
# Pull the item (movie) embedding table out of the model as a numpy array:
# detach it from autograd, copy it to CPU memory, then convert.
# NOTE(review): no training loop is visible between model creation and this
# cell, and the weights printed in the previous cell equal the initial ones;
# confirm the model has actually been trained before relying on these values.
trained_movie_embeddings = model.item_factors.weight.detach().cpu().numpy()


In [30]:
# Number of rows in the embedding array, i.e. one embedding per movie
trained_movie_embeddings.shape[0]


9724

In [31]:
# Import the KMeans class from sklearn.cluster
from sklearn.cluster import KMeans

# Cluster the movie embeddings into 10 groups.
# random_state=0 fixes the centroid initialisation for reproducibility.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(trained_movie_embeddings)




In [32]:
# Number of ratings per movie, keyed by the dataset's contiguous movie index.
# Computed once up front instead of rescanning the ratings table for every
# movie, and taken from train_set.ratings so the key space matches the
# embedding rows / k-means labels.
rating_counts = train_set.ratings['movieId'].value_counts()

# Visit every cluster the fitted model produced
for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []

    # Embedding-row indices of the movies assigned to this cluster
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Map the contiguous index back to the original MovieLens movieId
        movid = train_set.index_to_movie_id[movidx]
        # Collect (title, number of ratings) for this movie
        movs.append((movie_names[movid], rating_counts.loc[movidx]))

    # Show the 10 most-rated movies of the cluster, most popular first
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])


Cluster #0
	 Independence Day (a.k.a. ID4) (1996)
	 Star Wars: Episode VI - Return of the Jedi (1983)
	 Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
	 Dances with Wolves (1990)
	 Pirates of the Caribbean: The Curse of the Black Pearl (2003)
	 Die Hard: With a Vengeance (1995)
	 Pretty Woman (1990)
	 Reservoir Dogs (1992)
	 Godfather: Part II, The (1974)
	 E.T. the Extra-Terrestrial (1982)
Cluster #1
	 Shawshank Redemption, The (1994)
	 Matrix, The (1999)
	 Toy Story (1995)
	 Apollo 13 (1995)
	 Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
	 Dark Knight, The (2008)
	 Good Will Hunting (1997)
	 X-Men (2000)
	 American History X (1998)
	 Shining, The (1980)
Cluster #2
	 Braveheart (1995)
	 Seven (a.k.a. Se7en) (1995)
	 Fugitive, The (1993)
	 Saving Private Ryan (1998)
	 Titanic (1997)
	 Truman Show, The (1998)
	 Batman Begins (2005)
	 Home Alone (1990)
	 Ghost (1990)
	 Net, The (1995)
Cluster #3
	 Jurassic Park (1993)
	 Fight Club (1999)
	 Godfather, The (197