In [1]:
import numpy as np
import pandas as pd
import os 
import shutil
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
movies = pd.read_csv('.\Dataset\movies.csv',sep=';',encoding='latin-1').drop('Unnamed: 3',axis=1)
print('Shape of this dataset :',movies.shape)
movies.head()

Shape of this dataset : (3883, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings = pd.read_csv('.\Dataset\\ratings.csv',sep=';')
print('Shape of this dataset :',ratings.shape)
ratings.head()

Shape of this dataset : (1000209, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
users = pd.read_csv('.\Dataset\\users.csv',sep=';')
print('Shape of this dataset :',users.shape)
users.head()

Shape of this dataset : (6040, 5)


Unnamed: 0,userId,gender,age,occupation,zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [7]:
rating_pivot = ratings.pivot_table(values='rating',columns='userId',index='movieId')
rating_pivot.head()

userId,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,4.0,,4.0,5.0,5.0,...,,4.0,,,4.0,,,,,3.0
2,,,,,,,,,,5.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,1.0,,,,,
4,,,,,,,,3.0,,,...,,,,,2.0,2.0,,,,
5,,,,,,,,,,,...,,,,,1.0,,,,,


In [9]:
# Now use matrix factorization to predict the ratings

import torch
import torch.nn as nn
import torch.nn.functional as F

# Create a class for the model

class MatrixFactorization(nn.Module):
    def __init__(self, n_users, n_movies, n_factors=20):
        super().__init__()
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.movie_factors = nn.Embedding(n_movies, n_factors)

    def forward(self, user, movie):
        return (self.user_factors(user) * self.movie_factors(movie)).sum(1)      

In [12]:
pivot_tensor = torch.tensor(rating_pivot.values, dtype=torch.float32)
mask = ~torch.isnan(pivot_tensor)

# Get the indices of the non-NaN values
i, j = torch.where(mask)

# Get the values of the non-NaN values
v = pivot_tensor[mask]

# Store in PyTorch tensors
users_nn= i.to(torch.int64)
movies_nn = j.to(torch.int64)
ratings_nn = v.to(torch.float32) 

movie_user_df = pd.DataFrame({'user': users_nn, 'movie': movies_nn, 'rating': ratings_nn})
movie_user_df

Unnamed: 0,user,movie,rating
0,0,0,5.0
1,0,5,4.0
2,0,7,4.0
3,0,8,5.0
4,0,9,5.0
...,...,...,...
1000204,3705,5811,4.0
1000205,3705,5830,3.0
1000206,3705,5836,4.0
1000207,3705,5926,1.0


In [14]:
# Fit the Matrix Factorization model
import torch.optim as optim

n_users = rating_pivot.shape[0]
n_movies = rating_pivot.shape[1]
model = MatrixFactorization(n_users, n_movies, 4)
optimizer = optim.Adam(model.parameters(), lr=0.01)

for i in range(1000):
    # Compute the loss
    pred = model(users_nn, movies_nn)
    loss = F.mse_loss(pred, ratings_nn)
    
    # Zero the gradients
    optimizer.zero_grad()
    
    # Backpropagate
    loss.backward()
    
    # Update the parameters
    optimizer.step()
    
    # Print the loss
    if i % 100 == 0:
        print(loss.item())

17.86431884765625
6.044564723968506
1.0223368406295776
0.8705870509147644
0.8263329863548279
0.8010669350624084
0.7779580950737
0.7575641870498657
0.7433661222457886
0.734330415725708


In [30]:
with torch.no_grad():
    pred_raing_pivot = pd.DataFrame(model.user_factors.weight @ model.movie_factors.weight.t(), index=rating_pivot.index, columns=rating_pivot.columns)
    # round to nearest integer
    pred_raing_pivot = pred_raing_pivot.round() 
pred_raing_pivot = np.clip(pred_raing_pivot, 1, 5)
# column_list = pred_raing_pivot.columns.values.tolist()
# for column_name in column_list:
#     print(pred_raing_pivot[column_name].unique())

[4. 3. 2. 5. 1.]
[4. 3. 5. 2. 1.]
[4. 3. 5. 2. 1.]
[4. 3. 2. 5. 1.]
[3. 2. 1. 4. 5.]
[4. 3. 5. 2. 1.]
[5. 4. 3. 2. 1.]
[5. 3. 4. 2. 1.]
[4. 3. 2. 5. 1.]
[5. 4. 3. 2. 1.]
[4. 3. 2. 1. 5.]
[4. 3. 2. 1. 5.]
[4. 3. 2. 5. 1.]
[4. 3. 2. 1. 5.]
[4. 3. 2. 1. 5.]
[5. 4. 3. 2. 1.]
[5. 4. 3. 2. 1.]
[5. 3. 4. 2. 1.]
[4. 3. 2. 5. 1.]
[5. 3. 4. 2. 1.]
[3. 2. 5. 4. 1.]
[4. 3. 2. 1. 5.]
[4. 3. 2. 5. 1.]
[4. 3. 2. 1. 5.]
[4. 3. 5. 2. 1.]
[3. 4. 2. 1. 5.]
[4. 2. 3. 1. 5.]
[4. 3. 2. 1. 5.]
[4. 3. 2. 1. 5.]
[4. 3. 2. 1. 5.]
[4. 3. 5. 2. 1.]
[4. 2. 3. 1. 5.]
[4. 3. 2. 1. 5.]
[4. 3. 5. 2. 1.]
[4. 2. 3. 1. 5.]
[5. 4. 3. 2. 1.]
[4. 3. 5. 1. 2.]
[4. 3. 2. 5. 1.]
[4. 3. 2. 1. 5.]
[4. 3. 2. 1. 5.]
[4. 3. 2. 1. 5.]
[4. 3. 5. 2. 1.]
[5. 4. 3. 2. 1.]
[4. 3. 2. 5. 1.]
[4. 3. 2. 1. 5.]
[5. 4. 3. 2. 1.]
[4. 3. 2. 1. 5.]
[4. 3. 2. 1. 5.]
[4. 3. 2. 5. 1.]
[4. 3. 2. 5. 1.]
[4. 3. 5. 2. 1.]
[4. 3. 2. 1. 5.]
[5. 4. 3. 2. 1.]
[4. 3. 2. 1. 5.]
[4. 3. 5. 2. 1.]
[4. 3. 2. 5. 1.]
[4. 2. 3. 1. 5.]
[5. 3. 4. 2. 1.]
[4. 3. 2. 1. 5

In [17]:
users

Unnamed: 0,userId,gender,age,occupation,zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,1060


In [33]:
user_id = 6040
# movie_ids are all those movieId in pred_rating_pivot that have a NaN value for the user_id in rating_pivot
movie_ids = pred_raing_pivot.loc[:, user_id][rating_pivot.loc[:, user_id].isna()].sort_values(ascending=False).index.values
movie_ids.shape

(3365,)

In [35]:
recommended_movies = movies[movies['movieId'].isin(movie_ids)]['title'].head(10).values
recommended_movies

array(['Jumanji (1995)', 'Grumpier Old Men (1995)',
       'Waiting to Exhale (1995)', 'Father of the Bride Part II (1995)',
       'Heat (1995)', 'Sabrina (1995)', 'Tom and Huck (1995)',
       'Sudden Death (1995)', 'GoldenEye (1995)',
       'American President, The (1995)'], dtype=object)