<a href="https://colab.research.google.com/github/KhosrojerdiA/Machine-Learning/blob/main/Matrix_Factorization_Recommendation_System_for_Movie_and_Ratings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the MovieLens "ml-latest-small" zip archive and save it as ml-latest-small.zip in the working directory.
! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  955k  100  955k    0     0  3891k      0 --:--:-- --:--:-- --:--:-- 3899k


In [None]:
import zipfile
# Unpack every member of the downloaded archive under the local 'data' directory.
with zipfile.ZipFile(file='ml-latest-small.zip', mode='r') as zip_ref:
    zip_ref.extractall(path='data')

# Data

In [None]:
import pandas as pd
# Load the movie metadata and the user ratings tables from the extracted dataset folder.
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

In [None]:
# Report the (rows, columns) shape of each table.
print('The dimensions of movies dataframe are:', movies_df.shape,'\nThe dimensions of ratings dataframe are:', ratings_df.shape)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [None]:
# Preview the first rows of the movies table.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Preview the first rows of the ratings table.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# Movie ID to movie name mapping

In [None]:
# Lookup table movieId -> title, used later to print human-readable cluster members.
movie_names = movies_df.set_index('movieId')['title'].to_dict()
# Show only a small sample: echoing the whole dict would dump thousands of entries into the cell output.
dict(list(movie_names.items())[:10])

{1: 'Toy Story (1995)',
 2: 'Jumanji (1995)',
 3: 'Grumpier Old Men (1995)',
 4: 'Waiting to Exhale (1995)',
 5: 'Father of the Bride Part II (1995)',
 6: 'Heat (1995)',
 7: 'Sabrina (1995)',
 8: 'Tom and Huck (1995)',
 9: 'Sudden Death (1995)',
 10: 'GoldenEye (1995)',
 11: 'American President, The (1995)',
 12: 'Dracula: Dead and Loving It (1995)',
 13: 'Balto (1995)',
 14: 'Nixon (1995)',
 15: 'Cutthroat Island (1995)',
 16: 'Casino (1995)',
 17: 'Sense and Sensibility (1995)',
 18: 'Four Rooms (1995)',
 19: 'Ace Ventura: When Nature Calls (1995)',
 20: 'Money Train (1995)',
 21: 'Get Shorty (1995)',
 22: 'Copycat (1995)',
 23: 'Assassins (1995)',
 24: 'Powder (1995)',
 25: 'Leaving Las Vegas (1995)',
 26: 'Othello (1995)',
 27: 'Now and Then (1995)',
 28: 'Persuasion (1995)',
 29: 'City of Lost Children, The (Cité des enfants perdus, La) (1995)',
 30: 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 31: 'Dangerous Minds (1995)',
 32: 'Twelve Monkeys (a.k.a. 12 Monkeys) (199

In [None]:
# Compare the size of the dense (users x movies) rating matrix with the number of ratings actually observed.
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())
n_cells = n_users * n_items                   # entries in the full rating matrix
fill_pct = len(ratings_df) / n_cells * 100    # share of entries that hold a rating
print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_cells, 'elements.')
print('----------')
print("Number of ratings:", len(ratings_df))
print("Therefore: ", fill_pct, '% of the matrix is filled.')
print("We have an incredibly sparse matrix to work with here.")
# The dense matrix has one cell per (user, movie) pair, so it grows with the product of both counts.
print("And... as you can imagine, as the number of users and products grow, the number of elements grows as n_users * n_items")
print("You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.")
print("One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data")


Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


# Matrix Factorization

In [None]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Dot-product matrix factorization: rating ~= <user vector, item vector>.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: length of every latent vector (default 20).
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # Embedding tables act as lookup tables: integer index -> latent vector.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Overwrite the default initialisation with small values drawn uniformly from [0, 0.05).
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: 2-D integer tensor; column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one predicted rating per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        # Row-wise dot product of the two looked-up latent vectors.
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Predict ratings for user/item indices given as separate arguments.

        ``forward`` expects a single (batch, 2) tensor, so the two arguments
        are packed into that layout before being scored.

        Args:
            user: int, sequence or tensor of user indices.
            item: int, sequence or tensor of item indices; broadcast against
                ``user`` (e.g. one user scored against many items).

        Returns:
            1-D tensor with one prediction per broadcast (user, item) pair.
        """
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long, device=device)
        items = torch.as_tensor(item, dtype=torch.long, device=device)
        users, items = torch.broadcast_tensors(users, items)
        pairs = torch.stack((users.reshape(-1), items.reshape(-1)), dim=1)
        return self.forward(pairs)

# Creating the dataloader
(necessary for PyTorch)

In [None]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

# Note: This isn't 'good' practice, in a MLops sense but we'll roll with this since the data is already loaded in memory.
class Loader(Dataset):
    """Torch dataset yielding ((user index, movie index), rating) samples.

    Raw user and movie ids are remapped to contiguous 0-based indices, in
    order of first appearance, so they can address embedding-table rows.

    Args:
        ratings: optional DataFrame with 'userId', 'movieId' and 'rating'
            columns. When omitted, the notebook-level ``ratings_df`` is used.
    """

    def __init__(self, ratings=None):
        if ratings is None:
            ratings = ratings_df
        self.ratings = ratings.copy()

        # Distinct raw ids, in order of first appearance.
        users = ratings.userId.unique()
        movies = ratings.movieId.unique()

        # Raw id -> contiguous index.
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(movies)}

        # Contiguous index -> raw id (used to translate results back).
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Replace the raw ids in the working copy by their contiguous indices.
        self.ratings['movieId'] = ratings['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = ratings['userId'].map(self.userid2idx)

        # Select the feature columns explicitly so that column 0 is always the
        # user index and column 1 the movie index, whatever the frame's layout.
        self.x = torch.tensor(self.ratings[['userId', 'movieId']].values)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the (features, rating) pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating rows."""
        return len(self.ratings)

# Train

In [None]:
# Training configuration.
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Build the model and print its freshly initialised trainable weights.
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# Move the model to the GPU when one is available.
if cuda:
    model = model.cuda()

# Objective and optimiser: mean squared error minimised with Adam.
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# The full ratings dataset, served in shuffled mini-batches of 128 samples.
train_set = Loader()
train_loader = DataLoader(dataset=train_set, batch_size=128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0018, 0.0473, 0.0044,  ..., 0.0300, 0.0009, 0.0282],
        [0.0308, 0.0293, 0.0069,  ..., 0.0459, 0.0432, 0.0116],
        [0.0453, 0.0132, 0.0072,  ..., 0.0070, 0.0268, 0.0121],
        ...,
        [0.0464, 0.0487, 0.0099,  ..., 0.0367, 0.0035, 0.0021],
        [0.0267, 0.0193, 0.0498,  ..., 0.0029, 0.0107, 0.0212],
        [0.0239, 0.0171, 0.0115,  ..., 0.0473, 0.0341, 0.0135]])
item_factors.weight tensor([[0.0336, 0.0122, 0.0101,  ..., 0.0151, 0.0439, 0.0475],
        [0.0101, 0.0432, 0.0345,  ..., 0.0395, 0.0465, 0.0465],
        [0.0236, 0.0161, 0.0343,  ..., 0.0380, 0.0156, 0.0097],
        ...,
        [0.0252, 0.0308, 0.0394,  ..., 0.0389, 0.0168, 0.0339],
        [0.0064, 0.0043, 0.0306,  ..., 0.0156, 0.0365, 0.0351],
        [0.0207, 0.0427, 0.0180,  ..., 0.0458, 0.0439, 0.0014]])


# Loss

In [None]:
# One pass over train_loader per epoch; prints the mean batch loss of each epoch.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA. The optimisation step below
        # must run on CPU as well, otherwise nothing is trained and the epoch
        # average divides by zero.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        outputs = model(x)
        # The model returns one value per row, so no squeeze is needed; squeezing
        # would turn a batch of size 1 into a 0-d tensor that mismatches y.
        loss = loss_fn(outputs, y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.065863218404315
iter #1 Loss: 4.738788210195938
iter #2 Loss: 2.473627145369041
iter #3 Loss: 1.7209917774357772
iter #4 Loss: 1.3459636861751527
iter #5 Loss: 1.1287444165061573
iter #6 Loss: 0.9917450522256986
iter #7 Loss: 0.9002730302550466
iter #8 Loss: 0.8373094522559703
iter #9 Loss: 0.7922691040232702
iter #10 Loss: 0.7593395073099185
iter #11 Loss: 0.7349661119320066
iter #12 Loss: 0.7164961915784681
iter #13 Loss: 0.7017917070291975
iter #14 Loss: 0.6903294096062631
iter #15 Loss: 0.6818526898165644
iter #16 Loss: 0.6750356718746539
iter #17 Loss: 0.6696957526397584
iter #18 Loss: 0.6659168760637342
iter #19 Loss: 0.6627221274648221
iter #20 Loss: 0.6607564916162927
iter #21 Loss: 0.6590387093914947
iter #22 Loss: 0.6574043662944421
iter #23 Loss: 0.6568694905127366
iter #24 Loss: 0.6558935157009188
iter #25 Loss: 0.6551737028935234
iter #26 Loss: 0.6542702953751922
iter #27 Loss: 0.6536310036775425
iter #28 Loss: 0.6526187912839924
iter #29 Loss: 0.651744398

By training the model, we will have tuned latent factors for movies and users.

In [None]:
# Print every trainable parameter and keep references to the weights:
# the first trainable parameter goes to `uw`, any later one to `iw`.
c, uw, iw = 0, 0, 0
trainable = [(name, param.data) for name, param in model.named_parameters() if param.requires_grad]
for position, (name, weights) in enumerate(trainable):
    print(name, weights)
    if position == 0:
        uw, c = weights, 1
    else:
        iw = weights

user_factors.weight tensor([[ 0.8189,  2.0522,  1.1351,  ...,  1.4524,  0.5090,  1.8498],
        [ 1.3798,  1.2170,  0.8447,  ...,  0.4756,  2.1434, -0.4760],
        [ 2.3284,  0.2960, -1.5721,  ...,  1.9978,  1.4480, -1.4738],
        ...,
        [ 2.4570,  2.0888,  0.2969,  ...,  1.1664, -0.2427,  0.9154],
        [ 1.0229,  0.7652,  0.9543,  ...,  0.4046,  0.7490,  0.9965],
        [ 1.3264,  0.0242,  0.4783,  ...,  1.5483,  1.6223,  0.9764]],
       device='cuda:0')
item_factors.weight tensor([[0.1896, 0.2654, 0.7301,  ..., 0.6167, 0.7760, 0.5583],
        [0.3766, 0.2795, 0.4124,  ..., 0.0858, 0.5077, 0.1554],
        [0.4205, 0.1975, 0.4042,  ..., 0.7896, 0.3579, 0.4026],
        ...,
        [0.3545, 0.3268, 0.3541,  ..., 0.3685, 0.3465, 0.3642],
        [0.4067, 0.3577, 0.3806,  ..., 0.4172, 0.4378, 0.4364],
        [0.4131, 0.3827, 0.3820,  ..., 0.4388, 0.4365, 0.3951]],
       device='cuda:0')


# Result: trained embedding

In [None]:
# Expose the learned movie embedding table as a NumPy array on the CPU (one row per movie index).
trained_movie_embeddings = model.item_factors.weight.detach().cpu().numpy()

In [None]:
# Number of rows in the movie embedding table.
len(trained_movie_embeddings) # unique movie factor weights

9724

# K-means

In [None]:
from sklearn.cluster import KMeans
# Group the movies into 10 clusters based on their learned latent vectors (fixed random_state).
kmeans = KMeans(
    n_clusters=10,
    random_state=0,
).fit(X=trained_movie_embeddings)



## Print the 10 most-rated movies in each cluster
It can be seen here that movies in the same cluster tend to have
similar genres. Also note that the algorithm never sees the movie names or genres;
it obtained these relationships only from the numbers representing how
users have responded to the movie selections.

In [None]:
# Number of ratings per raw movieId, computed once instead of re-scanning
# ratings_df for every single movie.
rating_counts = ratings_df['movieId'].value_counts().to_dict()

for cluster in range(kmeans.n_clusters):
  print("Cluster #{}".format(cluster))
  movs = []
  for movidx in np.where(kmeans.labels_ == cluster)[0]:
    # Embedding row index -> raw movieId.
    movid = train_set.idx2movieid[movidx]
    movs.append((movie_names[movid], rating_counts[movid]))
  # Most-rated movies first; show the top 10 titles of the cluster.
  for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", mov[0])

Cluster #0
	 Godzilla (1998)
	 Richie Rich (1994)
	 Honey, I Blew Up the Kid (1992)
	 Battlefield Earth (2000)
	 Superman IV: The Quest for Peace (1987)
	 Nutty Professor II: The Klumps (2000)
	 Next Karate Kid, The (1994)
	 Karate Kid, Part III, The (1989)
	 Shark Tale (2004)
	 Highlander III: The Sorcerer (a.k.a. Highlander: The Final Dimension) (1994)
Cluster #1
	 Terminator 2: Judgment Day (1991)
	 Toy Story (1995)
	 Batman (1989)
	 Aladdin (1992)
	 Sixth Sense, The (1999)
	 True Lies (1994)
	 Lion King, The (1994)
	 Gladiator (2000)
	 Men in Black (a.k.a. MIB) (1997)
	 Dances with Wolves (1990)
Cluster #2
	 Ace Ventura: Pet Detective (1994)
	 Dumb & Dumber (Dumb and Dumber) (1994)
	 Kill Bill: Vol. 2 (2004)
	 Interview with the Vampire: The Vampire Chronicles (1994)
	 Get Shorty (1995)
	 Ace Ventura: When Nature Calls (1995)
	 Sin City (2005)
	 Starship Troopers (1997)
	 Dogma (1999)
	 40-Year-Old Virgin, The (2005)
Cluster #3
	 Waterworld (1995)
	 Net, The (1995)
	 Cliffhanger (1