In [1]:
# Data Citation:
# F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on
# Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19.

# Fetch the MovieLens "latest-small" archive with curl and write it to
# ml-latest-small.zip in the current working directory (needs network access
# and a curl binary on the PATH).
! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0  955k    0  2283    0     0   1635      0  0:09:58  0:00:01  0:09:57  1638
  9  955k    9 91883    0     0  39941      0  0:00:24  0:00:02  0:00:22 40001
 82  955k   82  788k    0     0   245k      0  0:00:03  0:00:03 --:--:--  245k
100  955k  100  955k    0     0   296k      0  0:00:03  0:00:03 --:--:--  296k


In [2]:
import zipfile
# Unpack every member of the downloaded archive into the local 'data' folder;
# the context manager closes the zip handle once extraction is finished.
with zipfile.ZipFile('ml-latest-small.zip', mode='r') as archive:
    archive.extractall(path='data')

In [4]:
import pandas as pd
# Load the two tables used in this notebook from the extracted archive:
# movies.csv (movieId, title, genres) and ratings.csv (userId, movieId, rating, timestamp).
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')
     

In [5]:
# Report (rows, columns) for both frames as a quick sanity check on the load.
print('The dimensions of movies dataframe are:', movies_df.shape,'\nThe dimensions of ratings dataframe are:', ratings_df.shape)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [7]:
# Preview the first rows of the movies table (movieId, title, pipe-separated genres).
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Preview the first rows of the ratings table (userId, movieId, rating, timestamp).
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
# Lookup table movieId -> title, used later to label movies by name.
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Count distinct users and movies that actually occur in the ratings table.
# These counts size the embedding tables, so they are derived the same way
# the dataset class enumerates ids (via .unique()).
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())
print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_users*n_items, 'elements.')
print('----------')
print("Number of ratings:", len(ratings_df))
# Fill ratio = observed ratings / all possible (user, movie) pairs, as a percentage.
print("Therefore: ", len(ratings_df) / (n_users*n_items) * 100, '% of the matrix is filled.')
print("We have an incredibly sparse matrix to work with here.")
# The dense matrix has n_users * n_items cells, so it grows multiplicatively
# (not "by n*2") as users and products are added.
print("And... as you can imagine, as the number of users and products grow, the number of elements grows as n_users * n_items (roughly quadratically)")
print("You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.")
print("One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data")

Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [11]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Plain matrix-factorization recommender.

    Each user and each item gets an ``n_factors``-dimensional embedding; the
    predicted rating for a (user, item) pair is the dot product of the two
    embeddings.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: embedding dimensionality (default 20).
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()

        # One embedding table per side of the rating matrix.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)

        # Start from small positive values drawn uniformly from [0, 0.05).
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: integer tensor whose column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one dot-product score per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        # Row-wise dot product: elementwise multiply, then sum over factors.
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score user/item indices given separately.

        ``forward`` expects a single two-column tensor, so the indices are
        packed into that layout first (the original forwarded two positional
        arguments and raised ``TypeError``).

        Args:
            user: user index or 1-D sequence/tensor of user indices.
            item: item index or 1-D sequence/tensor of item indices, same
                length as ``user``.

        Returns:
            1-D tensor of predicted scores, one per (user, item) pair.
        """
        # Build the indices on the same device as the embedding weights.
        device = self.user_factors.weight.device
        user = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        item = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        return self.forward(torch.stack((user, item), dim=1))

In [12]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

# Note: This isn't 'good' practice, in a MLops sense but we'll roll with this since the data is already loaded in memory.
class Loader(Dataset):
    """Torch dataset of (user index, movie index) -> rating samples.

    Raw ``userId`` / ``movieId`` values are re-encoded as contiguous
    zero-based indices (in order of first appearance) so they can address
    embedding rows directly.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns. When omitted, the notebook-level ``ratings_df`` is used,
            so the existing ``Loader()`` call keeps working.

    Attributes set on the instance:
        ratings: copy of the input frame with ids replaced by indices.
        userid2idx / movieid2idx: raw id -> contiguous index.
        idx2userid / idx2movieid: contiguous index -> raw id.
        x: integer tensor, one row per rating, columns (user idx, movie idx).
        y: tensor of rating values aligned with ``x``.
    """

    def __init__(self, ratings=None):
        # Default to the frame already loaded in the notebook namespace.
        if ratings is None:
            ratings = ratings_df
        self.ratings = ratings.copy()

        # Distinct raw ids, in order of first appearance.
        users = ratings['userId'].unique()
        movies = ratings['movieId'].unique()

        # Raw id -> contiguous index.
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(movies)}

        # Contiguous index -> raw id (used to translate results back).
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Replace the raw ids in the working copy with their indices.
        self.ratings['movieId'] = ratings['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = ratings['userId'].map(self.userid2idx)

        # Select the feature columns by name so extra columns in the input
        # frame cannot leak into (or reorder) the model inputs.
        features = self.ratings[['userId', 'movieId']].values
        targets = self.ratings['rating'].values
        self.x, self.y = torch.tensor(features), torch.tensor(targets)

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Number of rating rows in the dataset."""
        return len(self.ratings)

In [13]:
# Epoch budget for training (was misspelled `um_epochs`, which left the
# intended variable undefined in this cell).
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Build the factorization model with 8 latent factors per user/movie.
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
# Show the freshly initialised trainable parameters.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# Move the model to the GPU when one is available.
if cuda:
    model = model.cuda()

# MSE loss
loss_fn = torch.nn.MSELoss()

# Adam optimizer (created after the optional .cuda() move so it tracks the
# parameters that will actually be trained).
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training data served in shuffled mini-batches of 128 samples.
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)

Is running on GPU: False
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0467, 0.0319, 0.0070,  ..., 0.0107, 0.0044, 0.0497],
        [0.0496, 0.0168, 0.0038,  ..., 0.0199, 0.0245, 0.0019],
        [0.0373, 0.0286, 0.0023,  ..., 0.0252, 0.0486, 0.0458],
        ...,
        [0.0120, 0.0394, 0.0307,  ..., 0.0071, 0.0481, 0.0200],
        [0.0239, 0.0055, 0.0484,  ..., 0.0435, 0.0096, 0.0325],
        [0.0399, 0.0072, 0.0494,  ..., 0.0490, 0.0044, 0.0276]])
item_factors.weight tensor([[0.0026, 0.0024, 0.0243,  ..., 0.0300, 0.0493, 0.0394],
        [0.0019, 0.0196, 0.0397,  ..., 0.0078, 0.0437, 0.0240],
        [0.0327, 0.0099, 0.0072,  ..., 0.0362, 0.0338, 0.0320],
        ...,
        [0.0495, 0.0093, 0.0205,  ..., 0.0368, 0.0440, 0.0020],
        [0.0456, 0.0152, 0.0362,  ..., 0.0273, 0.0353, 0.0286],
        [0.0399, 0.0150, 0.0020,  ..., 0.0186, 0.0107, 0.0195]])


In [16]:
# Import necessary libraries
from tqdm import tqdm
import torch

# How many full passes over the training data to run.
num_epochs = 10  # Set this to the desired number of epochs

# Relies on objects built in earlier cells:
# model, optimizer, loss_fn, train_loader, and cuda (GPU availability flag).

for epoch in tqdm(range(num_epochs)):  # tqdm renders the per-epoch progress bar
    losses = []  # per-batch loss values for this epoch
    for batch_x, batch_y in train_loader:
        # Keep the batch on the same device as the model.
        if cuda:
            batch_x = batch_x.cuda()
            batch_y = batch_y.cuda()

        # Standard step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        preds = model(batch_x).squeeze()
        targets = batch_y.type(torch.float32)
        loss = loss_fn(preds, targets)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()

    # Mean of the batch losses collected during this epoch.
    epoch_loss = sum(losses) / len(losses)
    print("iter #{}".format(epoch), "Loss:", epoch_loss)


 10%|████████▎                                                                          | 1/10 [00:02<00:23,  2.59s/it]

iter #0 Loss: 11.070508123049276


 20%|████████████████▌                                                                  | 2/10 [00:04<00:19,  2.47s/it]

iter #1 Loss: 4.744322446397113


 30%|████████████████████████▉                                                          | 3/10 [00:07<00:16,  2.40s/it]

iter #2 Loss: 2.4742433536173727


 40%|█████████████████████████████████▏                                                 | 4/10 [00:09<00:14,  2.36s/it]

iter #3 Loss: 1.7206287076176725


 50%|█████████████████████████████████████████▌                                         | 5/10 [00:12<00:12,  2.43s/it]

iter #4 Loss: 1.3456823957450499


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:14<00:09,  2.40s/it]

iter #5 Loss: 1.1282073735888234


 70%|██████████████████████████████████████████████████████████                         | 7/10 [00:16<00:07,  2.43s/it]

iter #6 Loss: 0.9914843270621324


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [00:19<00:04,  2.43s/it]

iter #7 Loss: 0.9004127735716437


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [00:21<00:02,  2.44s/it]

iter #8 Loss: 0.8372858684831465


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:24<00:00,  2.46s/it]

iter #9 Loss: 0.7922594704603786





In [17]:
# Print every trainable parameter and keep the first one seen in `uw` and
# any later one in `iw`; `c` records whether the first has been captured.
c, uw, iw = 0, 0, 0
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c != 0:
        # First parameter already stored, so this one goes to `iw`.
        iw = param.data
        continue
    uw = param.data
    c += 1

user_factors.weight tensor([[1.2977, 1.2733, 1.2599,  ..., 1.2630, 1.2706, 1.2864],
        [1.0415, 1.0143, 0.9919,  ..., 1.0199, 1.0132, 1.0043],
        [0.6200, 0.6192, 0.5671,  ..., 0.6157, 0.6153, 0.6018],
        ...,
        [0.9778, 0.9970, 0.9951,  ..., 0.9543, 1.0074, 0.9900],
        [0.9670, 0.9370, 0.9887,  ..., 0.9694, 0.9451, 0.9660],
        [1.2015, 1.1729, 1.2145,  ..., 1.2281, 1.2031, 1.2076]])
item_factors.weight tensor([[0.4630, 0.4663, 0.4864,  ..., 0.4873, 0.4936, 0.5057],
        [0.3657, 0.3726, 0.3963,  ..., 0.3652, 0.3956, 0.3878],
        [0.4937, 0.4784, 0.4732,  ..., 0.4914, 0.4871, 0.4914],
        ...,
        [0.2625, 0.2224, 0.2332,  ..., 0.2497, 0.2576, 0.2149],
        [0.2607, 0.2303, 0.2512,  ..., 0.2424, 0.2509, 0.2436],
        [0.2495, 0.2246, 0.2115,  ..., 0.2283, 0.2208, 0.2291]])


In [18]:
# Pull the learned movie embedding table out of the model as a NumPy array:
# detach from autograd, make sure it lives on the CPU, then convert.
item_weights = model.item_factors.weight.detach()
trained_movie_embeddings = item_weights.cpu().numpy()
     

In [19]:
# Number of rows in the embedding matrix, i.e. one factor vector per unique movie.
# (A mangled, duplicated paste of the KMeans cell that followed this line was a
# syntax error and has been removed; clustering is done in its own cell.)
len(trained_movie_embeddings)
     

9724

In [21]:
from sklearn.cluster import KMeans
# Group the movies into 10 clusters using their learned embedding vectors;
# the fixed random_state keeps the centroid initialisation repeatable.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans = kmeans.fit(trained_movie_embeddings)
     

In [22]:
# Tally ratings per movie once, instead of re-scanning the whole ratings table
# for every movie inside the loop below.
rating_counts = ratings_df['movieId'].value_counts().to_dict()

# Walk every cluster the fitted model actually has (no hard-coded count).
for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    # Embedding-row indices of the movies assigned to this cluster.
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Translate the contiguous index back to the original movieId.
        movid = train_set.idx2movieid[movidx]
        # Pair the title with its rating count (0 if the movie was never rated).
        movs.append((movie_names[movid], rating_counts.get(movid, 0)))
    # Show the 10 most-rated movies of the cluster, most popular first.
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])

Cluster #0
	 Coneheads (1993)
	 Mortal Kombat (1995)
	 Flintstones, The (1994)
	 Striptease (1996)
	 Hollow Man (2000)
	 Free Willy (1993)
	 Showgirls (1995)
	 Hulk (2003)
	 Daredevil (2003)
	 Bio-Dome (1996)
Cluster #1
	 Jurassic Park (1993)
	 Aladdin (1992)
	 True Lies (1994)
	 Back to the Future (1985)
	 Speed (1994)
	 Shrek (2001)
	 Dances with Wolves (1990)
	 Pirates of the Caribbean: The Curse of the Black Pearl (2003)
	 Beauty and the Beast (1991)
	 Die Hard (1988)
Cluster #2
	 Anaconda (1997)
	 Speed 2: Cruise Control (1997)
	 Battlefield Earth (2000)
	 Superman IV: The Quest for Peace (1987)
	 Karate Kid, Part III, The (1989)
	 Sister Act 2: Back in the Habit (1993)
	 Ultraviolet (2006)
	 Dungeons & Dragons (2000)
	 Rambo III (1988)
	 Problem Child (1990)
Cluster #3
	 Ace Ventura: Pet Detective (1994)
	 Mask, The (1994)
	 Star Wars: Episode I - The Phantom Menace (1999)
	 Batman Forever (1995)
	 Dumb & Dumber (Dumb and Dumber) (1994)
	 Waterworld (1995)
	 Net, The (1995)
	 Cli