In [50]:
import pandas as pd 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [51]:
movies_df = pd.read_csv("ml-latest-small/movies.csv")
ratings_df = pd.read_csv("ml-latest-small/ratings.csv")

In [52]:
print(f"Dimension of movies {movies_df.shape}\nDimension of ratings {ratings_df.shape}")

Dimension of movies (9742, 3)
Dimension of ratings (100836, 4)


In [53]:
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())
movie_names = movies_df.set_index('movieId')['title'].to_dict()

In [54]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [55]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm.notebook import tqdm



class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # create user embeddings
        self.user_factors = torch.nn.Embedding(n_users, n_factors) # think of this as a lookup table for the input.
        # create item embeddings
        self.item_factors = torch.nn.Embedding(n_items, n_factors) # think of this as a lookup table for the input.
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)
        
    def forward(self, data):
        # matrix multiplication
        users, items = data[:,0], data[:,1]
        return (self.user_factors(users)*self.item_factors(items)).sum(1)

    def predict(self, user, item):
        return self.forward(user, item)

In [56]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

class Loader(Dataset):
    def __init__(self):
        self.ratings = ratings_df.copy()
        
        # Extract all user IDs and movie IDs
        users = ratings_df.userId.unique()
        movies = ratings_df.movieId.unique()
        
        #--- Producing new continuous IDs for users and movies ---
        
        # Unique values : index
        self.userid2idx = {o:i for i,o in enumerate(users)}
        self.movieid2idx = {o:i for i,o in enumerate(movies)}
        
        # Obtained continuous ID for users and movies
        self.idx2userid = {i:o for o,i in self.userid2idx.items()}
        self.idx2movieid = {i:o for o,i in self.movieid2idx.items()}
        
        # return the id from the indexed values as noted in the lambda function down below.
        self.ratings.movieId = ratings_df.movieId.apply(lambda x: self.movieid2idx[x])
        self.ratings.userId = ratings_df.userId.apply(lambda x: self.userid2idx[x])
        
        
        self.x = self.ratings.drop(['rating', 'timestamp'], axis=1).values
        self.y = self.ratings['rating'].values
        self.x, self.y = torch.tensor(self.x), torch.tensor(self.y) # Transforms the data to tensors (ready for torch models.)

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.ratings)
     

In [57]:
num_epochs = 200
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# GPU enable if you have a GPU...
if cuda:
    model = model.cuda()

# MSE loss
loss_fn = torch.nn.MSELoss()

# ADAM optimizier
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train data
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)
     

Is running on GPU: False
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0305, 0.0073, 0.0380,  ..., 0.0219, 0.0074, 0.0336],
        [0.0189, 0.0146, 0.0146,  ..., 0.0325, 0.0434, 0.0187],
        [0.0377, 0.0300, 0.0040,  ..., 0.0203, 0.0106, 0.0438],
        ...,
        [0.0027, 0.0424, 0.0046,  ..., 0.0106, 0.0159, 0.0052],
        [0.0056, 0.0256, 0.0232,  ..., 0.0469, 0.0130, 0.0187],
        [0.0497, 0.0004, 0.0367,  ..., 0.0112, 0.0198, 0.0099]])
item_factors.weight tensor([[0.0123, 0.0169, 0.0232,  ..., 0.0400, 0.0498, 0.0355],
        [0.0013, 0.0213, 0.0139,  ..., 0.0043, 0.0237, 0.0086],
        [0.0347, 0.0142, 0.0052,  ..., 0.0289, 0.0135, 0.0360],
        ...,
        [0.0072, 0.0195, 0.0045,  ..., 0.0073, 0.0101, 0.0042],
        [0.0271, 0.0118, 0.0036,  ..., 0.0281, 0.0093, 0.0065],
        [0.0487, 0.0265, 0.0012,  ..., 0.0210, 0.0408, 0.0369]])


In [58]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x12d05216210>

In [59]:
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        if cuda:
            x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            outputs = model(x)
            loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
            losses.append(loss.item())
            loss.backward()
            optimizer.step()
        


  0%|          | 0/200 [00:00<?, ?it/s]

In [60]:
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()
len(trained_movie_embeddings) # unique movie factor weights

9724

In [61]:
from sklearn.cluster import KMeans
# Fit the clusters based on the movie weights
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)

In [62]:
for cluster in range(10):
  print("Cluster #{}".format(cluster))
  movs = []
  for movidx in np.where(kmeans.labels_ == cluster)[0]:
    movid = train_set.idx2movieid[movidx]
    rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
    movs.append((movie_names[movid], rat_count))
  for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", mov[0])

Cluster #0
	 Pulp Fiction (1994)
	 Jurassic Park (1993)
	 American Beauty (1999)
	 Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
	 Pretty Woman (1990)
	 Terminator, The (1984)
	 Fifth Element, The (1997)
	 Net, The (1995)
	 Star Trek: Generations (1994)
	 Up (2009)
Cluster #1
	 Star Wars: Episode V - The Empire Strikes Back (1980)
	 Star Wars: Episode VI - Return of the Jedi (1983)
	 Godfather, The (1972)
	 Good Will Hunting (1997)
	 Austin Powers: The Spy Who Shagged Me (1999)
	 Green Mile, The (1999)
	 Jumanji (1995)
	 Shining, The (1980)
	 Interview with the Vampire: The Vampire Chronicles (1994)
	 Departed, The (2006)
Cluster #2
	 Fugitive, The (1993)
	 Lord of the Rings: The Return of the King, The (2003)
	 Sixth Sense, The (1999)
	 Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
	 Gladiator (2000)
	 Men in Black (a.k.a. MIB) (1997)
	 Die Hard (1988)
	 Princess Bride, The (1987)
	 Monty Python and the Holy Grail (1975)
	 Dumb & Dumber (Dumb and Dumber) (