In [1]:
import requests

# Download the MovieLens "latest-small" archive into the working directory.
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

# `timeout` keeps a stalled connection from hanging the kernel forever
# (requests applies no timeout unless one is given). The `with` block releases
# the streamed connection even if writing the file fails part-way through.
with requests.get(url, stream=True, timeout=30) as response:
    if response.status_code == 200:
        with open("ml-latest-small.zip", "wb") as f:
            # Stream in 64 KiB chunks so the archive is never fully held in memory.
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
        print("Download complete!")
    else:
        print(f"Failed to download. Status code: {response.status_code}")


Download complete!


In [2]:
import zipfile
# Unpack the downloaded archive under ./data; later cells read the CSV files
# from data/ml-latest-small/.
with zipfile.ZipFile('ml-latest-small.zip') as archive:
    archive.extractall(path='data')


In [3]:
# import the dataset
import pandas as pd
# Read the two tables used in this notebook from the extracted archive.
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')

In [4]:
# Report (rows, columns) for both tables in a single print call.
shape_report = (
    'The dimensions of movies dataframe are:', movies_df.shape,
    '\nThe dimensions of ratings dataframe are:', ratings_df.shape,
)
print(*shape_report)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [5]:
# Preview the first five rows of the movies table.
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Preview the first five rows of the ratings table.
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
# movieId -> title lookup, used later to label clustered movies.
titles_by_id = movies_df.set_index('movieId')['title']
movie_names = titles_by_id.to_dict()

# Size of the (user x movie) rating matrix versus how many cells are observed.
n_users = ratings_df.userId.unique().size
n_items = ratings_df.movieId.unique().size
n_ratings = len(ratings_df)
n_cells = n_users*n_items

print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_cells, 'elements.')
print('----------')
print("Number of ratings:", n_ratings)
print("Therefore: ", n_ratings / n_cells * 100, '% of the matrix is filled.')
print("We have an incredibly sparse matrix to work with here.")

Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.


In [9]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

In [30]:
# import torch
# import numpy as np
# from torch.autograd import Variable
# from tqdm import tqdm_notebook as tqdm
# class MatrixFactorization(torch.nn.Module):
#   def __init__(self,n_users,n_items,n_factors=20):
#       super().__init__()
#       #embedding values
#       self.user_factors=torch.nn.Embedding(n_users,n_factors)
#       self.item_factors=torch.nn.Embedding(n_items,n_factors)
#       # b/w 0->0.05
#       self.user_factors.weight.data.uniform_(0,0.05)
#       self.item_factors.weight.data.uniform_(0,0.05)
#       #put data
#       def forward(self, data):
#         users, items = data[:, 0], data[:, 1]
#         return (self.user_factors(users) * self.item_factors(items)).sum(1)

#       def predict(self, user, item):
#         return self.forward(torch.tensor([[user, item]]))
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Matrix-factorization rating model built from two embedding tables.

    The predicted rating for a (user, item) pair is the dot product of the
    user's and the item's latent-factor vectors.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: length of each latent-factor vector.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # Embedding tables: one latent vector per user / per item.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Overwrite the default init with small uniform values in [0, 0.05].
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: integer tensor whose column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one dot-product score per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score a single (user index, item index) pair.

        ``forward`` takes one batched tensor, so the pair is wrapped into a
        1x2 batch. The tensor is created on the same device as the embedding
        weights so the call also works after the model has been moved to GPU.

        Returns:
            Tensor of shape ``(1,)`` holding the predicted score.
        """
        device = self.user_factors.weight.device
        pair = torch.tensor([[user, item]], dtype=torch.long, device=device)
        return self.forward(pair)

In [31]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
class Loader(Dataset):
    """Torch dataset of (user index, movie index) -> rating samples.

    Raw ``userId`` / ``movieId`` values are re-mapped to contiguous indices
    starting at 0 (in order of first appearance) so they can be used directly
    as embedding-table row indices.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
            When omitted, the notebook-level ``ratings_df`` is used, which keeps
            the original ``Loader()`` call working.

    Attributes set by ``__init__``:
        userid2idx / movieid2idx: raw id -> contiguous index.
        idx2userid / idx2movieid: contiguous index -> raw id.
        x: integer tensor with columns (user index, movie index).
        y: tensor of ratings aligned with the rows of ``x``.
    """

    def __init__(self, ratings=None):
        if ratings is None:
            ratings = ratings_df
        # Work on a copy so the caller's frame keeps its raw ids.
        self.ratings = ratings.copy()

        # Extract all user IDs and movie IDs
        users = ratings.userId.unique()
        movies = ratings.movieId.unique()

        # Unique values : index
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(movies)}

        # Inverse mappings (index : original id)
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Replace raw ids with their contiguous indices.
        self.ratings['movieId'] = ratings.movieId.map(self.movieid2idx)
        self.ratings['userId'] = ratings.userId.map(self.userid2idx)

        # Select the feature columns by name so that column 0 is always the
        # user and column 1 the movie, regardless of the frame's column order
        # or which extra columns (e.g. timestamp) it carries.
        features = self.ratings[['userId', 'movieId']].values
        targets = self.ratings['rating'].values
        self.x = torch.tensor(features, dtype=torch.long)
        self.y = torch.tensor(targets)

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating rows."""
        return len(self.ratings)

In [32]:
# Training configuration: everything a reader might want to tune lives here.
num_epochs = 128
batch_size = 128
learning_rate = 1e-3
seed = 0
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Seed torch's RNG before the model is created so that the embedding
# initialisation and the DataLoader shuffling repeat on a fresh re-run.
torch.manual_seed(seed)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
# Show the freshly initialised (untrained) parameters for later comparison.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# GPU enable if you have a GPU...
if cuda:
    model = model.cuda()

# MSE loss
loss_fn = torch.nn.MSELoss()

# ADAM optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train data
train_set = Loader()
train_loader = DataLoader(train_set, batch_size, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0049, 0.0464, 0.0213,  ..., 0.0261, 0.0100, 0.0389],
        [0.0117, 0.0103, 0.0484,  ..., 0.0107, 0.0141, 0.0149],
        [0.0265, 0.0485, 0.0051,  ..., 0.0317, 0.0114, 0.0220],
        ...,
        [0.0371, 0.0302, 0.0277,  ..., 0.0045, 0.0366, 0.0161],
        [0.0250, 0.0357, 0.0180,  ..., 0.0177, 0.0436, 0.0244],
        [0.0124, 0.0179, 0.0328,  ..., 0.0175, 0.0486, 0.0003]])
item_factors.weight tensor([[0.0013, 0.0182, 0.0479,  ..., 0.0404, 0.0496, 0.0264],
        [0.0405, 0.0459, 0.0302,  ..., 0.0316, 0.0483, 0.0391],
        [0.0133, 0.0227, 0.0083,  ..., 0.0229, 0.0482, 0.0018],
        ...,
        [0.0167, 0.0397, 0.0326,  ..., 0.0314, 0.0020, 0.0203],
        [0.0186, 0.0128, 0.0123,  ..., 0.0351, 0.0019, 0.0084],
        [0.0051, 0.0067, 0.0065,  ..., 0.0443, 0.0383, 0.0046]])


In [33]:
# Train for `num_epochs` passes over the data, reporting the mean batch loss
# of each epoch.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA. The optimisation step below
        # must run on every batch; when it was nested under `if cuda:` a CPU
        # run trained nothing and then divided by zero on the empty loss list.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        outputs = model(x)
        # The model already returns a 1-D tensor (one score per row), so no
        # squeeze is needed; ratings are cast to float32 to match the outputs.
        loss = loss_fn(outputs, y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.055665224941855
iter #1 Loss: 4.7355898803260725
iter #2 Loss: 2.4719182730931317
iter #3 Loss: 1.7203556482259392
iter #4 Loss: 1.3452627236770494
iter #5 Loss: 1.1282564162602884
iter #6 Loss: 0.9911824647091367
iter #7 Loss: 0.9001990157335543
iter #8 Loss: 0.8373006047482418
iter #9 Loss: 0.7923154956495702
iter #10 Loss: 0.7593954129116184
iter #11 Loss: 0.7348760512425815
iter #12 Loss: 0.7158877621523014
iter #13 Loss: 0.7015963443720401
iter #14 Loss: 0.6903973235772346
iter #15 Loss: 0.6818590374843119
iter #16 Loss: 0.6749937386455269
iter #17 Loss: 0.6701215150846442
iter #18 Loss: 0.6659191184237524
iter #19 Loss: 0.6626495410343112
iter #20 Loss: 0.660890051442657
iter #21 Loss: 0.6588725992265692
iter #22 Loss: 0.6577936382160574
iter #23 Loss: 0.6568964122591285
iter #24 Loss: 0.655817092486142
iter #25 Loss: 0.6549379662586953
iter #26 Loss: 0.6546381114778785
iter #27 Loss: 0.653509720441351
iter #28 Loss: 0.6527295968102925
iter #29 Loss: 0.6515142640

In [35]:
# Training has tuned the latent factors for users and movies; print them and
# keep references. The first trainable parameter yielded is stored in `uw`,
# and every later one overwrites `iw`.
c, uw, iw = 0, 0, 0
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c != 0:
        iw = param.data
        continue
    uw = param.data
    c += 1

user_factors.weight tensor([[ 1.7813,  1.2968,  0.8197,  ...,  1.2635,  0.6912,  1.4909],
        [ 0.9785,  1.8946,  1.7527,  ...,  0.8575,  0.7408,  0.4241],
        [ 1.4550,  1.7212,  0.2887,  ..., -1.7628,  1.8262, -0.6393],
        ...,
        [-0.7748,  1.2939,  0.4238,  ...,  2.6884,  0.9873,  0.4789],
        [ 0.5493,  1.0023,  1.3017,  ...,  1.0412,  1.4463,  0.3876],
        [ 1.3998,  0.9526,  1.5606,  ...,  1.6479,  1.4782,  1.0144]],
       device='cuda:0')
item_factors.weight tensor([[0.6878, 0.2468, 0.6149,  ..., 0.3814, 0.3908, 0.4962],
        [0.4078, 0.4611, 0.1653,  ..., 0.2162, 0.3968, 0.8617],
        [0.3492, 0.3684, 0.5914,  ..., 0.7954, 0.3991, 0.4390],
        ...,
        [0.3488, 0.3711, 0.3660,  ..., 0.3642, 0.3351, 0.3490],
        [0.4204, 0.4142, 0.4160,  ..., 0.4375, 0.4048, 0.4089],
        [0.3780, 0.3806, 0.3809,  ..., 0.4183, 0.4122, 0.3752]],
       device='cuda:0')


In [36]:
# Copy the learned movie factors to host memory as a NumPy array (one row per movie index).
trained_movie_embeddings = model.item_factors.weight.detach().cpu().numpy()

In [37]:
# Number of movie embedding rows.
trained_movie_embeddings.shape[0]

9724

In [38]:
from sklearn.cluster import KMeans
# Group movies into 10 clusters using only their learned embedding vectors.
# `fit` returns the estimator itself, so `kmeans` is the fitted model.
clusterer = KMeans(n_clusters=10, random_state=0)
kmeans = clusterer.fit(trained_movie_embeddings)

In [39]:
# It can be seen here that the movies that are in the same cluster tend to have
# similar genres. Also note that the algorithm is unfamiliar with the movie name
# and only obtained the relationships by looking at the numbers representing how
# users have responded to the movie selections.

# Count ratings per movie once, instead of re-scanning the whole ratings table
# for every movie inside the loop.
rating_counts = ratings_df['movieId'].value_counts().to_dict()

# Iterate over the fitted model's own cluster count so this cell stays in sync
# with the KMeans configuration.
for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    # Find movie indices belonging to the current cluster
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Translate the embedding row index back to the original movieId.
        movid = train_set.idx2movieid[movidx]
        movs.append((movie_names[movid], rating_counts.get(movid, 0)))
    # Sort movies by rating count in descending order and print top 10
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])


Cluster #0
	 Stargate (1994)
	 Batman Forever (1995)
	 GoldenEye (1995)
	 Waterworld (1995)
	 Net, The (1995)
	 Clueless (1995)
	 Crimson Tide (1995)
	 Cliffhanger (1993)
	 While You Were Sleeping (1995)
	 Demolition Man (1993)
Cluster #1
	 Lord of the Rings: The Fellowship of the Ring, The (2001)
	 Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
	 Memento (2000)
	 Dark Knight, The (2008)
	 Inception (2010)
	 Eternal Sunshine of the Spotless Mind (2004)
	 Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
	 Willy Wonka & the Chocolate Factory (1971)
	 Fifth Element, The (1997)
	 Breakfast Club, The (1985)
Cluster #2
	 Jurassic Park (1993)
	 Apollo 13 (1995)
	 Star Wars: Episode VI - Return of the Jedi (1983)
	 Fugitive, The (1993)
	 Batman (1989)
	 Aladdin (1992)
	 Back to the Future (1985)
	 Speed (1994)
	 Shrek (2001)
	 Men in Black (a.k.a. MIB) (1997)
Cluster #3
	 Pulp Fiction (1994)
	 Fight Club (1999)
	 American Beauty (1999)
	 Seven (a.k.a. Se7en) (1995)
	 Fargo (1996)
	 Ace Ventura: