In [2]:
! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0  1621k      0 --:--:-- --:--:-- --:--:-- 1619k


In [3]:
import zipfile

# Unpack every member of the downloaded archive into the local `data` directory;
# the context manager closes the archive once extraction finishes.
with zipfile.ZipFile('ml-latest-small.zip') as archive:
    archive.extractall(path='data')

In [4]:
import pandas as pd
# Read the two tables that were extracted under data/ml-latest-small/.
# movies_df: one row per movie (movieId, title, genres).
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
# ratings_df: one row per rating event (userId, movieId, rating, timestamp).
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

In [5]:
# Report (rows, columns) for both frames as a quick sanity check on the load.
print('The dimensions of movies dataframe are:', movies_df.shape,'\nThe dimensions of ratings dataframe are:', ratings_df.shape)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [6]:
# Preview the first rows of the movies table.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Preview the first rows of the ratings table.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [20]:
# Lookup table from MovieLens movieId to the movie's title.
movie_names = dict(zip(movies_df['movieId'], movies_df['title']))
# Number of distinct users and distinct movies that appear in the ratings;
# these size the embedding tables of the model.
n_users = ratings_df['userId'].unique().size
n_items = ratings_df['movieId'].unique().size

In [9]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Dot-product matrix factorization over user and item embeddings.

    Every user index and every item index owns an ``n_factors``-wide embedding
    row; the score of a (user, item) pair is the dot product of the two rows.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: width of each embedding row (default 20).
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # Embedding layers act as lookup tables: index -> factor vector.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Start from small non-negative weights drawn uniformly from [0, 0.05].
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: 2-D integer tensor; column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one score per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        return self.predict(users, items)

    def predict(self, user, item):
        """Score user/item index tensors given separately.

        The previous implementation forwarded two positional arguments to
        ``forward``, which only accepts one, so every call raised ``TypeError``.

        Args:
            user: integer tensor of user indices (scalar or 1-D).
            item: integer tensor of item indices, same shape as ``user``.

        Returns:
            Tensor of dot-product scores with the shape of ``user``.
        """
        # Summing over the last axis works for both scalar and batched indices.
        return (self.user_factors(user) * self.item_factors(item)).sum(-1)

In [10]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

# Note: This isn't good practice in an MLOps sense, but we'll roll with it since the data is already loaded in memory.
class Loader(Dataset):
    """Torch dataset of (user index, movie index) -> rating samples.

    Raw user and movie ids are remapped to contiguous 0-based indices (in order
    of first appearance) so they can be used directly as embedding-table rows.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
            When omitted, the notebook-level ``ratings_df`` is used, which
            keeps the original zero-argument ``Loader()`` call working.
    """

    def __init__(self, ratings=None):
        if ratings is None:
            # Backward-compatible default: the frame loaded earlier in the notebook.
            ratings = ratings_df
        # Work on a copy so the caller's frame is never modified.
        self.ratings = ratings.copy()

        # Distinct raw ids, in order of first appearance.
        users = ratings.userId.unique()
        movies = ratings.movieId.unique()

        # Raw id -> contiguous index.
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(movies)}

        # Contiguous index -> raw id (inverse of the maps above).
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Replace the raw ids in the copy with their contiguous indices.
        self.ratings['movieId'] = ratings['movieId'].apply(lambda x: self.movieid2idx[x])
        self.ratings['userId'] = ratings['userId'].apply(lambda x: self.userid2idx[x])

        # Select the feature columns by name so the (user, movie) column order
        # does not depend on the frame's layout or on a `timestamp` column existing.
        self.x = torch.tensor(self.ratings[['userId', 'movieId']].values)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the ``(features, rating)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating samples."""
        return len(self.ratings)

In [13]:
# Training configuration and device detection.
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Build the model with 8 latent factors and show its freshly initialised,
# trainable weights (printed before any move to the GPU).
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
for name, param in filter(lambda named: named[1].requires_grad, model.named_parameters()):
    print(name, param.data)

# Move the model to the GPU when one is available.
model = model.cuda() if cuda else model

# Mean-squared-error loss between predicted and observed ratings.
loss_fn = torch.nn.MSELoss()

# Adam optimizer over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training data, served in shuffled mini-batches of 128 samples.
train_set = Loader()
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0176, 0.0262, 0.0456,  ..., 0.0299, 0.0452, 0.0469],
        [0.0221, 0.0179, 0.0022,  ..., 0.0053, 0.0177, 0.0312],
        [0.0215, 0.0076, 0.0191,  ..., 0.0279, 0.0140, 0.0139],
        ...,
        [0.0474, 0.0375, 0.0351,  ..., 0.0264, 0.0450, 0.0228],
        [0.0440, 0.0220, 0.0468,  ..., 0.0254, 0.0289, 0.0247],
        [0.0023, 0.0468, 0.0142,  ..., 0.0280, 0.0459, 0.0090]])
item_factors.weight tensor([[0.0140, 0.0464, 0.0302,  ..., 0.0364, 0.0183, 0.0265],
        [0.0037, 0.0172, 0.0456,  ..., 0.0440, 0.0080, 0.0076],
        [0.0101, 0.0061, 0.0160,  ..., 0.0482, 0.0282, 0.0455],
        ...,
        [0.0211, 0.0132, 0.0317,  ..., 0.0198, 0.0031, 0.0138],
        [0.0364, 0.0226, 0.0367,  ..., 0.0369, 0.0068, 0.0272],
        [0.0162, 0.0161, 0.0308,  ..., 0.0101, 0.0027, 0.0029]])


In [14]:
# Train for `num_epochs` passes over `train_loader`, printing the mean
# mini-batch loss after every epoch.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer is GPU-specific. The optimisation step below
        # must run on every machine: it used to sit inside this `if`, so a CPU
        # run trained nothing and then divided by zero when averaging `losses`.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        outputs = model(x)
        # Ratings are cast to float32 to match the model output dtype.
        loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.061406864127532
iter #1 Loss: 4.753057855034843
iter #2 Loss: 2.4790240341636736
iter #3 Loss: 1.7230067346906903
iter #4 Loss: 1.3466473499379181
iter #5 Loss: 1.1286359407423716
iter #6 Loss: 0.991532363564835
iter #7 Loss: 0.9004663449102247
iter #8 Loss: 0.8371407532601187
iter #9 Loss: 0.7923644043996855
iter #10 Loss: 0.7589838494201602
iter #11 Loss: 0.7348762601613998
iter #12 Loss: 0.7160297317931494
iter #13 Loss: 0.7016714270117924
iter #14 Loss: 0.6902654071144646
iter #15 Loss: 0.6816527323069306
iter #16 Loss: 0.6749640975506778
iter #17 Loss: 0.6693908849419071
iter #18 Loss: 0.6659004348259286
iter #19 Loss: 0.6625959403094301
iter #20 Loss: 0.660437759409096
iter #21 Loss: 0.6589065926329134
iter #22 Loss: 0.6574845421435264
iter #23 Loss: 0.6564133587449336
iter #24 Loss: 0.6555166795169036
iter #25 Loss: 0.6546754127150864
iter #26 Loss: 0.6536313985583141
iter #27 Loss: 0.6528482535466325
iter #28 Loss: 0.6515921574029221
iter #29 Loss: 0.6504349321

In [15]:
# Print every trainable parameter and keep references to the weights:
# the first trainable parameter goes to `uw`, any later one to `iw`.
uw = 0
iw = 0
c = 0  # becomes 1 once the first trainable parameter has been stored in `uw`
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c != 0:
        iw = param.data
        continue
    uw = param.data
    c += 1

user_factors.weight tensor([[ 1.5516,  0.9036,  0.9958,  ...,  1.5238,  0.8910,  1.4335],
        [ 1.1734,  2.4320,  0.0904,  ...,  0.0296,  1.3883,  1.7327],
        [ 1.7465,  0.1575, -1.1005,  ..., -1.1487, -1.2643,  2.3014],
        ...,
        [ 1.5832,  0.0089,  1.0928,  ...,  1.6140,  2.4329,  0.5983],
        [ 0.8168,  1.1328,  0.7828,  ...,  1.1378,  1.2121,  0.4682],
        [ 1.1434,  2.0293,  1.1495,  ...,  0.7112,  0.6585,  1.5741]],
       device='cuda:0')
item_factors.weight tensor([[ 0.5281,  0.5306,  0.4984,  ...,  0.3576,  0.2877,  0.8186],
        [-0.1181,  0.8020,  0.6135,  ...,  0.2927,  0.2457,  0.0472],
        [ 0.2706,  0.7215,  0.6235,  ...,  0.5761,  0.6409,  0.5592],
        ...,
        [ 0.3470,  0.3395,  0.3583,  ...,  0.3379,  0.3245,  0.3400],
        [ 0.4105,  0.3980,  0.4117,  ...,  0.4129,  0.3800,  0.4023],
        [ 0.4047,  0.4047,  0.4192,  ...,  0.4030,  0.3911,  0.3909]],
       device='cuda:0')


In [16]:
# Copy the item-embedding weights to host memory and expose them as a NumPy
# array (one row per item index) for use outside of torch.
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()

In [17]:
# Number of rows in the embedding matrix, i.e. how many items were embedded.
len(trained_movie_embeddings)

9724

In [18]:
from sklearn.cluster import KMeans
# Group the movie embeddings into 10 clusters with a fixed seed.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', so leaving it implicit gives different clusters (and a
# FutureWarning on some versions) depending on the installed release.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0).fit(trained_movie_embeddings)



In [21]:
# For every cluster, list its 10 most-rated movies.
# Rating counts per movieId are computed once up front instead of filtering
# the whole ratings frame for every single movie. This also avoids the
# deprecated positional `.count()[0]` lookup on a label-indexed Series.
rating_counts = ratings_df['movieId'].value_counts()

# Iterate over the clusters the fitted model actually has rather than a
# hard-coded 10, so the report stays in sync with the KMeans configuration.
for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Map the embedding row back to the original MovieLens movieId.
        movid = train_set.idx2movieid[movidx]
        movs.append((movie_names[movid], rating_counts[movid]))
    # Most-rated first; the sort is stable, so ties keep their original order.
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])

Cluster #0
	 Independence Day (a.k.a. ID4) (1996)
	 Ace Ventura: Pet Detective (1994)
	 Inception (2010)
	 Terminator, The (1984)
	 Twister (1996)
	 Waterworld (1995)
	 Outbreak (1995)
	 Santa Clause, The (1994)
	 Dark Knight Rises, The (2012)
	 Django Unchained (2012)
Cluster #1
	 Jurassic Park (1993)
	 Toy Story (1995)
	 Apollo 13 (1995)
	 Fugitive, The (1993)
	 Batman (1989)
	 Aladdin (1992)
	 Lion King, The (1994)
	 Back to the Future (1985)
	 Speed (1994)
	 Gladiator (2000)
Cluster #2
	 Pulp Fiction (1994)
	 Terminator 2: Judgment Day (1991)
	 American Beauty (1999)
	 Godfather, The (1972)
	 Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
	 Memento (2000)
	 Alien (1979)
	 Léon: The Professional (a.k.a. The Professional) (Léon) (1994)
	 One Flew Over the Cuckoo's Nest (1975)
	 Kill Bill: Vol. 1 (2003)
Cluster #3
	 Shawshank Redemption, The (1994)
	 Silence of the Lambs, The (1991)
	 Matrix, The (1999)
	 Star Wars: Episode IV - A New Hope (1977)
	 Braveheart (1995)
	 Schindler's List (199