In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
import pandas as pd
import numpy as np
import zipfile
from sklearn.cluster import KMeans

In [3]:
!curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0  2112k      0 --:--:-- --:--:-- --:--:-- 2113k


In [4]:
with zipfile.ZipFile('ml-latest-small.zip', 'r') as zip_ref:
  zip_ref.extractall('data')

In [5]:
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

In [6]:
print("movie dim", movies_df.shape)
print("rating dim", ratings_df.shape)

movie dim (9742, 3)
rating dim (100836, 4)


In [7]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
num_users = len(ratings_df['userId'].unique())
num_movies = len(ratings_df['movieId'].unique())
print("num users", num_users)
print("num movies", num_movies)

num users 610
num movies 9724


In [73]:
trainset = ratings_df.sample(frac=0.9, random_state=42).copy()
valset = ratings_df.drop(trainset.index).copy()

In [13]:
trainset.head()

Unnamed: 0,userId,movieId,rating,timestamp
67037,432,77866,4.5,1335139641
42175,288,474,3.0,978465565
93850,599,4351,3.0,1498524542
6187,42,2987,4.0,996262677
12229,75,1610,4.0,1158989841


In [14]:
valset.head()

Unnamed: 0,userId,movieId,rating,timestamp
2,1,6,4.0,964982224
5,1,70,3.0,964982400
15,1,260,5.0,964981680
18,1,333,5.0,964981179
20,1,356,4.0,964980962


In [15]:
class MovieDataset(torch.utils.data.Dataset):
  def __init__(self, data):
    self.ratings = data.copy()

    users = data['userId'].unique()
    movies = data['movieId'].unique()


    self.useridx = {idx:userid for idx, userid in enumerate(users)}
    self.movieidx = {idx:movieid for idx, movieid in enumerate(movies)}


    self.userid = {i:o for o,i in self.useridx.items()}
    self.movieid = {i:o for o,i in self.movieidx.items()}


    self.ratings['userId'] = data['userId'].apply(lambda x: self.userid[x])
    self.ratings['movieId'] = data['movieId'].apply(lambda x: self.movieid[x])

    self.x = self.ratings.drop(['rating', 'timestamp'], axis=1).values
    self.y = self.ratings['rating'].values
    self.x, self.y = torch.tensor(self.x), torch.tensor(self.y)

  def __getitem__(self, idx):
    return (self.x[idx], self.y[idx])

  def __len__(self):
    return len(self.ratings)





In [92]:
class MFNet(nn.Module):
  def __init__(self, num_users, num_movies, embedding_dim=100, n_hidden=10):
    super(MFNet, self).__init__()
    self.user_embedding = nn.Embedding(num_users, embedding_dim)
    self.movie_embedding = nn.Embedding(num_movies, embedding_dim)
    # self.fc1 = nn.Linear(embedding_dim * 2, n_hidden)
    # self.fc2 = nn.Linear(n_hidden, 1)
    # self.dropout = nn.Dropout(0.1)
    self.user_embedding.weight.data.uniform_(0, 0.05)
    self.movie_embedding.weight.data.uniform_(0, 0.05)

  def forward(self, data):
    u, m = data[:, 0], data[:, 1]
    # U = self.user_embedding(u)
    # M = self.movie_embedding(m)
    # x = F.relu(torch.cat([U, M], dim=1))
    # x = self.dropout(x)
    # x = F.relu(self.fc1(x))
    # # x = self.dropout(x)
    # x = self.fc2(x)
    x = self.user_embedding(u) * self.movie_embedding(m)
    x = torch.sum(x, dim=1)
    return x

In [106]:

num_epochs = 20
lr = 0.001
wd = 0.0
embedding_dims = 100
n_hiddens = 10
loss_fn = nn.MSELoss()
model = MFNet(num_users, num_movies, embedding_dim=embedding_dims, n_hidden=n_hiddens)
model = model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

print(model)

train_data = MovieDataset(trainset)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=128, shuffle=True)

val_data = MovieDataset(valset)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=128, shuffle=True)


MFNet(
  (user_embedding): Embedding(610, 100)
  (movie_embedding): Embedding(9724, 100)
)


In [18]:
def train(model, epochs):
  for epoch in range(num_epochs):
    model.train()
    losses = []
    for x, y in train_loader:
      x, y = x.cuda(), y.cuda()
      optimizer.zero_grad()
      outputs = model(x)
      loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
      losses.append(loss.item())
      loss.backward()
      optimizer.step()
    print("epoch: ", epoch, "loss: ", np.mean(losses))


In [20]:
def eval(model):
    model.eval()
    losses = []
    for x, y in val_loader:
      x, y = x.cuda(), y.cuda()
      outputs = model(x)
      loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
      losses.append(loss.item())
      print("loss: ", loss.item())
    print("test loss: ", np.mean(losses))

In [107]:
train(model, num_epochs)

epoch:  0 loss:  5.4858977528990405
epoch:  1 loss:  1.27419572934109
epoch:  2 loss:  0.8988967975877404
epoch:  3 loss:  0.7865644380639403
epoch:  4 loss:  0.7416574511524653
epoch:  5 loss:  0.7197201133026558
epoch:  6 loss:  0.7073938247599285
epoch:  7 loss:  0.6924232537050341
epoch:  8 loss:  0.6731529037858266
epoch:  9 loss:  0.6406617199920632
epoch:  10 loss:  0.592029386540226
epoch:  11 loss:  0.527434581467732
epoch:  12 loss:  0.45439398646018403
epoch:  13 loss:  0.382256243687927
epoch:  14 loss:  0.31382325508110276
epoch:  15 loss:  0.25383938718665977
epoch:  16 loss:  0.20253228802859194
epoch:  17 loss:  0.16082151966597702
epoch:  18 loss:  0.12714657324073345
epoch:  19 loss:  0.10073338125337163


In [108]:
eval(model)

loss:  1.6786715984344482
loss:  1.6981756687164307
loss:  1.763780117034912
loss:  2.2730917930603027
loss:  1.54646897315979
loss:  1.681121826171875
loss:  1.6069040298461914
loss:  1.5434966087341309
loss:  1.760547399520874
loss:  1.9509716033935547
loss:  2.2680881023406982
loss:  1.7274203300476074
loss:  1.3093281984329224
loss:  1.6872563362121582
loss:  1.6619701385498047
loss:  1.6239629983901978
loss:  2.1593713760375977
loss:  1.5329278707504272
loss:  2.1366043090820312
loss:  1.5907119512557983
loss:  2.006399154663086
loss:  2.0374765396118164
loss:  2.0668559074401855
loss:  1.8010542392730713
loss:  1.7798173427581787
loss:  1.8038911819458008
loss:  1.603805422782898
loss:  1.8615398406982422
loss:  1.7163420915603638
loss:  1.97629714012146
loss:  1.5696935653686523
loss:  1.6072969436645508
loss:  1.8493506908416748
loss:  1.7977094650268555
loss:  1.6147434711456299
loss:  2.050804376602173
loss:  1.887909173965454
loss:  1.474381685256958
loss:  2.019522905349731

In [109]:
trained_movie_embeddings = model.movie_embedding.weight.data.cpu().numpy()
trained_movie_embeddings

array([[ 0.09981471,  0.03969651,  0.04462568, ...,  0.02695642,
        -0.03373118,  0.11653633],
       [ 0.18127294,  0.23674685,  0.0755353 , ...,  0.1446833 ,
         0.19424823,  0.03849046],
       [-0.10271139,  0.10922439,  0.00395446, ...,  0.08323725,
         0.02373117,  0.11831955],
       ...,
       [ 0.04078632,  0.03475622,  0.02918532, ...,  0.04378961,
         0.02712233,  0.02780466],
       [ 0.03600198,  0.02674091,  0.03288035, ...,  0.02753108,
         0.01831576,  0.01672152],
       [ 0.01950852,  0.02720421,  0.0326177 , ...,  0.00537694,
         0.02998632,  0.02516214]], dtype=float32)

In [110]:
clusters = KMeans(n_clusters=10).fit_predict(trained_movie_embeddings)
for cluster in range(10):
  print("\nCluster #{}".format(cluster))
  for idx in np.where(clusters == cluster)[0][:10]:
    print(movies_df.iloc[idx]['title'])


Cluster #0
Heat (1995)
Copycat (1995)
When Night Is Falling (1995)
Friday (1995)
Fair Game (1995)
Black Sheep (1996)
Bottle Rocket (1996)
Down Periscope (1996)
Up Close and Personal (1996)
Awfully Big Adventure, An (1995)

Cluster #1
Get Shorty (1995)
Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
Home for the Holidays (1995)
Young Poisoner's Handbook, The (1995)
Mallrats (1995)
Legends of the Fall (1994)
Nell (1994)
Pyromaniac's Love Story, A (1995)
Air Up There, The (1994)
Barcelona (1994)

Cluster #2
Money Train (1995)
City of Lost Children, The (Cité des enfants perdus, La) (1995)
Mortal Kombat (1995)
Mr. Holland's Opus (1995)
Two if by Sea (1996)
French Twist (Gazon maudit) (1995)
Kicking and Screaming (1995)
Crossing Guard, The (1995)
Hate (Haine, La) (1995)
City Hall (1996)

Cluster #3
Toy Story (1995)
Grumpier Old Men (1995)
Sabrina (1995)
Sudden Death (1995)
It Takes Two (1995)
Cry, the Beloved Country (1995)
Postman, The (Postino, Il) (1994)
From Dusk Till Dawn (1996)
Dunston Che