In [1]:
import pandas as pd
import numpy as np
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

In [2]:
path = "data/"
device = "cuda"
seed = 42

In [3]:
df = pd.read_csv(path + "ratings.csv")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
movie_id = df["movieId"].values
user_id = df["userId"].values
rating = df["rating"].values.astype(np.float32)

In [5]:
def norm_index(array_id):
    id_normed = {}
    ids = 0

    for i in range(0, len(array_id)):
        val = array_id[i]

        if val not in id_normed:
            id_normed[val] = ids
            ids += 1

        array_id[i] = id_normed[val] 
    
    return id_normed, array_id

# Re-index since they are not "compact"
movie_id_normed, movie_id = norm_index(movie_id)
user_id_normed, user_id = norm_index(user_id)

In [6]:
class ratings_dataset(Dataset):
    def __init__(self, movie_id, user_id, rating):
        self.len = len(movie_id)
        self.movie_id = torch.from_numpy(movie_id).to(device)
        self.user_id = torch.from_numpy(user_id).to(device)
        self.rating = torch.from_numpy(rating).to(device)

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        return self.movie_id[idx], self.user_id[idx], self.rating[idx]

In [7]:
np.random.seed(seed)
val_size = 0.2
idx = np.arange(0, len(rating))
np.random.shuffle(idx)

val_idx = idx[0:int(val_size * len(idx))]
train_idx = idx[int(val_size * len(idx)):]

In [8]:
batch_size = 128
train_set = ratings_dataset(movie_id[train_idx], user_id[train_idx], rating[train_idx])
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=0)

val_set = ratings_dataset(movie_id[val_idx], user_id[val_idx], rating[val_idx])
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, num_workers=0)

In [9]:
max_rating = torch.from_numpy(np.array([np.max(rating)])).to(device)
min_rating = torch.from_numpy(np.array([np.min(rating)])).to(device)

In [10]:
class collaborative_filtering(nn.Module):
    def __init__(self, n_factors, n_users, n_movies):
        super().__init__()
        self.u_weight = nn.Embedding(n_users, n_factors).to(device)
        self.u_weight.weight.data.uniform_(-0.01, 0.01)
        
        self.u_bias = nn.Embedding(n_users, 1).to(device)
        self.u_bias.weight.data.uniform_(-0.01, 0.01)
        
        self.m_weight = nn.Embedding(n_movies, n_factors).to(device)
        self.m_weight.weight.data.uniform_(-0.01, 0.01)
        
        self.m_bias = nn.Embedding(n_movies, 1).to(device)
        self.m_bias.weight.data.uniform_(-0.01, 0.01)
        
    def forward(self, users, movies):
        dot = (self.u_weight(users) * self.m_weight(movies)).sum(1)
        res = dot + self.u_bias(users).squeeze() + self.m_bias(movies).squeeze()
        res = F.sigmoid(res) * (max_rating - min_rating) + min_rating
        return res

In [11]:
torch.manual_seed(seed)
n_users = len(set(user_id))
n_movies = len(set(movie_id))
n_factors = 50

lr = 5e-3
num_epochs = 10
weight_decay = 1e-3

model = collaborative_filtering(n_factors, n_users, n_movies).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

In [12]:
torch.manual_seed(seed)
for epoch in range(0, num_epochs):
    for _, (movie_i, user_i, rating_target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(user_i, movie_i)
        loss = F.mse_loss(output, rating_target)
        loss.backward()
        optimizer.step()
    
    avg_loss = 0
    for _, (movie_i, user_i, rating_target) in enumerate(val_loader):
        output = model(user_i, movie_i)
        loss = F.mse_loss(output, rating_target)
        avg_loss += loss
        
    avg_loss = round(avg_loss.item() / len(train_loader), 4)
    print("Val loss at epoch {}: {}".format(epoch+1, avg_loss))

Val loss at epoch 1: 0.2319
Val loss at epoch 2: 0.2212
Val loss at epoch 3: 0.2193
Val loss at epoch 4: 0.2188
Val loss at epoch 5: 0.2189
Val loss at epoch 6: 0.2181
Val loss at epoch 7: 0.2186
Val loss at epoch 8: 0.2186
Val loss at epoch 9: 0.2186
Val loss at epoch 10: 0.2185
