In [None]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import math


In [None]:
%matplotlib inline

In [None]:
# Load the three MovieLens tables from CSV files in the working directory.
df_links, df_movies, df_ratings = (
    pd.read_csv(csv_path)
    for csv_path in ('links.csv', 'movies.csv', 'ratings.csv')
)

# Show all three frames as the cell output.
df_links, df_movies, df_ratings

In [None]:
# Load the tag table and build a vocabulary over its lower-cased third column.
df_tags = pd.read_csv('tags.csv')

lowercase_tags = [raw_tag.lower() for raw_tag in df_tags.iloc[:, 2].tolist()]
tags = np.unique(lowercase_tags).tolist()  # sorted and de-duplicated
tag_vocab = {tag: index for index, tag in enumerate(tags)}  # tag -> index
tag_itos = dict(enumerate(tags))  # index -> tag

print("unique tags:", len(tags))

df_tags.head()

In [None]:
# Normalise each movie's pipe-separated genre string once: lower-case, with
# spaces replaced by underscores.
genres_per_movie = [
    [name.lower().replace(' ', '_') for name in genre_str.split('|')]
    for genre_str in df_movies['genres']
]

# Append one zero-filled indicator column per genre, in order of first appearance.
for movie_genres in genres_per_movie:
    for genre in movie_genres:
        if genre not in df_movies.columns:
            df_movies.insert(loc=len(df_movies.columns), column=genre, value=0)

# Flag the genres of every movie; rows are addressed by the labels 0..len-1.
for row_label, movie_genres in enumerate(genres_per_movie):
    for genre in movie_genres:
        df_movies.at[row_label, genre] = 1

# The placeholder genre is not kept as a feature.
df_movies = df_movies.drop(columns=['(no_genres_listed)'])
df_movies.head()

In [None]:
# Preview the first rows of the ratings table.
df_ratings.head()

In [None]:
class MovielensDataset(Dataset):
    """Map-style dataset over a ratings table.

    Each item is the tuple of the first three columns of one row, which the
    surrounding code treats as (user id, movie id, rating).
    """

    def __init__(self, df_ratings):
        # Keep a reference to the frame; values are read lazily per item.
        self.df_ratings = df_ratings

    def __len__(self):
        # One sample per row of the ratings table.
        return len(self.df_ratings)

    def __getitem__(self, idx):
        # Scalar positional reads, so each value keeps its own column's dtype.
        user_id, movie_id, rating = (
            self.df_ratings.iloc[idx, column] for column in range(3)
        )
        return user_id, movie_id, rating
    
# Wrap the ratings frame and display the first sample together with the dataset size.
movielens_dataset = MovielensDataset(df_ratings)
movielens_dataset[0], len(movielens_dataset)

In [None]:
# Ask PyTorch's CUDA caching allocator to release cached memory it is not using.
torch.cuda.empty_cache()

# Classic matrix factorization

In [None]:
class MatrixFactorizationModel(torch.nn.Module):
    """Matrix factorization: a rating is the dot product of two embeddings.

    Args:
        num_users: number of rows in the user embedding table.
        num_movies: number of rows in the movie embedding table.
        num_factors: embedding dimensionality.
    """

    def __init__(self, num_users, num_movies, num_factors):
        super().__init__()
        self.user_embeddings = torch.nn.Embedding(num_users, num_factors)
        self.movie_embeddings = torch.nn.Embedding(num_movies, num_factors)

        # Draw both tables from U(0, x) so that the expected initial output is
        # about 2.5 (the target named in the original notes). Each entry has
        # mean x / 2, so a dot product over num_factors terms has mean
        # (x / 2)**2 * num_factors. Solving for 2.5 gives
        # x = 2 * sqrt(2.5 / num_factors).
        upper_bound = math.sqrt(2.5 / num_factors) * 2
        for table in (self.user_embeddings, self.movie_embeddings):
            table.weight.data.uniform_(0, upper_bound)

    def forward(self, user_id, movie_id):
        """Return the dot product of the looked-up user and movie embeddings."""
        elementwise = self.user_embeddings(user_id) * self.movie_embeddings(movie_id)
        return elementwise.sum(dim=-1)


# The embedding tables are indexed directly by raw id, hence max id + 1 rows.
n_users = df_ratings['userId'].max() + 1
n_movies = df_ratings['movieId'].max() + 1
model = MatrixFactorizationModel(n_users, n_movies, 50)

# Inspect the initialisation on the user/movie pair with index 1.
t = torch.tensor(1).unsqueeze(0)
pair_product = model.user_embeddings(t) * model.movie_embeddings(t)
print("mean of embeddings: ", model.user_embeddings.weight.data.mean().item())
print("mean of dot product: ", pair_product.mean(dim=-1).item())
print("mean of output: ", pair_product.sum(dim=-1).item())

In [None]:
# 80/20 train/validation split; the seeded generator makes the split reproducible.
split = int(len(movielens_dataset) * 0.8)
train_data, valid_data = torch.utils.data.random_split(
    movielens_dataset,
    [split, len(movielens_dataset) - split],
    generator=torch.Generator().manual_seed(42),
)

train_loader = DataLoader(train_data, batch_size=64, shuffle=True, num_workers=0)
# Evaluation gains nothing from shuffling; a fixed order keeps it deterministic.
valid_loader = DataLoader(valid_data, batch_size=64, shuffle=False, num_workers=0)

model = MatrixFactorizationModel(n_users, n_movies, 50)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

model.to(device)

lr = 3e-4
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# Multiply the learning rate by 0.9 after every epoch.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

def train():
    """Run one training epoch over `train_loader`.

    Returns:
        The mean of the per-batch MSE losses.
    """
    model.train()
    losses = []
    for user_ids, movie_ids, ratings in train_loader:
        # reshape(-1) always yields shape (batch,), including for a final batch
        # of size 1, where a bare squeeze() would collapse to a 0-dim tensor.
        y_hat = model(user_ids.to(device), movie_ids.to(device)).reshape(-1)
        loss = F.mse_loss(y_hat, ratings.float().to(device))
        loss.backward()
        losses.append(loss.item())
        optimizer.step()
        optimizer.zero_grad()

    scheduler.step()

    return np.mean(losses)

@torch.no_grad()
def validate():
    """Evaluate on `valid_loader` without gradients.

    Returns:
        The mean of the per-batch MSE losses.
    """
    model.eval()
    losses = []
    for user_ids, movie_ids, ratings in valid_loader:
        y_hat = model(user_ids.to(device), movie_ids.to(device)).reshape(-1)
        loss = F.mse_loss(y_hat, ratings.float().to(device))
        losses.append(loss.item())

    return np.mean(losses)

train_loss = []
valid_loss = []

for e in range(5):
    train_loss.append(train())
    valid_loss.append(validate())
    print(f'Epoch {e}, train loss: {train_loss[-1]}, valid loss: {valid_loss[-1]}')

In [None]:
  # Plot the per-epoch training and validation loss curves on one axis.
  fig, ax = plt.subplots()
  ax.plot(train_loss)
  ax.plot(valid_loss)
  ax.set_title('model loss')
  ax.set_ylabel('loss')
  ax.set_xlabel('epoch')
  ax.legend(['train', 'val'], loc='upper left')
  plt.show()

In [None]:
# Work with the first 50 rows of the movies table.
first_movies = df_movies[:50]
movies = first_movies['movieId'].tolist()
titles = first_movies['title'].tolist()

# Look up their learned embeddings (rows are indexed by raw movieId) and move them to the CPU.
movie_index = torch.tensor(movies).to(device)
movie_embeddings = model.movie_embeddings(movie_index).detach().cpu()

# Rank-2 low-rank PCA; the first returned factor supplies the 2-D coordinates.
t = torch.pca_lowrank(movie_embeddings, q=2)
X, Y = t[0][:, 0], t[0][:, 1]

fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
ax.scatter(X, Y)

# Label every point with its movie title.
for title, x_coord, y_coord in zip(titles, X, Y):
    ax.annotate(title, (x_coord, y_coord))

# Deep Collaborative Model

In [None]:
# Constant tensors: 19 entries of 0.1 next to 100 entries of 2.5.
# torch.full builds them directly instead of sampling noise and overwriting it.
a = torch.full((1, 19), 0.1)
b = torch.full((1, 100), 2.5)

# Mean of the concatenated vector.
print(torch.cat((a, b), dim=-1).mean())

In [None]:
class DeepCollaborativeFilteringModel(torch.nn.Module):
    """Collaborative filtering with an MLP on concatenated embeddings.

    Args:
        num_users: number of rows in the user embedding table.
        num_movies: number of rows in the movie embedding table.
        num_factors: embedding dimensionality; also the hidden layer width.
    """

    def __init__(self, num_users, num_movies, num_factors):
        super().__init__()
        self.user_embeddings = torch.nn.Embedding(num_users, num_factors)
        self.movie_embeddings = torch.nn.Embedding(num_movies, num_factors)

        mlp_layers = [
            torch.nn.Linear(num_factors * 2, num_factors),
            torch.nn.Dropout(p=0.2),
            torch.nn.ReLU(),
            torch.nn.Linear(num_factors, 1),
        ]
        self.ffnn = torch.nn.Sequential(*mlp_layers)

        # Kaiming-initialise the weight matrices; 1-D parameters (the biases)
        # keep their default initialisation.
        for param in self.ffnn.parameters():
            if param.dim() <= 1:
                continue
            torch.nn.init.kaiming_uniform_(param, nonlinearity="relu")

    def forward(self, user_id, movie_id):
        """Return predictions squashed into the open interval (0, 5.5)."""
        pair_features = torch.cat(
            (self.user_embeddings(user_id), self.movie_embeddings(movie_id)),
            dim=-1,
        )
        logits = self.ffnn(pair_features)
        return torch.sigmoid(logits) * 5.5

# Smoke test: mean prediction for a batch of 64 copies of the (user 1, movie 1) pair.
model = DeepCollaborativeFilteringModel(n_users, n_movies, 50)
ones_batch = torch.ones(64, dtype=int)
print("Mean: ", model(ones_batch, ones_batch).mean().item())

In [None]:
#
# Record the output of every sub-module (except ReLU) with forward hooks
#
activation = {}

def get_activation(name):
    """Return a forward hook that stores the module's detached output under `name`."""
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook

# Key each module by its qualified name (the root module has an empty name),
# so the printout and legend stay short and the two embeddings are distinguishable.
named_layers = {
    (name or 'model'): module
    for name, module in model.named_modules()
    if not isinstance(module, torch.nn.ReLU)
}
layers = list(named_layers.values())

# Keep the handles so the hooks can be removed again; otherwise every re-run of
# this cell would stack another set of hooks on the same model.
hook_handles = [
    module.register_forward_hook(get_activation(name))
    for name, module in named_layers.items()
]

# perform a forward pass
try:
    model(torch.arange(128).unsqueeze(0), torch.arange(128).unsqueeze(0))
finally:
    for handle in hook_handles:
        handle.remove()

#
# visualize histograms
#
plt.figure(figsize=(20, 4)) # width and height of the plot
legends = []
for i, key in enumerate(activation): # every hooked module, in the order the hooks fired
    t = activation[key]
    # "saturated" is the share of values with magnitude above 0.97.
    print('layer %d (%10s): mean %+.2f, std %.2f, saturated: %.2f%%' % (i, key, t.mean(), t.std(), (t.abs() > 0.97).float().mean()*100))
    hy, hx = torch.histogram(t, density=True)
    # hx holds the bin edges (one more than the counts); plot against the left edges.
    plt.plot(hx[:-1].detach(), hy.detach())
    legends.append(f'layer {i} ({key})')
plt.legend(legends);
plt.title('activation distribution')

In [None]:
# Ask PyTorch's CUDA caching allocator to release cached memory it is not using.
torch.cuda.empty_cache()

In [None]:
# 80/20 train/validation split; the seeded generator makes the split reproducible.
split = int(len(movielens_dataset) * 0.8)
train_data, valid_data = torch.utils.data.random_split(
    movielens_dataset,
    [split, len(movielens_dataset) - split],
    generator=torch.Generator().manual_seed(42),
)

train_loader = DataLoader(train_data, batch_size=128, shuffle=True, num_workers=0)
# Evaluation gains nothing from shuffling; a fixed order keeps it deterministic.
valid_loader = DataLoader(valid_data, batch_size=128, shuffle=False, num_workers=0)

# The embedding tables are indexed directly by raw id, hence max id + 1 rows.
n_users = df_ratings['userId'].max() + 1
n_movies = df_ratings['movieId'].max() + 1
model = DeepCollaborativeFilteringModel(n_users, n_movies, 100)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

model.to(device)

lr = 3e-3
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Multiply the learning rate by 0.9 after every epoch.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

def train():
    """Run one training epoch over `train_loader`.

    Returns:
        The mean of the per-batch MSE losses.
    """
    model.train()
    losses = []
    for user_ids, movie_ids, ratings in train_loader:
        y = ratings.float().to(device) #((ratings * 2) - 1).to(torch.int64).to(device)
        # The model returns shape (batch, 1). reshape(-1) flattens it to (batch,)
        # even for a final batch of size 1, where a bare squeeze() would
        # collapse to a 0-dim tensor.
        y_hat = model(user_ids.to(device), movie_ids.to(device)).reshape(-1)
        loss = F.mse_loss(y_hat, y)
        loss.backward()
        losses.append(loss.item())
        optimizer.step()
        optimizer.zero_grad()

    scheduler.step()

    return np.mean(losses)

@torch.no_grad()
def validate():
    """Evaluate on `valid_loader` without gradients.

    Returns:
        The mean of the per-batch MSE losses.
    """
    model.eval()
    losses = []
    for user_ids, movie_ids, ratings in valid_loader:
        y = ratings.float().to(device) # ((ratings * 2) - 1).to(torch.int64).to(device)
        y_hat = model(user_ids.to(device), movie_ids.to(device)).reshape(-1)
        loss = F.mse_loss(y_hat, y)
        losses.append(loss.item())

    return np.mean(losses)

train_loss = []
valid_loss = []

for e in range(5):
    train_loss.append(train())
    valid_loss.append(validate())
    print(f'Epoch {e}, train loss: {train_loss[-1]}, valid loss: {valid_loss[-1]}')

# Concatenating features and embeddings

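This section builds on the Wide & Deep model (https://arxiv.org/abs/1606.07792): in addition to the user and movie embeddings, the network now receives other features, such as the movie's categories (genres).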

<img src="https://github.com/Klingefjord/notebooks/blob/main/wide_n_deep.png?raw=1" height="400">

In [None]:
# Preview the first rows of the movies table.
df_movies.head()

In [None]:
class MovielensCategoriesDataset(Dataset):
    """Ratings dataset that also yields each movie's category vector.

    Args:
        df_ratings: frame whose first three columns are read positionally as
            user id, movie id and rating.
        df_movies: frame with a 'movieId' column; every column from position 3
            onwards is used as the movie's category vector.

    Note:
        The movieId -> categories lookup is built once at construction time
        (instead of scanning `df_movies` on every item access), so later
        changes to `df_movies` are not picked up by an existing dataset.
    """

    def __init__(self, df_ratings, df_movies):
        self.df_ratings = df_ratings
        self.df_movies = df_movies

        category_rows = df_movies.iloc[:, 3:].values
        self._categories_by_movie_id = {}
        for movie_id, row in zip(df_movies['movieId'].tolist(), category_rows):
            # setdefault keeps the first row if a movieId occurs more than once,
            # matching a "first matching row" lookup.
            self._categories_by_movie_id.setdefault(movie_id, row.tolist())

    def __len__(self):
        return len(self.df_ratings)

    def __getitem__(self, idx):
        """Return (user_id, movie_id, rating as float, categories tensor).

        Raises:
            KeyError: if the rating's movie id has no row in `df_movies`.
        """
        user_id = self.df_ratings.iloc[idx, 0]
        movie_id = self.df_ratings.iloc[idx, 1]
        rating = self.df_ratings.iloc[idx, 2]
        try:
            category_values = self._categories_by_movie_id[movie_id]
        except KeyError:
            raise KeyError(f'movieId {movie_id} has no row in df_movies') from None
        # torch.tensor copies, so callers cannot mutate the cached lists.
        categories = torch.tensor(category_values)
        return user_id, movie_id, float(rating), categories
    
# Build the dataset from the ratings and movies frames and display its first sample.
categories_dataset = MovielensCategoriesDataset(df_ratings, df_movies)
categories_dataset[0]

In [None]:
# Throwaway check of the output statistics of two stacked, default-initialised
# linear layers on random 119-dimensional inputs.
test = torch.nn.Sequential(
    torch.nn.Linear(119, 50),
    torch.nn.Linear(50, 1)
)

# Feed a batch: a single input yields a single output value, whose standard
# deviation is NaN.
test_out = test(torch.randn(1024, 119))
print(test_out.mean(), test_out.std())

In [None]:
class MovielensCategoriesModel(torch.nn.Module):
    """Embedding interaction plus category features, fed through a small MLP.

    Args:
        num_users: number of rows in the user embedding table.
        num_movies: number of rows in the movie embedding table.
        num_factors: embedding dimensionality; also the hidden layer width.
        num_categories: length of the per-movie category vector.
    """

    def __init__(self, num_users, num_movies, num_factors, num_categories):
        super().__init__()
        self.user_embeddings = torch.nn.Embedding(num_users, num_factors)
        self.movie_embeddings = torch.nn.Embedding(num_movies, num_factors)

        self.ffnn = torch.nn.Sequential(
            # forward() feeds the element-wise embedding product (num_factors
            # values) concatenated with the category vector (num_categories
            # values), so that sum is the input width.
            torch.nn.Linear(num_factors + num_categories, out_features=num_factors),
            torch.nn.Dropout(p=0.2),
            torch.nn.ReLU(),
            torch.nn.Linear(num_factors, 1)
        )

        # Kaiming-initialise the weight matrices; biases keep their defaults.
        for p in self.ffnn.parameters():
            if p.dim() > 1:
                torch.nn.init.kaiming_uniform_(p, nonlinearity="relu")

    def forward(self, user_id, movie_id, categories):
        """Return predictions squashed into the open interval (0, 5.5).

        `categories` must share its leading dimensions with the embeddings and
        have `num_categories` entries in its last dimension.
        """
        user_embedding = self.user_embeddings(user_id)
        movie_embedding = self.movie_embeddings(movie_id)
        interaction = user_embedding * movie_embedding
        # Category indicators may arrive as integers; match the embedding dtype
        # before concatenating.
        categories = categories.to(interaction.dtype)
        x = torch.cat([interaction, categories], dim=-1)
        x = self.ffnn(x)
        return torch.sigmoid(x) * 5.5

# Smoke test on 128 (user, movie) pairs with all-zero category vectors.
# The categories tensor is (batch, num_categories), so it can be concatenated
# with the (batch, num_factors) embedding product along the last dimension.
model = MovielensCategoriesModel(n_users, n_movies, 100, 19)
y_hat = model(torch.arange(128), torch.arange(128), torch.zeros(128, 19))
y_hat.mean().item(), y_hat.std().item()

In [None]:
# softmax is only implemented for floating-point tensors, so use float scores.
scores = torch.tensor([0.0, 0.0, 1.0, 1.0])
torch.softmax(scores, dim=0)

In [None]:
#
# Record the output of every sub-module (except ReLU) with forward hooks
#
activation = {}

def get_activation(name):
    """Return a forward hook that stores the module's detached output under `name`."""
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook

model = MovielensCategoriesModel(n_users, n_movies, 50, 19)
model.eval()

# Key each module by its qualified name (the root module has an empty name),
# so the printout and legend stay short and the two embeddings are distinguishable.
named_layers = {
    (name or 'model'): module
    for name, module in model.named_modules()
    if not isinstance(module, torch.nn.ReLU)
}
layers = list(named_layers.values())

# Keep the handles so the hooks are removed again once the activations are captured.
hook_handles = [
    module.register_forward_hook(get_activation(name))
    for name, module in named_layers.items()
]

# perform a forward pass
try:
    model(torch.arange(256), torch.arange(256), torch.randn(256, 19))
finally:
    for handle in hook_handles:
        handle.remove()

#
# visualize histograms
#
plt.figure(figsize=(20, 4)) # width and height of the plot
legends = []
for i, key in enumerate(activation): # every hooked module, in the order the hooks fired
    t = activation[key]
    # "saturated" is the share of values with magnitude above 0.97.
    print('layer %d (%10s): mean %+.2f, std %.2f, saturated: %.2f%%' % (i, key, t.mean(), t.std(), (t.abs() > 0.97).float().mean()*100))
    hy, hx = torch.histogram(t, density=True)
    # hx holds the bin edges (one more than the counts); plot against the left edges.
    plt.plot(hx[:-1].detach(), hy.detach())
    legends.append(f'layer {i} ({key})')
plt.legend(legends);
plt.title('activation distribution')

In [None]:
# A seeded generator only affects the split if it is passed to random_split.
split_generator = torch.Generator().manual_seed(42)

split = int(len(categories_dataset) * 0.8)
train_data, valid_data = torch.utils.data.random_split(
    categories_dataset,
    [split, len(categories_dataset) - split],
    generator=split_generator,
)

#train_data = torch.utils.data.Subset(train_data, torch.arange(100))
#valid_data = torch.utils.data.Subset(valid_data, torch.arange(80))

train_loader = DataLoader(train_data, batch_size=128, shuffle=True, num_workers=0)
# Evaluation gains nothing from shuffling; a fixed order keeps it deterministic.
valid_loader = DataLoader(valid_data, batch_size=128, shuffle=False, num_workers=0)

# The embedding tables are indexed directly by raw id, hence max id + 1 rows.
n_users = df_ratings['userId'].max() + 1
n_movies = df_ratings['movieId'].max() + 1
model = MovielensCategoriesModel(n_users, n_movies, 100, 19)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

lr = 3e-3
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Multiply the learning rate by 0.9 after every epoch (stepped at the end of train()).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)



def train():
    """Run one training epoch over `train_loader`.

    Returns:
        The mean of the per-batch MSE losses.
    """
    model.train()
    losses = []

    for user_ids, movie_ids, ratings, categories in train_loader:
        # reshape(-1) flattens the predictions to (batch,), even for a final
        # batch of size 1 where a bare squeeze() would collapse to 0-dim.
        y_hat = model(user_ids.to(device), movie_ids.to(device), categories.to(device)).reshape(-1)
        loss = F.mse_loss(y_hat, ratings.float().to(device))
        loss.backward()
        losses.append(loss.item())
        optimizer.step()
        optimizer.zero_grad()

    # Without this call the StepLR schedule would never take effect.
    scheduler.step()

    return np.mean(losses)

@torch.no_grad()
def validate():
    """Evaluate on `valid_loader` without gradients.

    Returns:
        The mean of the per-batch MSE losses.
    """
    model.eval()
    losses = []
    for user_ids, movie_ids, ratings, categories in valid_loader:
        y_hat = model(user_ids.to(device), movie_ids.to(device), categories.to(device)).reshape(-1)
        loss = F.mse_loss(y_hat, ratings.float().to(device))
        losses.append(loss.item())

    return np.mean(losses)

train_loss = []
valid_loss = []

for e in range(5):
    train_loss.append(train())
    valid_loss.append(validate())
    print(f'Epoch {e}, train loss: {train_loss[-1]}, valid loss: {valid_loss[-1]}')

In [None]:
# Plot the per-epoch training and validation loss curves on one axis.
fig, ax = plt.subplots()
ax.plot(train_loss)
ax.plot(valid_loss)
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc='upper left')
plt.show()