In [None]:
import os

if not os.path.exists("/content/data"):

  # Mount your Google Drive.
  from google.colab import drive
  drive.mount("/content/drive")

  # kaggle_creds_path = "/content/drive/Colab Notebooks"

  ! pip install kaggle --quiet

  ! mkdir ~/.kaggle
  ! cp "/content/drive/MyDrive/Colab Notebooks/kaggle.json" ~/.kaggle/
  ! chmod 600 ~/.kaggle/kaggle.json

  ! kaggle datasets download -d hernan4444/anime-recommendation-database-2020
  ! mkdir data
  ! unzip anime-recommendation-database-2020.zip -d data

  # Unmount your Google Drive
  # drive.flush_and_unmount()

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the user/anime ratings table. The later tensor conversion assumes the
# columns are ['user_id', 'anime_id', 'rating'] in that order.
df = pd.read_csv('data/rating_complete.csv')

In [None]:
# counts = df['user_id'].value_counts()
# (counts > 100).sum()
# (df['anime_id'].value_counts() > 100000).sum()
# df['anime_id'].nunique()

In [None]:
# Set to True to keep only heavy users / popular animes (shrinks the data enough
# for the dense rating-matrix experiment at the bottom of this cell).
FILTER_SPARSE = False

if FILTER_SPARSE:
  user_min_ratings = 500
  anime_min_ratings = 10000

  # Filter out users with less than user_min_ratings ratings
  user_counts = df['user_id'].value_counts()
  filtered_users = user_counts[user_counts >= user_min_ratings].index
  df = df[df['user_id'].isin(filtered_users)]

  # Filter out animes with less than anime_min_ratings ratings
  anime_counts = df['anime_id'].value_counts()
  filtered_animes = anime_counts[anime_counts >= anime_min_ratings].index
  # .copy() makes the result an independent frame, so the column assignments
  # below do not trigger pandas' SettingWithCopyWarning.
  df = df[df['anime_id'].isin(filtered_animes)].copy()

# Remap user_id and anime_id to dense 0-based indices (order of first appearance)
# so they can be used directly as embedding row numbers.
# NOTE: this overwrites the id columns in place. Re-running this cell without
# reloading `df` rebuilds both mappings from the already remapped ids (an
# identity map), which breaks the later MAL_ID -> anime_id lookup.
user_id_mapping = {raw_id: i for i, raw_id in enumerate(df['user_id'].unique())}
anime_id_mapping = {raw_id: i for i, raw_id in enumerate(df['anime_id'].unique())}
df['user_id'] = df['user_id'].map(user_id_mapping)
df['anime_id'] = df['anime_id'].map(anime_id_mapping)

# Rating matrix TOO BIG FOR UNFILTERED DATASET
# rating_matrix = df.pivot(index='user_id', columns='anime_id', values='rating') #.fillna(0)
# x = rating_matrix.to_numpy()

In [None]:
import torch
import torch.nn as nn
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset size and the number of distinct users / animes (embedding table sizes).
n = len(df)
n_users = df['user_id'].nunique()
n_animes = df['anime_id'].nunique()

# Fixed-seed random ordering of the row positions; train_val_test_split slices it.
np.random.seed(42)
split = np.random.permutation(n)

# columns ['user_id', 'anime_id', 'rating']
xy = torch.from_numpy(df.to_numpy()).to(device)
x = xy[:, :2]
y = xy[:, 2].float() * 0.1  # RESCALE to [0, 1]

def train_val_test_split(t):
  """Split `t` along dim 0 into 80% / 10% / 10% train, validation and test parts.

  Rows are selected through the module-level permutation `split` (built for `n`
  rows), so every tensor passed through this function is partitioned the same way.

  :param t: indexable with an integer array along its first dimension.
  :returns: tuple (t_train, t_val, t_test).
  """
  train_end, val_end = int(n*0.8), int(n*0.9)
  train_idx = split[:train_end]
  val_idx = split[train_end:val_end]
  test_idx = split[val_end:]
  return t[train_idx], t[val_idx], t[test_idx]

# Both calls use the same global permutation, so features and targets stay
# row-aligned within each of the three subsets.
x_train, x_val, x_test = train_val_test_split(x)
y_train, y_val, y_test = train_val_test_split(y)

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# from https://discuss.pytorch.org/t/dataloader-much-slower-than-manual-batching/27014
class FastTensorDataLoader:
    """
    A DataLoader-like object for a set of tensors that can be much faster than
    TensorDataset + DataLoader because dataloader grabs individual indices of
    the dataset and calls cat (slow).

    Iterating yields tuples with one slice per stored tensor; the last batch
    may be shorter than `batch_size`.
    """
    def __init__(self, *tensors, batch_size=32, shuffle=False):
        """
        Initialize a FastTensorDataLoader.

        :param *tensors: tensors to store. Must have the same length @ dim 0.
        :param batch_size: batch size to load (positive integer).
        :param shuffle: if True, draw a fresh random permutation of the rows
            whenever an iterator is created out of this object. The stored
            tensors themselves are not modified.

        :raises ValueError: if no tensor is given or batch_size is not positive.

        :returns: A FastTensorDataLoader.
        """
        if not tensors:
            raise ValueError("FastTensorDataLoader needs at least one tensor")
        if batch_size <= 0:
            # A zero size breaks the batch count; a negative one would never
            # advance the cursor past the end of the data.
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        assert all(t.shape[0] == tensors[0].shape[0] for t in tensors)
        self.tensors = tensors

        self.dataset_len = self.tensors[0].shape[0]
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Calculate # batches (ceiling division: a partial last batch counts)
        n_batches, remainder = divmod(self.dataset_len, self.batch_size)
        if remainder > 0:
            n_batches += 1
        self.n_batches = n_batches

    def __iter__(self):
        if self.shuffle:
            # Build the permutation on the data's device; CPU indices cannot be
            # used to index_select a CUDA tensor.
            self.indices = torch.randperm(self.dataset_len, device=self.tensors[0].device)
        else:
            self.indices = None
        self.i = 0
        return self

    def __next__(self):
        if self.i >= self.dataset_len:
            raise StopIteration
        stop = self.i + self.batch_size
        if self.indices is not None:
            indices = self.indices[self.i:stop]
            # .to() is a no-op when the tensor shares the indices' device
            batch = tuple(torch.index_select(t, 0, indices.to(t.device)) for t in self.tensors)
        else:
            batch = tuple(t[self.i:stop] for t in self.tensors)
        self.i = stop
        return batch

    def __len__(self):
        """Number of batches produced per pass over the data."""
        return self.n_batches

# Batch size shared by the three loaders below
batch_size = 1_000_000
# shuffle=False everywhere: each loader walks its tensors in stored order on every pass
train_dataloader = FastTensorDataLoader(x_train, y_train, batch_size=batch_size, shuffle=False)
val_dataloader = FastTensorDataLoader(x_val, y_val, batch_size=batch_size, shuffle=False)
test_dataloader = FastTensorDataLoader(x_test, y_test, batch_size=batch_size, shuffle=False)

In [None]:
# Baseline: predict the mean training rating for every test sample
mse = nn.functional.mse_loss(y_train.mean().expand_as(y_test), y_test)
print(f"Test MSE for using average rating as prediction: {mse}")

In [None]:
class NaiveLatentFactorModel(nn.Module):
    """Latent-factor model that scores a (user, anime) pair by an embedding dot product.

    "Naive" because every forward pass materialises the complete
    n_users x n_animes score matrix and only then picks the requested entries.
    """

    def __init__(self, n_users, n_animes, embedding_dim):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, embedding_dim)
        self.anime_emb = nn.Embedding(n_animes, embedding_dim)

    def forward(self, idx):
        """Return one score per row of `idx`, whose columns are (user index, anime index)."""
        users, animes = idx[:, 0], idx[:, 1]

        # Scores for every user/anime combination
        all_scores = torch.matmul(self.user_emb.weight, self.anime_emb.weight.t())

        # Keep only the requested pairs
        return all_scores[users, animes]

In [None]:
class LatentFactorModel(nn.Module):
    """Matrix-factorisation rating model with per-user and per-anime biases.

    prediction = sigmoid(<user_vec, anime_vec> / embedding_dim
                         + user_bias + anime_bias + final_bias)

    The sigmoid keeps predictions in (0, 1).
    """

    def __init__(self, n_users, n_animes, embedding_dim):
        """
        :param n_users: number of rows in the user embedding table.
        :param n_animes: number of rows in the anime embedding table.
        :param embedding_dim: size of each latent vector.
        """
        super().__init__()

        self.embedding_dim = embedding_dim
        self.user_emb = nn.Embedding(num_embeddings=n_users, embedding_dim=embedding_dim)
        self.anime_emb = nn.Embedding(num_embeddings=n_animes, embedding_dim=embedding_dim)

        self.user_bias = nn.Parameter(torch.zeros(n_users))
        self.anime_bias = nn.Parameter(torch.zeros(n_animes))

        # dot_scale is not used by forward() at the moment (the dot product is
        # divided by embedding_dim instead). It stays registered because the
        # optimizer parameter groups in training_loop reference it and it is
        # part of the state dict.
        self.dot_scale = nn.Parameter(torch.ones(1) * (1/embedding_dim))
        self.final_bias = nn.Parameter(torch.zeros(1))

    def forward(self, idx):
        """Predict one rating per row of `idx`.

        :param idx: (N, 2) integer indices, columns (user index, anime index).
            May be a tensor, a NumPy array or a nested list; it is moved to the
            device of the embedding tables when necessary.
        :returns: tensor of shape (N,) with values in (0, 1).
        """
        # torch.from_numpy alone would leave NumPy input on the CPU and fail
        # against a CUDA model; as_tensor converts and places it in one step.
        idx = torch.as_tensor(idx, device=self.user_emb.weight.device)
        user_idx, anime_idx = idx[:, 0], idx[:, 1]

        user_vectors = self.user_emb(user_idx)
        anime_vectors = self.anime_emb(anime_idx)

        # Fixed 1/embedding_dim scaling (alternative: the learnable self.dot_scale)
        out = (user_vectors * anime_vectors).sum(dim=1) / self.embedding_dim

        out = out + self.user_bias[user_idx] + self.anime_bias[anime_idx]
        out = out + self.final_bias

        return torch.sigmoid(out)

In [None]:
from tqdm.notebook import tqdm

def training_loop(model, num_epochs):
  """Train `model` with AdamW on MSE and print validation metrics every 5 epochs.

  Batches come from the module-level `train_dataloader` / `val_dataloader`.
  The reported train loss is the loss of the last training batch of the epoch;
  the validation loss is the mean of the per-batch validation losses.

  NOTE: a later cell defines `training_loop` again; once that cell has run,
  its definition replaces this one.
  """
  # Weight decay is applied to the embedding tables only.
  param_groups = [
    {'params': model.user_emb.parameters(), 'weight_decay': 0.1},
    {'params': model.anime_emb.parameters(), 'weight_decay': 0.1},
    {'params': [model.user_bias, model.anime_bias], 'weight_decay': 0.0},
    {'params': [model.dot_scale, model.final_bias], 'weight_decay': 0},
  ]
  optimizer = torch.optim.AdamW(param_groups, lr=0.03)

  for epoch in tqdm(range(num_epochs), desc="Epochs"):
    # Optimisation pass over the training batches
    for inputs, targets in tqdm(train_dataloader, desc="Steps", leave=False):
      optimizer.zero_grad()
      loss = nn.functional.mse_loss(model(inputs), targets)
      loss.backward()
      optimizer.step()

    # Gradient-free validation pass
    with torch.no_grad():
      batch_mses = [
        nn.functional.mse_loss(model(inputs), targets)
        for inputs, targets in val_dataloader
      ]
      val_loss = torch.tensor(batch_mses).mean()

    if epoch % 5 == 0:
      print(f"train loss: {loss.item()}, val loss: {val_loss.item()}, val rmse: {torch.sqrt(val_loss).item()}")

  # Always report the final state, whatever the epoch count
  print(f"train loss: {loss.item()}, val loss: {val_loss.item()}, val rmse: {torch.sqrt(val_loss).item()}")

In [None]:
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

def training_loop(model, num_epochs):
    """Train `model` with AdamW on MSE, tracking and plotting per-epoch losses.

    Batches come from the module-level `train_dataloader` / `val_dataloader`.
    Epoch losses are sample-weighted means, so a shorter final batch does not
    count as much as a full one. This definition shadows the `training_loop`
    defined in the previous cell.

    :param model: module exposing user_emb, anime_emb, user_bias, anime_bias,
        dot_scale and final_bias (used to build the optimizer groups).
    :param num_epochs: number of passes over the training data.
    """
    # Weight decay is applied to the embedding tables only.
    #optimizer = torch.optim.AdamW(model.parameters(), lr=0.3, weight_decay=0.1)
    optimizer = torch.optim.AdamW([
        {'params': model.user_emb.parameters(), 'weight_decay': 0.1},
        {'params': model.anime_emb.parameters(), 'weight_decay': 0.1},
        {'params': [model.user_bias, model.anime_bias], 'weight_decay': 0.0},
        {'params': [model.dot_scale, model.final_bias], 'weight_decay': 0}  # min lr = 0.03, max = 0.5
    ], lr=0.03) # min lr 0.003

    train_losses = []
    val_losses = []

    for epoch in tqdm(range(num_epochs), desc="Epochs"):
        train_loss_sum, train_count = 0.0, 0
        for x_batch, y_batch in tqdm(train_dataloader, desc="Steps", leave=False):
            optimizer.zero_grad()
            y_pred = model(x_batch)
            loss = nn.functional.mse_loss(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            # mse_loss is a per-batch mean; scale by batch size for a true epoch mean
            train_loss_sum += loss.item() * len(y_batch)
            train_count += len(y_batch)

        # sample-weighted mean training loss for the epoch
        train_losses.append(train_loss_sum / train_count)

        # Validation
        with torch.no_grad():
            val_loss_sum, val_count = 0.0, 0
            for x_batch, y_batch in val_dataloader:
                y_pred = model(x_batch)
                val_loss = nn.functional.mse_loss(y_pred, y_batch)
                val_loss_sum += val_loss.item() * len(y_batch)
                val_count += len(y_batch)
            # sample-weighted mean validation loss for the epoch
            val_losses.append(val_loss_sum / val_count)

        if epoch % 5 == 0 or epoch == num_epochs - 1:
            print(f"Epoch {epoch}: train loss: {train_losses[-1]}, val loss: {val_losses[-1]}")

    # plot the training and validation losses
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Training loss')
    plt.plot(val_losses, label='Validation loss')
    plt.title('Training and Validation Losses Over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [None]:
# Seed torch so the randomly initialised embeddings (and therefore the training
# run) are reproducible; NumPy is already seeded for the data split.
torch.manual_seed(42)
model = LatentFactorModel(n_users, n_animes, embedding_dim=32).to(device)

In [None]:
# Train for 25 epochs
training_loop(model, 25)

In [None]:
# Pull the learned anime embedding table off the device as a NumPy array
anime_emb = model.anime_emb.weight.detach().cpu().numpy()

In [None]:
# Folder under the Drive mount point used below for the saved model, the
# exported embedding matrix and the figure
base_path = '/content/drive/MyDrive/tmp'

In [None]:
# Persist the trained parameters (state dict only, not the module object)
torch.save(model.state_dict(), f'{base_path}/model_full_emb32.pth')

In [None]:
# Reload the checkpoint and export the anime embedding table as a .npy file.
# map_location='cpu' lets this work on a CPU-only runtime even when the
# checkpoint was saved from CUDA tensors; the data is converted to NumPy anyway.
state_dict = torch.load(f'{base_path}/model_full_emb32.pth', map_location='cpu')
anime_emb_matrix = state_dict['anime_emb.weight'].cpu().numpy()
np.save(f'{base_path}/anime_emb32.npy', anime_emb_matrix)

In [None]:
import numpy as np
# Replace the in-memory anime_emb with the copy exported to disk
anime_emb = np.load(f'{base_path}/anime_emb32.npy')

In [None]:
# Keep an independent copy: a plain assignment would only alias the array, so
# any in-place change to anime_emb would also change the "backup".
backup_ae = anime_emb.copy()
anime_emb.shape

In [None]:
# anime_emb = np.random.rand(backup_ae.shape[0], backup_ae.shape[1]) -0.5  # for sanity check using random values

In [None]:
# Inspect the shape of the embedding matrix currently bound to anime_emb
anime_emb.shape

In [None]:
# Per-anime metadata; the MAL_ID and Genres columns are used below
anime_df = pd.read_csv('data/anime.csv')

In [None]:
# Translate MAL ids to the dense indices used by the model, drop animes that
# never appear in the ratings, and order rows by that index so that row i of
# anime_df describes row i of the embedding matrix.
anime_df['anime_id'] = anime_df['MAL_ID'].map(anime_id_mapping)
anime_df = anime_df.dropna(subset=['anime_id'])
anime_df = anime_df.sort_values(['anime_id']).reset_index(drop=True)

# The genre masks built from anime_df are used to index the embedding matrix,
# so fail here with a clear message if the rows are not exactly 0..N-1.
assert np.array_equal(anime_df['anime_id'].to_numpy(), np.arange(len(anime_id_mapping))), \
    "anime_df rows do not line up with the dense anime ids (missing or duplicated MAL_ID?)"

In [None]:
# Count the animes whose Genres string contains 'Action'
anime_df['Genres'].str.contains('Action').to_numpy().sum()

In [None]:
def genre_mask(name, df=None):
    """Return a boolean NumPy mask of the rows whose 'Genres' string contains `name`.

    :param name: pattern passed to `Series.str.contains` (regex semantics, as before).
    :param df: frame with a 'Genres' column; defaults to the module-level `anime_df`.
    :returns: 1-D bool array with one entry per row of the frame.
    """
    frame = anime_df if df is None else df
    # na=False: rows with a missing Genres value count as "no match" instead of
    # producing NaN, which would turn the mask into an object array that cannot
    # be used for boolean indexing.
    return frame['Genres'].str.contains(name, na=False).to_numpy(dtype=bool)

In [None]:
# Unit-length mean embedding ("direction") of the Action and Romance animes
action_emb, romance_emb = (
    centroid / np.linalg.norm(centroid)
    for centroid in (
        anime_emb[genre_mask(genre)].mean(axis=0) for genre in ('Action', 'Romance')
    )
)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Scatter every anime by its projection onto the unit Action direction (x) and
# the unit Romance direction (y), coloured by genre membership.
# NOTE: this rebinds the module-level names x and y, which the training cells
# use for the feature / target tensors.
x = anime_emb @ action_emb
y = anime_emb @ romance_emb
action_mask = genre_mask('Action')
romance_mask = genre_mask('Romance')

# One RGB row per anime; default for animes in neither genre
c = np.full((anime_emb.shape[0], 3), [0.5, 0.5, 0.5])  # Gray color

c[action_mask & ~romance_mask] = [1, 0, 0]  # Red
c[~action_mask & romance_mask] = [0, 0, 1]  # Blue
c[action_mask & romance_mask] = [0/255, 180/255, 0/255]  # Green

fig, ax = plt.subplots(figsize=(4, 4))

ax.scatter(x, y, c=c, s=1)

# Make the left/bottom spines cross at the data origin
ax.spines["left"].set_position(("data", 0))
ax.spines["bottom"].set_position(("data", 0))

ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

# Arrow heads at the positive end of each axis: the y-axis transform takes x in
# axes coordinates and y in data coordinates (and vice versa for the x-axis one)
ax.plot(1, 0, ">k", transform=ax.get_yaxis_transform(), clip_on=False)
ax.plot(0, 1, "^k", transform=ax.get_xaxis_transform(), clip_on=False)

ax.set_xticks([])
ax.set_yticks([])

# Axis labels placed by hand; -1.4 and 0.1 are data coordinates, so they depend
# on the range of the plotted projections
ax.text(1.05, -1.4, 'Action', transform=ax.get_yaxis_transform(), ha='right', va='center')
ax.text(0.1, 0.9, 'Romance', transform=ax.get_xaxis_transform(), ha='center', va='bottom')

# NOTE(review): the file name does not match the Action/Romance genres plotted
# here; it looks like a leftover from another genre pair. Confirm before reuse.
plt.savefig(f'{base_path}/school_hentai_plot.png', format='png', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
import sklearn
# Import the class explicitly: a bare `import sklearn` does not guarantee that
# the `sklearn.decomposition` submodule has been loaded.
from sklearn.decomposition import PCA

# Project the embeddings onto their first two principal components.
# random_state pins the result in case the solver chosen by PCA is a randomized
# one (this depends on the scikit-learn version and the data shape).
reduced_data = PCA(n_components=2, random_state=42).fit_transform(anime_emb)

In [None]:
# 2-D PCA view of the anime embeddings, coloured by 'Magic' genre membership
plt.figure(figsize=(8, 8))
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=genre_mask('Magic'), s=1)
# Symmetric axis limit; previously defined but not used by the hard-coded limits
l = 5
plt.xlim(-l, l)
plt.ylim(-l, l)
plt.title("PCA of anime embeddings (colour: 'Magic' genre)")
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.show()

In [None]:
from sklearn.cluster import KMeans

# Candidate cluster counts: 1 through 19
k_values = range(1, 20)

# Fit one KMeans model per k on the anime embeddings and record its inertia
# (the within-cluster sum of squared distances) for the elbow plot below
inertia = []
for k in k_values:
    # Fixed random_state and n_init so repeated runs give the same curve
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(anime_emb)
    inertia.append(kmeans.inertia_)

# Inertia as a function of k
plt.figure(figsize=(10, 6))
plt.plot(k_values, inertia, marker='o')
plt.title('Elbow Method For Optimal k')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.xticks(k_values)
plt.grid(True)
plt.show()