In [None]:
import sqlite3
import pickle
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from glob import glob
from tqdm import tqdm
from IPython.core.debugger import set_trace
from torch.utils.data import Dataset, DataLoader, ConcatDataset, random_split

In [None]:
# Seed every RNG the notebook relies on. NumPy alone is not enough: the
# train/test split, weight initialisation, dropout and DataLoader shuffling
# all draw from torch's generator.
np.random.seed(42)
torch.manual_seed(42)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the id -> index lookup tables. Later cells read the 'anime2idx' and
# 'user2idx' entries of this object.
# NOTE(security): pickle.load can execute arbitrary code; only load files you trust.
# The context manager closes the file handle, which the bare open() call leaked.
with open('datasets/user_anime_ratings_mapping.pkl', 'rb') as mapping_file:
    mapping = pickle.load(mapping_file)
mapping.keys()

In [None]:
# Hard-coded sizes; presumably the number of distinct users / anime in the
# ratings data (not derived from `mapping` here -- TODO confirm).
num_users = 108711
num_anime = 6668

# Mini-batch size used by the DataLoaders (and as Net's default batch_size).
batch_size = 1024

In [None]:
# Pre-trained user embeddings. Later cells call .cpu().detach().numpy() on this
# object, so it is expected to be a torch tensor.
# NOTE(security): pickle.load can execute arbitrary code; only load files you trust.
# The context manager closes the file handle, which the bare open() call leaked.
with open('user_embed_pytorch_nn_epoch4_embedding_fix_10.23-10.21.pkl', 'rb') as embedding_file:
    user_embeddings = pickle.load(embedding_file)
user_embeddings.shape

In [None]:
# Import PCA from the public package path: `sklearn.decomposition.pca` was a
# private module that no longer exists in current scikit-learn releases.
from sklearn.decomposition import PCA

# Project the user embeddings onto their first two principal components.
# These 2-D coordinates are the regression targets used later in the notebook.
pca = PCA(n_components=2)
pca.fit(user_embeddings.cpu().detach().numpy())

In [None]:
# Pre-trained anime embeddings. Later cells call .cpu().detach().numpy() on this
# object, so it is expected to be a torch tensor.
# NOTE(security): pickle.load can execute arbitrary code; only load files you trust.
# The context manager closes the file handle, which the bare open() call leaked.
with open('anime_embed_pytorch_nn_epoch4_embedding_fix_10.23-10.21.pkl', 'rb') as embedding_file:
    anime_embeddings = pickle.load(embedding_file)
anime_embeddings.shape

In [None]:
# Every file matching the pattern, in lexicographic order, so the order of the
# per-file datasets does not depend on the filesystem's listing order.
user_grouped_rating_files = sorted(glob('datasets/user_grouped_anime_ratings.gz'))

In [None]:
from functools import partial

def input_anime_embeddings(record, mapping, anime_embeddings):
    """Build the network input vector for one user's rating history.

    Three vectors of the embedding width are concatenated, in this order:

    1. embeddings weighted by rating, divided by the sum of the ratings;
    2. embeddings weighted by ``10 - rating``, divided by the sum of
       ``10 - rating``;
    3. embeddings weighted by rating, divided by the number of rated anime.

    Args:
        record: Mapping-like object with ``'anime_id'`` (iterable of anime ids)
            and ``'my_score'`` (ratings, one per anime id).
        mapping: Dict whose ``'anime2idx'`` entry translates an anime id into a
            row index of ``anime_embeddings``.
        anime_embeddings: 2-D NumPy array with one embedding per row.

    Returns:
        1-D ``float32`` array of length ``3 * anime_embeddings.shape[1]``.
        An empty history yields all zeros instead of NaN.

    Raises:
        KeyError: If an anime id is missing from ``mapping['anime2idx']``.
    """
    anime_rows = np.array(
        [mapping['anime2idx'][anime] for anime in record['anime_id']], dtype=np.int32
    )
    # Ratings are stored as int16, so fractional scores are truncated.
    ratings = np.array(record['my_score'], dtype=np.int16)
    inverse_ratings = 10 - ratings

    # Every denominator falls back to 1 when it would be 0 (all-zero ratings,
    # all-ten ratings, or an empty history) so the divisions never produce NaN.
    rating_total = np.sum(ratings)
    rating_total = 1 if rating_total == 0 else rating_total
    inverse_total = np.sum(inverse_ratings)
    inverse_total = 1 if inverse_total == 0 else inverse_total
    watched_count = max(len(anime_rows), 1)

    watched = anime_embeddings[anime_rows]
    weighted_sum = np.sum(watched * ratings.reshape(-1, 1), axis=0).astype(np.float32)
    inverse_weighted_sum = np.sum(
        watched * inverse_ratings.reshape(-1, 1), axis=0
    ).astype(np.float32)

    features = np.concatenate([
        weighted_sum / rating_total,
        inverse_weighted_sum / inverse_total,
        weighted_sum / watched_count,
    ])
    # Dividing by a NumPy integer scalar can promote to float64 depending on the
    # NumPy version; the network expects float32 inputs.
    return features.astype(np.float32)

def extract_required_format(record, pca, mapping, user_embeddings, anime_embeddings, device):
    """Turn one ratings record into a ``(features, target)`` training pair.

    ``features`` comes from ``input_anime_embeddings``; ``target`` is the PCA
    projection of the embedding of ``record['user_id']``, cast to float32.
    Because ``pca.transform`` is given a one-row batch, the target keeps a
    leading axis of length 1. ``device`` is accepted but not used here.
    """
    features = input_anime_embeddings(record, mapping, anime_embeddings)
    user_row = mapping['user2idx'][record['user_id']]
    projected = pca.transform([user_embeddings[user_row]])
    target = projected.astype(np.float32)
    return features, target

# Pre-bind everything except the record so `transform(record)` returns a
# (features, target) pair. The embeddings are converted to NumPy once, here.
# `device` follows the device detected earlier instead of a hard-coded 'cuda'
# (extract_required_format currently ignores the argument).
transform = partial(
    extract_required_format,
    pca=pca,
    mapping=mapping,
    user_embeddings=user_embeddings.cpu().detach().numpy(),
    anime_embeddings=anime_embeddings.cpu().detach().numpy(),
    device=device.type,
)


In [None]:
class AnimeRatingsDataset(Dataset):
    """Dataset of per-user rating histories read from a pickled DataFrame.

    Each row of the frame must provide ``'user_id'``, ``'anime_id'`` (iterable
    of anime ids) and ``'my_score'`` (ratings, one per anime id). Indexing
    returns a ``(features, target)`` pair:

    * ``features`` -- 1-D float32 array built by ``input_anime_embeddings``;
    * ``target`` -- float32 array of shape ``(1, n_components)`` holding the PCA
      projection of the user's embedding (the leading axis comes from passing a
      one-row batch to ``pca.transform``).
    """

    def __init__(self, sqlite_file, anime_embeddings, user_embeddings, mapping, pca, transform=None):
        """Load the ratings frame and keep NumPy copies of the embeddings.

        Args:
            sqlite_file: Path handed to ``pandas.read_pickle`` (despite the
                name, the file is read as a pickle, not as a SQLite database).
            anime_embeddings: Torch tensor with one anime embedding per row.
            user_embeddings: Torch tensor with one user embedding per row.
            mapping: Dict with ``'anime2idx'`` and ``'user2idx'`` lookups.
            pca: Fitted object exposing ``transform``.
            transform: Accepted for interface compatibility; not used.
        """
        self.df = pd.read_pickle(sqlite_file).reset_index()
        self.anime_embeddings = anime_embeddings.cpu().detach().numpy()
        self.user_embeddings = user_embeddings.cpu().detach().numpy()
        self.mapping = mapping
        self.pca = pca
        # One sample per row. The previous `shape[0] - 1` silently dropped the
        # last record of every file.
        self.length = self.df.shape[0]

    def input_anime_embeddings(self, record):
        """Build the network input vector for one rating history.

        Concatenates, in this order: the rating-weighted embedding sum divided
        by the sum of ratings, the ``(10 - rating)``-weighted sum divided by the
        sum of ``10 - rating``, and the rating-weighted sum divided by the
        number of rated anime. Returns a 1-D float32 array; an empty history
        yields zeros instead of NaN.
        """
        anime_rows = np.array(
            [self.mapping['anime2idx'][anime] for anime in record['anime_id']], dtype=np.int32
        )
        # Ratings are stored as int16, so fractional scores are truncated.
        ratings = np.array(record['my_score'], dtype=np.int16)
        inverse_ratings = 10 - ratings

        # Fall back to 1 whenever a denominator would be 0 so that no division
        # produces NaN (all-zero ratings, all-ten ratings, empty history).
        rating_total = np.sum(ratings)
        rating_total = 1 if rating_total == 0 else rating_total
        inverse_total = np.sum(inverse_ratings)
        inverse_total = 1 if inverse_total == 0 else inverse_total
        watched_count = max(len(anime_rows), 1)

        watched = self.anime_embeddings[anime_rows]
        weighted_sum = np.sum(watched * ratings.reshape(-1, 1), axis=0).astype(np.float32)
        inverse_weighted_sum = np.sum(
            watched * inverse_ratings.reshape(-1, 1), axis=0
        ).astype(np.float32)

        features = np.concatenate([
            weighted_sum / rating_total,
            inverse_weighted_sum / inverse_total,
            weighted_sum / watched_count,
        ])
        # Dividing by a NumPy integer scalar can promote to float64 depending on
        # the NumPy version; the network expects float32 inputs.
        return features.astype(np.float32)

    def extract_required_format(self, record):
        """Return ``(features, target)`` for one row of the ratings frame."""
        features = self.input_anime_embeddings(record)
        user_row = self.mapping['user2idx'][record['user_id']]
        target = self.pca.transform([self.user_embeddings[user_row]]).astype(np.float32)
        return features, target

    def __getitem__(self, index):
        # Samplers may hand over a 0-d tensor; pandas needs a plain int.
        if isinstance(index, torch.Tensor):
            index = int(index)
        return self.extract_required_format(self.df.iloc[index])

    def __len__(self):
        return self.length

In [None]:
# One dataset per ratings file, exposed as a single concatenated dataset.
total_dataset = ConcatDataset([
    AnimeRatingsDataset(
        f,
        mapping=mapping,
        anime_embeddings=anime_embeddings,
        user_embeddings=user_embeddings,
        pca=pca
    ) for f in user_grouped_rating_files
])

# 80/20 split. int() truncates both sizes, so a few samples can be left over;
# `diff` counts them and they are added to the training split so the two sizes
# add up to len(total_dataset), as random_split requires.
train_size = int(len(total_dataset) * 0.8)
test_size = int(len(total_dataset) * 0.2)
total = train_size + test_size
diff = len(total_dataset) - total
train_dataset, test_dataset = random_split(total_dataset, (train_size + diff, test_size))

train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=0
)

# No shuffling for evaluation: a fixed batch composition keeps the
# mean-of-batch validation loss comparable from one epoch to the next.
test_dataloader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, num_workers=0
)

In [None]:
class Net(nn.Module):
    """Feed-forward regressor mapping ``3 * anime_embedding_dim`` inputs to 2 outputs.

    Two hidden blocks (100 and 50 units), each Linear -> ReLU -> LayerNorm ->
    Dropout(0.3), followed by a linear output layer.
    """

    def __init__(self, anime_embedding_dim, batch_size=batch_size):
        super().__init__()

        self.anime_embedding_dim = anime_embedding_dim
        # Stored only; forward() does not read it.
        self.batch_size = batch_size

        # First hidden block.
        self.fc1 = nn.Linear(3 * anime_embedding_dim, 100)
        self.ln1 = nn.LayerNorm(100)
        self.drop1 = nn.Dropout(0.3)

        # Second hidden block.
        self.fc2 = nn.Linear(100, 50)
        self.ln2 = nn.LayerNorm(50)
        self.drop2 = nn.Dropout(0.3)

        # Output layer: one value per target coordinate.
        self.fc3 = nn.Linear(50, 2)

    def forward(self, x):
        """Run the two hidden blocks and the output layer on a batch ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = self.ln1(hidden)
        hidden = self.drop1(hidden)

        hidden = F.relu(self.fc2(hidden))
        hidden = self.ln2(hidden)
        hidden = self.drop2(hidden)

        return self.fc3(hidden)


# The network input is three concatenated vectors of the anime-embedding width,
# so take that width from the embedding table instead of hard-coding 50.
model = Net(anime_embedding_dim=anime_embeddings.shape[1])
model.to(device)
print(model)

In [None]:
# Mean-squared error between the predicted and the PCA-projected coordinates.
criterion = nn.MSELoss()
# Initial optimizer; the training cells create their own one per stage.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
def train(num_epochs, model, optimizer, criterion, train_loader=None, val_loader=None):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Args:
        num_epochs: Number of passes over the training loader.
        model: Network to optimise; batches are moved to the device of its
            parameters.
        optimizer: Optimizer built over ``model.parameters()``.
        criterion: Loss called as ``criterion(prediction, target)``.
        train_loader: Iterable of ``(X, y)`` training batches. Defaults to the
            notebook-level ``train_dataloader``.
        val_loader: Iterable of ``(X, y)`` validation batches. Defaults to the
            notebook-level ``test_dataloader``.

    Returns:
        ``(train_loss, validation_loss)``: two lists holding the mean batch
        loss of every epoch.
    """
    if train_loader is None:
        train_loader = train_dataloader
    if val_loader is None:
        val_loader = test_dataloader

    # Batches must live where the model lives.
    first_param = next(iter(model.parameters()), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    train_loss = []
    validation_loss = []
    for epoch in range(num_epochs):
        print('Running epoch {}'.format(epoch + 1))
        train_epoch_loss = []
        validation_epoch_loss = []

        model.train()
        # Re-enable gradients in case inference code froze the parameters.
        for param in model.parameters():
            param.requires_grad = True

        # Training pass.
        for idx, (X, y) in enumerate(train_loader):
            # PyTorch accumulates gradients, so clear them before every batch.
            model.zero_grad()

            X = X.to(run_device)
            y = y.to(run_device)

            prediction = model(X)

            # The dataset yields targets with an extra length-1 axis, e.g.
            # (batch, 1, 2). Against a (batch, 2) prediction the loss would
            # broadcast to (batch, batch, 2) and compare every prediction with
            # every target, so align the target with the prediction first.
            loss = criterion(prediction, y.reshape(prediction.shape))
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            train_epoch_loss.append(batch_loss)
            if idx % 40 == 0:
                print('Batch {} - Training loss: {}'.format(idx + 1, batch_loss))

        # Validation pass: no_grad() already disables gradient tracking, so the
        # parameters are not frozen here and remain trainable afterwards.
        model.eval()
        with torch.no_grad():
            for idx, (X, y) in enumerate(val_loader):
                X = X.to(run_device)
                y = y.to(run_device)

                prediction = model(X)
                loss = criterion(prediction, y.reshape(prediction.shape))

                batch_loss = loss.item()
                validation_epoch_loss.append(batch_loss)
                if idx % 10 == 0:
                    print('Batch {} - Validation loss: {}'.format(idx + 1, batch_loss))
        model.train()

        train_loss.append(np.mean(train_epoch_loss))
        validation_loss.append(np.mean(validation_epoch_loss))
        print('Epoch {}: Mean training loss: {} Mean validation loss: {}'.format(epoch + 1, train_loss[-1], validation_loss[-1]))

    return train_loss, validation_loss

In [None]:
# Stage 1: fresh Adam optimizer, learning rate 1e-3, 15 epochs.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
train(15, model, optimizer, criterion)

In [None]:
# Stage 2: continue from the current weights with a fresh Adam optimizer,
# learning rate 1e-4, 15 epochs.
optimizer = optim.Adam(model.parameters(), lr=1e-4)
train(15, model, optimizer, criterion)

In [None]:
# Stage 3: continue from the current weights with a fresh Adam optimizer,
# learning rate 1e-5, 15 epochs.
optimizer = optim.Adam(model.parameters(), lr=1e-5)
train(15, model, optimizer, criterion)

In [None]:
from sklearn.neighbors import NearestNeighbors

# Index every user by the 2-D PCA projection of their embedding so predicted
# coordinates can be matched to existing users.
# The constructor arguments are passed by keyword: current scikit-learn
# releases no longer accept them positionally.
neigh = NearestNeighbors(n_neighbors=5, radius=0.4).fit(
    pca.transform(user_embeddings.cpu().detach().numpy())
)

In [None]:
def closest_users(anime_history, anime_ratings, n_neighbors=2):
    """Find the indexed users closest to a hypothetical rating history.

    The history is turned into network features with the first dataset's
    ``input_anime_embeddings``, the model predicts 2-D coordinates from them,
    and ``neigh`` is queried for the nearest indexed users.

    Args:
        anime_history: Sequence of anime ids; each must be a key of
            ``mapping['anime2idx']``.
        anime_ratings: Ratings, one per entry of ``anime_history``. The feature
            builder casts them to int16, so fractional ratings are truncated.
        n_neighbors: Number of neighbours to return (default 2, the value that
            was previously hard-coded).

    Returns:
        ``(distances, indices)`` as returned by ``neigh.kneighbors``.
    """
    # Any dataset instance works here: only its feature builder is needed.
    feature_builder = total_dataset.datasets[0]
    input_embeddings = feature_builder.input_anime_embeddings({
        'anime_id': np.array(anime_history),
        'my_score': np.array(anime_ratings)
    })
    # The network weights are float32; pin the dtype so the input always matches.
    model_input = torch.from_numpy(
        np.asarray(input_embeddings, dtype=np.float32)
    ).view(1, -1).to(device)

    # eval() disables dropout and no_grad() skips gradient tracking. The
    # parameters are not frozen, so the model stays trainable afterwards.
    prediction_model = model.eval()
    with torch.no_grad():
        predicted_embeddings = prediction_model(model_input)
    return neigh.kneighbors(
        predicted_embeddings.cpu().detach().numpy(), n_neighbors, return_distance=True
    )

In [None]:
# Example query: five anime ids with their titles (for the reader only) and
# the ratings a hypothetical user gave them.
anime_id = [20, 21, 1, 121, 136]
titles = ['Naruto', 'One Piece', 'Cowboy Bebop', 'Fullmetal Alchemist', 'Hunter x Hunter']
ratings = [7.5, 7.2, 8, 8.2, 8.5]

# Keep the result under its own name. Assigning it to `closest_users` would
# replace the function with an array and make this cell fail when re-run.
dist, closest_user_idx = closest_users(anime_id, ratings)
dist, closest_user_idx