In [1]:
import sqlite3
import pickle
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from glob import glob
from tqdm import tqdm
from IPython.core.debugger import set_trace
from torch.utils.data import Dataset, DataLoader, ConcatDataset, random_split

In [2]:
# Seed every RNG the notebook draws from. Seeding NumPy alone is not enough:
# torch's own generator drives weight initialisation, dropout, random_split
# and DataLoader shuffling.
np.random.seed(42)
torch.manual_seed(42)

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Load the 'user2idx' / 'anime2idx' lookup tables.
# The context manager closes the file handle deterministically.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('datasets/user_anime_ratings_mapping.pkl', 'rb') as mapping_file:
    mapping = pickle.load(mapping_file)
mapping.keys()

dict_keys(['user2idx', 'anime2idx'])

In [4]:
# Number of distinct users / anime (one embedding row each) and the
# mini-batch size shared by the data loaders.
num_users = len(mapping['user2idx'])
num_anime = len(mapping['anime2idx'])
batch_size = 1024
num_users, num_anime

(108711, 6668)

In [None]:
# Collect the per-split SQLite files in a stable (lexicographic) order.
user_grouped_rating_files = sorted(
    glob('datasets/user_anime_ratings_db_split/user_anime_ratings_*.db')
)

In [None]:
class AnimeRatingsDataset(Dataset):
    """Map-style dataset over the ``user_anime_ratings`` table of one SQLite file.

    Each item is a pair ``(ids, score)`` where ``ids`` is an int64 array
    ``[user_index, anime_index]`` and ``score`` is a one-element float32
    array holding the rating.

    Args:
        sqlite_file: Path of the SQLite database to read from.
        length: Optional number of rows; when omitted it is obtained with a
            ``count`` query at construction time.
        transform: Accepted for signature compatibility; currently unused.
        id_mapping: Optional dict with ``'user2idx'`` and ``'anime2idx'``
            lookups. When omitted, the notebook-level ``mapping`` is used.
    """

    def __init__(self, sqlite_file, length=None, transform=None, id_mapping=None):
        self.db = sqlite3.connect(sqlite_file)
        self.cursor = self.db.cursor()
        self.id_mapping = id_mapping
        if length is None:
            length = self.cursor.execute(
                'SELECT count(user_id) from user_anime_ratings;').fetchone()[0]
        self.length = length

    def __getitem__(self, index):
        """Return ``(ids, score)`` for the row at zero-based ``index``.

        Raises:
            IndexError: If the table has no row for ``index``.
        """
        # sqlite3 can only bind plain Python ints, so unwrap tensor / NumPy scalars.
        if isinstance(index, (torch.Tensor, np.integer)):
            index = int(index)
        # rowid is 1-based; this lookup assumes rowids are contiguous from 1
        # (i.e. no rows were ever deleted) -- TODO confirm for the split files.
        row = self.cursor.execute(
            'SELECT * from user_anime_ratings where rowid=?', (index + 1, )).fetchone()
        if row is None:
            raise IndexError('no rating stored at index {}'.format(index))
        _, user_id, anime_id, my_score = row
        # Resolve the lookup late so a re-loaded notebook-level mapping is picked up.
        lookup = self.id_mapping if self.id_mapping is not None else mapping
        # Explicit int64: np.long is missing from some NumPy releases and is
        # platform-dependent in others.
        ids = np.array(
            [lookup['user2idx'][user_id], lookup['anime2idx'][anime_id]],
            dtype=np.int64)
        score = np.array([my_score], dtype=np.float32)
        return ids, score

    def __len__(self):
        return self.length

    def close(self):
        """Close the underlying SQLite connection."""
        self.db.close()

In [None]:
# import h5py

# class AnimeRatingsDataset(Dataset):
#     """Custom Dataset for loading entries from HDF5 databases"""

#     def __init__(self, sqlite_file, transform=None):
#         self.data = h5py.File(sqlite_file, 'r')['user_anime_ratings']
#         self.length = self.data['block0_values'].shape[0]

#     def __getitem__(self, index):
#         if isinstance(index, torch.Tensor):
#             index = int(index)
#         user_id, anime_id, my_score = self.data['block0_values'][index, :3]
#         return np.array([
#             mapping['user2idx'][user_id], mapping['anime2idx'][anime_id]
#         ], dtype=np.long), np.array([my_score], dtype=np.float32)

#     def __len__(self):
#         return self.length

In [None]:
# One dataset per SQLite split file, exposed as a single indexable dataset.
total_dataset = ConcatDataset([AnimeRatingsDataset(f) for f in user_grouped_rating_files])

# 80/20 split: the test share is floored and the training set absorbs the
# rounding remainder, so the two sizes always sum to len(total_dataset).
test_size = int(len(total_dataset) * 0.2)
train_size = len(total_dataset) - test_size

# A dedicated seeded generator keeps train/test membership identical across
# kernel restarts, so a checkpoint reloaded later is never evaluated on rows
# it was trained on.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    total_dataset, (train_size, test_size), generator=split_generator
)

# Pinned host memory only helps host-to-GPU copies; skip it on CPU-only machines.
use_pinned_memory = torch.cuda.is_available()

train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=0,
    pin_memory=use_pinned_memory
)

# Evaluation order does not affect the mean loss, so no shuffling is needed.
test_dataloader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, num_workers=0,
    pin_memory=use_pinned_memory
)

In [None]:
# Number of batches per epoch in the training and test loaders.
len(train_dataloader), len(test_dataloader)

(24441, 6111)

In [None]:
class Net(nn.Module):
    """Rating regressor over learned user and anime embeddings.

    The anime and user embedding vectors are concatenated (anime first),
    then passed through LayerNorm -> Dropout(0.4) -> Linear(124) ->
    LayerNorm -> Dropout(0.2) -> Linear(1).

    Args:
        num_anime: Number of rows in the anime embedding table.
        num_users: Number of rows in the user embedding table.
        anime_embedding_size: Width of each anime embedding vector.
        user_embedding_size: Width of each user embedding vector.
        batch_size: Stored on the instance only; not used by the layers.
    """

    def __init__(self, num_anime, num_users, anime_embedding_size,
                 user_embedding_size, batch_size=batch_size):
        super().__init__()

        # Keep the hyper-parameters on the instance for later inspection.
        self.num_anime, self.num_users = num_anime, num_users
        self.anime_embedding_size = anime_embedding_size
        self.user_embedding_size = user_embedding_size
        self.batch_size = batch_size

        concat_width = user_embedding_size + anime_embedding_size
        hidden_width = 124

        # Layers are registered in this exact order (user table first), which
        # fixes both the parameter ordering and the initialisation sequence.
        self.emb_user = nn.Embedding(num_users, user_embedding_size)
        self.emb_anime = nn.Embedding(num_anime, anime_embedding_size)
        self.ln1 = nn.LayerNorm(concat_width)
        self.drop1 = nn.Dropout(0.4)
        self.fc1 = nn.Linear(concat_width, hidden_width)
        self.ln2 = nn.LayerNorm(hidden_width)
        self.drop2 = nn.Dropout(0.2)
        self.fc2 = nn.Linear(hidden_width, 1)

    def forward(self, x):
        """Predict one score per row of ``x``.

        ``x`` is indexed as a 2-D integer tensor: column 0 holds user
        indices, column 1 holds anime indices. The result has one column.
        """
        user_idx = x[:, 0]
        anime_idx = x[:, 1]

        anime_vecs = self.emb_anime(anime_idx)
        user_vecs = self.emb_user(user_idx)
        features = torch.cat((anime_vecs, user_vecs), dim=1)

        hidden = self.ln1(features)
        hidden = self.drop1(hidden)
        hidden = self.fc1(hidden)

        score = self.ln2(hidden)
        score = self.drop2(score)
        score = self.fc2(score)
        return score


# Build the network with 50-dimensional embeddings for users and anime and
# place it on the selected device (Module.to returns the module itself).
model = Net(
    num_users=num_users,
    num_anime=num_anime,
    anime_embedding_size=50,
    user_embedding_size=50,
).to(device)

# Mean squared error between predicted and stored scores.
criterion = nn.MSELoss()
print(model)

Net(
  (emb_user): Embedding(108711, 50)
  (emb_anime): Embedding(6668, 50)
  (ln1): LayerNorm(torch.Size([100]), eps=1e-05, elementwise_affine=True)
  (drop1): Dropout(p=0.4)
  (fc1): Linear(in_features=100, out_features=124, bias=True)
  (ln2): LayerNorm(torch.Size([124]), eps=1e-05, elementwise_affine=True)
  (drop2): Dropout(p=0.2)
  (fc2): Linear(in_features=124, out_features=1, bias=True)
)


In [None]:
def train(num_epochs, model, optimizer, criterion, train_loader=None, test_loader=None):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Args:
        num_epochs: Number of passes over the training loader.
        model: Module to optimise; it is updated in place.
        optimizer: Optimizer constructed over ``model``'s parameters.
        criterion: Callable ``criterion(prediction, target)`` returning a
            scalar loss tensor.
        train_loader: Iterable of ``(X, y)`` batches used for training.
            Defaults to the notebook-level ``train_dataloader``.
        test_loader: Iterable of ``(X, y)`` batches used for validation.
            Defaults to the notebook-level ``test_dataloader``.

    Returns:
        ``(train_loss, validation_loss)``: two lists of floats holding the
        mean per-batch loss of every epoch.
    """
    if train_loader is None:
        train_loader = train_dataloader
    if test_loader is None:
        test_loader = test_dataloader

    # Send batches to wherever the model lives instead of relying on a global.
    target_device = next(model.parameters()).device

    train_loss = []
    validation_loss = []
    for epoch in range(num_epochs):
        print('Running epoch {}'.format(epoch + 1))
        train_epoch_loss = []
        validation_epoch_loss = []

        # ---- Training pass ----
        model.train()
        for idx, (X, y) in enumerate(train_loader):
            # PyTorch accumulates gradients, so clear them before each batch.
            model.zero_grad()

            X = X.to(target_device, non_blocking=True)
            y = y.to(target_device, non_blocking=True)

            prediction = model(X)
            loss = criterion(prediction, y)
            loss.backward()
            optimizer.step()

            loss_value = loss.item()
            train_epoch_loss.append(loss_value)
            if idx % 12000 == 0:
                print('Batch {} - Training loss: {}'.format(idx + 1, loss_value))

        # ---- Validation pass ----
        # no_grad() already disables gradient tracking, so the parameters'
        # requires_grad flags are left untouched: the model stays trainable
        # after the function returns and deliberately frozen layers stay frozen.
        model.eval()
        with torch.no_grad():
            for idx, (X, y) in enumerate(test_loader):
                X = X.to(target_device, non_blocking=True)
                y = y.to(target_device, non_blocking=True)

                prediction = model(X)
                loss_value = criterion(prediction, y).item()
                validation_epoch_loss.append(loss_value)
                if idx % 3000 == 0:
                    print('Batch {} - Validation loss: {}'.format(idx + 1, loss_value))
        model.train()

        train_loss.append(float(np.mean(train_epoch_loss)))
        validation_loss.append(float(np.mean(validation_epoch_loss)))
        print('Epoch {}: Mean training loss: {} Mean validation loss: {}'.format(
            epoch + 1, train_loss[-1], validation_loss[-1]))

    return train_loss, validation_loss

In [None]:
# Alternative kept for reference: same optimizer with L2 weight decay.
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
# First training stage: Adam at lr=1e-3 for 6 epochs.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
train(num_epochs=6, optimizer=optimizer, model=model, criterion=criterion)

Running epoch 1
Batch 1 - Training loss: 38.352134704589844
Batch 12001 - Training loss: 11.515974044799805
Batch 24001 - Training loss: 10.807840347290039
Batch 1 - Validation loss: 10.270849227905273
Batch 3001 - Validation loss: 10.612107276916504
Batch 6001 - Validation loss: 10.6364107131958
Epoch 1: Mean training loss: 11.810130416417804 Mean validation loss: 10.314588454678335
Running epoch 2
Batch 1 - Training loss: 10.771699905395508
Batch 12001 - Training loss: 10.993139266967773


In [None]:
# Save the weights themselves: state_dict must be *called*. Passing the bound
# method `model.state_dict` pickles the whole module object instead of a
# plain tensor dictionary.
# Checkpoints written by the earlier form of this cell contain that method
# and have to be read back as `torch.load(path)()`.
torch.save(model.state_dict(), 'pytorch_nn_epoch6_embedding_fix.pt')

In [None]:
# model.load_state_dict(torch.load('pytorch_nn_epoch4_embedding_fix_10.23-10.21.pt')())

In [None]:
# Alternative kept for reference: same optimizer with L2 weight decay.
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
# Second training stage: keep training the same model for 10 more epochs with
# a fresh Adam optimizer (its moment estimates restart) at lr=1e-4.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
train(num_epochs=10, optimizer=optimizer, model=model, criterion=criterion)

In [None]:
# torch.save(model.state_dict, 'pytorch_nn_epoch25_embedding_fix.pt')

In [None]:
# torch.save(model.state_dict, 'pytorch_nn_9.885-9.8625.pt')

In [None]:
# Read the embedding tables by attribute instead of by position in
# named_parameters(), so the lookup stays correct if layers are ever
# added or reordered in the model definition.
user_embeddings = model.emb_user.weight
anime_embeddings = model.emb_anime.weight
user_embeddings.shape, anime_embeddings.shape

In [None]:
# pickle.dump(user_embeddings, open('user_embed_pytorch_nn_epoch4_embedding_fix_10.23-10.21.pkl', 'wb'))

In [None]:
# pickle.dump(anime_embeddings, open('anime_embed_pytorch_nn_epoch4_embedding_fix_10.23-10.21.pkl', 'wb'))

In [None]:
# Inverse of 'user2idx' (index -> user id), stored alongside the forward
# lookups so embedding rows can be traced back to users.
mapping['idx2user'] = {v:k for k, v in mapping['user2idx'].items()}

In [None]:
# Rank users by the L2 norm of their learned embedding vector, smallest first.
user_vectors = user_embeddings.to('cpu').detach().numpy()
user_norms = [
    (mapping['idx2user'][row], np.linalg.norm(vector))
    for row, vector in enumerate(user_vectors)
]
sorted_user_embeddings = sorted(user_norms, key=lambda pair: pair[1])
("top min", sorted_user_embeddings[:10]), ("top max", sorted_user_embeddings[-10:])

In [None]:
# Rank anime by the L2 norm of their learned embedding vector, smallest first.
# NOTE(review): entries are keyed by embedding row index, not by the original
# anime id (the user cell maps indices back to ids) -- confirm this is intended.
anime_vectors = anime_embeddings.to('cpu').detach().numpy()
anime_norms = [
    (row, np.linalg.norm(vector))
    for row, vector in enumerate(anime_vectors)
]
sorted_anime_embeddings = sorted(anime_norms, key=lambda pair: pair[1])
("top min", sorted_anime_embeddings[:10]), ("top max", sorted_anime_embeddings[-10:])