In [1]:
import sqlite3
import pickle
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from glob import glob
from tqdm import tqdm
from IPython.core.debugger import set_trace
from torch.utils.data import Dataset, DataLoader, ConcatDataset, random_split

In [2]:
torch.cuda.get_device_name(torch.cuda.current_device())

'GeForce GTX 960M'

In [3]:
np.random.seed(42)
device = torch.device("cuda" if torch.cuda.is_available() 
                                  else "cpu")

In [4]:
num_users, num_anime = (108709, 6668)
batch_size = 1024

In [5]:
user_grouped_rating_files = [f for f in glob('datasets/user_grouped_ratings/augmented_10_user_grouped_ratings_processed_*.db')]
user_grouped_rating_files.sort()

In [6]:
class AnimeRatingsDataset(Dataset):
    """Custom Dataset for loading entries from HDF5 databases"""

    def __init__(self, sqlite_file, transform=None):
        self.db = sqlite3.connect(sqlite_file)
        self.cursor = self.db.cursor()
        self.length = self.cursor.execute('SELECT count(blob) from augmented_data;').fetchone()[0]

    def extract_required_format(self, record):
        record_df = pd.DataFrame({'anime_id': record['anime_id'], 'my_score': record['my_score']})
        if len(record_df) > 5:
            #num_of_seq = np.random.randint(5, len(record_df))
            num_of_seq = 6
            indexes = np.random.choice(record_df.index, size=num_of_seq)
        else:
            num_of_seq = np.random.randint(2, len(record_df))
            indexes = np.random.choice(record_df.index, size=num_of_seq)
        train = record_df.iloc[indexes[:-1]]
        predict = record_df.iloc[indexes[-1:]]
        X = np.concatenate([
            [num_of_seq - 1],
            train['anime_id'].values,
            train['my_score'].values,
            predict['anime_id'].values
        ])
        y = predict['my_score'].values
        return X, y

    def __getitem__(self, index):
        if isinstance(index, torch.Tensor):
            index = int(index)
        row = self.cursor.execute('SELECT * from augmented_data where rowid=?', (index + 1, )).fetchone()
        return self.extract_required_format(pickle.loads(row[1]))

    def __len__(self):
        return self.length

In [7]:
total_dataset = ConcatDataset([AnimeRatingsDataset(f) for f in user_grouped_rating_files[:2]])

train_size = int(len(total_dataset) * 0.8)
test_size = int(len(total_dataset) * 0.2)
total = sum([train_size, test_size])
diff = len(total_dataset) - total
train_dataset, test_dataset = random_split(total_dataset, (train_size + diff, test_size))

train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=0
)

test_dataloader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=True, num_workers=0
)

In [8]:
len(train_dataloader), len(test_dataloader)

(152, 38)

In [9]:
class Net(nn.Module):

    def __init__(self, anime_embedding_vocab, anime_embedding_dim,
                 num_past_animes=5, batch_size=batch_size):
        super(Net, self).__init__()
        
        # Store all the constants.
        self.anime_embedding_vocab = anime_embedding_vocab
        self.anime_embedding_dim = anime_embedding_dim
        self.num_past_animes = num_past_animes
        self.batch_size = batch_size

        self.past_anime_embedding = nn.Embedding(anime_embedding_vocab, anime_embedding_dim)
        self.embedding_drop = nn.Dropout(0.2)

        self.cv1 = nn.Conv1d(num_past_animes * anime_embedding_dim, 64, kernel_size=1)
        self.cv2 = nn.Conv1d(64, 128, kernel_size=1)
        self.drop1 = nn.Dropout(0.2)
        self.mp1 = nn.MaxPool1d(num_past_animes)
        
        # Previous history after max pooling, previous ratings, new history
        self.fc1 = nn.Linear(128 + 50 + num_past_animes, 1)

    def forward(self, x):
        current_batch_size = x.shape[0]
        num_past_records = 5
        past_anime_historical_ids = x[:, 1: num_past_records + 1]
        past_anime_ratings = x[:, num_past_records + 1:-1]
        future_anime_id = x[:, -1:]

        past_embeddings = self.embedding_drop(model.past_anime_embedding(past_anime_historical_ids))
        future_embeddings = self.embedding_drop(model.past_anime_embedding(future_anime_id))

        cv1 = self.cv1(past_embeddings.view(current_batch_size, -1, 1))
        cv2 = self.drop1(model.cv2(cv1))

        mp1 = self.mp1(cv2)

        fc_in = torch.cat([
            mp1.permute(1, 2, 0),
            future_embeddings.view(current_batch_size, -1, 1).permute(1, 2, 0),
            past_anime_ratings.view(current_batch_size, 5, 1).float().permute(1, 2, 0)
        ]).permute(2, 0, 1).view(current_batch_size, -1)

        return self.fc1(fc_in)


model = Net(anime_embedding_dim=50, anime_embedding_vocab=num_anime)
model.to(device)
print(model)

Net(
  (past_anime_embedding): Embedding(6668, 50)
  (embedding_drop): Dropout(p=0.2)
  (cv1): Conv1d(250, 64, kernel_size=(1,), stride=(1,))
  (cv2): Conv1d(64, 128, kernel_size=(1,), stride=(1,))
  (drop1): Dropout(p=0.2)
  (mp1): MaxPool1d(kernel_size=5, stride=5, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=183, out_features=1, bias=True)
)


In [18]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
num_epochs = 3

In [None]:
train_loss = []
validation_loss = []
for epoch in range(num_epochs):
    print('Running epoch {}'.format(epoch + 1))
    train_epoch_loss = []
    validation_epoch_loss = []
    model = model.train()
    for param in model.parameters():
        param.requires_grad = True
    # Model Training
    for idx, (X, y) in enumerate(train_dataloader):
        current_batch_size = X.shape[0]
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        X = X.to(device)
        y = y.to(device)
        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of anime indices.
        #record = torch.from_numpy(np.array([ 3, 23, 43, 53,  5,  4,  3, 67], dtype=np.int64)).to(device)

        prediction = model(X)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = criterion(prediction, y.to(device).float())
        loss.backward()
        optimizer.step()
        train_epoch_loss.append(float(loss))
        if idx % 200 == 0:
            print('Batch {} - Training loss: {}'.format(idx + 1, loss))
        
        
    with torch.no_grad():
        model = model.eval()
        for param in model.parameters():
            param.requires_grad = False
        for idx, (X, y) in enumerate(test_dataloader):
            current_batch_size = X.shape[0]
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            X = X.to(device)
            y = y.to(device)

            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Tensors of anime indices.
            #record = torch.from_numpy(np.array([ 3, 23, 43, 53,  5,  4,  3, 67], dtype=np.int64)).to(device)

            prediction = model(X)

            # Step 4. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = criterion(prediction, y.to(device).float())
            validation_epoch_loss.append(float(loss))
            if idx % 200 == 0:
                print('Batch {} - Validation loss: {}'.format(idx + 1, loss))
        model = model.train()

    train_loss.append(np.mean(train_epoch_loss))
    validation_loss.append(np.mean(validation_epoch_loss))
    print('Epoch {}: Mean training loss: {} Mean validation loss: {}'.format(epoch + 1, train_loss[-1], validation_loss[-1]))

Running epoch 1
Batch 1 - Training loss: 9.66856575012207
Batch 1 - Validation loss: 9.940427780151367
Epoch 1: Mean training loss: 10.123922040587978 Mean validation loss: 10.070046324478952
Running epoch 2
Batch 1 - Training loss: 10.210956573486328
Batch 1 - Validation loss: 9.966619491577148
Epoch 2: Mean training loss: 10.121461378900628 Mean validation loss: 10.101415383188348
Running epoch 3
Batch 1 - Training loss: 10.469938278198242


In [20]:
torch.save(model.state_dict, 'CNN.pth')

  "type " + obj.__name__ + ". It won't be checked "
