In [112]:
%load_ext autoreload
%autoreload 2

import recommender_utils
import math
import copy

import model_CNN as model_cnn
import torch
import numpy as np
from pandas import concat


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [79]:
movie_df, movie_feature_headers, num_feature_headers = recommender_utils.get_movies_data(filepath='data_small/movies.csv', separator=r',', movies_columns_to_drop=['genres'], only_genres=False)

data_train = recommender_utils.get_ratings_data(filepath='data_small/train.csv', separator=r',', dtypes=recommender_utils.dtypes)
data_val = recommender_utils.get_ratings_data(filepath='data_small/validate.csv', separator=r',', dtypes=recommender_utils.dtypes)
data_test = recommender_utils.get_ratings_data(filepath='data_small/test.csv', separator=r',', dtypes=recommender_utils.dtypes)

user_to_index = recommender_utils.map_to_new_indexes(concat([data_train, data_val, data_test]), column='user')
movie_to_index = recommender_utils.map_to_new_indexes(concat([data_train, data_val, data_test]), column='item')

(num_users_train, num_movies_train), (X_train, y_train) = recommender_utils.create_dataset_cnn(data_train, user_to_index, movie_to_index)
(num_users_val, num_movies_val), (X_val, y_val) = recommender_utils.create_dataset_cnn(data_val, user_to_index, movie_to_index)
(num_users_test, num_movies_test), (X_test, y_test) = recommender_utils.create_dataset_cnn(data_test, user_to_index, movie_to_index)


print("Dataset treningowy:")
print(f"\tEmbeddings: {num_users_train} uzytkowników, {num_movies_train} filmów")
print(f"\tX wymiar: {X_train.shape}")
print(f"\tY shape: {y_train.shape}")
print(X_train.user_id.max())
print(X_train.movie_id.max())

print("Dataset walidacyjny:")
print(f"\tEmbeddings: {num_users_val} uzytkowników, {num_movies_val} filmów")
print(f"\tX wymiar: {X_val.shape}")
print(f"\tY shape: {y_val.shape}")
print(X_val.user_id.max())
print(X_val.movie_id.max())

print("Dataset testowy:")
print(f"\tEmbeddings: {num_users_test} uzytkowników, {num_movies_test} filmów")
print(f"\tX wymiar: {X_test.shape}")
print(f"\tY shape: {y_test.shape}")
print(X_test.user_id.max())
print(X_test.movie_id.max())

datasets = {'train': (X_train, y_train), 'val': (X_val, y_val)}
dataset_sizes = {'train': len(X_train), 'val': len(X_val)}

minmax = y_train.min().astype(float), y_train.max().astype(float)

num_users_all = num_users_train + num_users_val + num_users_test
num_movies_all = num_movies_train + num_movies_val + num_movies_test


Dataset treningowy:
	Embeddings: 427 uzytkowników, 8500 filmów
	X wymiar: (68495, 2)
	Y shape: (68495,)
426
8499
Dataset walidacyjny:
	Embeddings: 61 uzytkowników, 5536 filmów
	X wymiar: (17280, 2)
	Y shape: (17280,)
487
9448
Dataset testowy:
	Embeddings: 122 uzytkowników, 3975 filmów
	X wymiar: (15061, 2)
	Y shape: (15061,)
609
9723


In [228]:
model = model_cnn.ConvEmbeddingNet(
    n_users=num_users_all, n_movies=num_movies_all, 
    n_factors=1000, hidden=[500, 500, 500], 
    embedding_dropout=0.05, dropouts=[0.5, 0.5, 0.25])

In [203]:
model

ConvEmbeddingNet(
  (u): Embedding(610, 1000)
  (m): Embedding(18011, 1000)
  (drop): Dropout(p=0.05, inplace=False)
  (hidden): Sequential(
    (0): Conv1d(2000, 500, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Conv1d(500, 500, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (6): Conv1d(500, 500, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
    (7): ReLU()
    (8): Dropout(p=0.25, inplace=False)
  )
  (fc): Linear(in_features=500, out_features=1, bias=True)
)

In [229]:
RANDOM_STATE = 1
torch.manual_seed(RANDOM_STATE)
torch.cuda.manual_seed(RANDOM_STATE)
lr = 1e-5
wd = 1e-4
bs = 2000
n_epochs = 1000
patience = 10
no_improvements = 0
best_loss = np.inf
best_weights = None
use_scheduler = False
history = []
lr_history = []

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

model.to(device)
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
iterations_per_epoch = int(math.ceil(dataset_sizes['train'] // bs))
if use_scheduler:
    scheduler = model_cnn.CyclicLR(optimizer, model_cnn.cosine(t_max=iterations_per_epoch * 2, eta_min=lr/10))

for epoch in range(n_epochs):
    stats = {'epoch': epoch + 1, 'total': n_epochs}
    
    for phase in ('train', 'val'):
        training = phase == 'train'
        running_loss = 0.0
        n_batches = 0
        
        for batch in model_cnn.batches(*datasets[phase], shuffle=training, bs=bs):
            x_batch, y_batch = [b.to(device) for b in batch]
            optimizer.zero_grad()
        
            # compute gradients only during 'train' phase
            with torch.set_grad_enabled(training):
                outputs = model(x_batch[:, 0], x_batch[:, 1], minmax)
                loss = criterion(outputs, y_batch)
                
                # don't update weights and rates when in 'val' phase
                if training:
                    if use_scheduler:
                        scheduler.step()
                    loss.backward()
                    optimizer.step()
                    if use_scheduler:
                        lr_history.extend(scheduler.get_lr())
                    
            running_loss += loss.item()
            
        epoch_loss = running_loss / dataset_sizes[phase]
        stats[phase] = epoch_loss
        
        # early stopping: save weights of the best model so far
        if phase == 'val':
            if epoch_loss < best_loss:
                print('loss improvement on epoch: %d' % (epoch + 1))
                best_loss = epoch_loss
                best_weights = copy.deepcopy(model.state_dict())
                no_improvements = 0
            else:
                no_improvements += 1
                
    history.append(stats)
    print('[{epoch:03d}/{total:03d}] train: {train:.4f} - val: {val:.4f}'.format(**stats))
    if no_improvements >= patience:
        print('early stopping after epoch {epoch:03d}'.format(**stats))
        break

loss improvement on epoch: 1
[001/1000] train: 1.5817 - val: 1.2020
loss improvement on epoch: 2
[002/1000] train: 1.3583 - val: 0.9620
loss improvement on epoch: 3
[003/1000] train: 1.1427 - val: 0.8972
loss improvement on epoch: 4
[004/1000] train: 1.1181 - val: 0.8943
[005/1000] train: 1.1173 - val: 0.8949
[006/1000] train: 1.1183 - val: 0.8947
loss improvement on epoch: 7
[007/1000] train: 1.1178 - val: 0.8925
loss improvement on epoch: 8
[008/1000] train: 1.1180 - val: 0.8921
[009/1000] train: 1.1173 - val: 0.8927
loss improvement on epoch: 10
[010/1000] train: 1.1177 - val: 0.8914
[011/1000] train: 1.1174 - val: 0.8915
loss improvement on epoch: 12
[012/1000] train: 1.1172 - val: 0.8907
loss improvement on epoch: 13
[013/1000] train: 1.1183 - val: 0.8891
[014/1000] train: 1.1182 - val: 0.8908
[015/1000] train: 1.1186 - val: 0.8902
[016/1000] train: 1.1180 - val: 0.8896
loss improvement on epoch: 17
[017/1000] train: 1.1169 - val: 0.8884
[018/1000] train: 1.1174 - val: 0.8887
loss

In [205]:
model

ConvEmbeddingNet(
  (u): Embedding(610, 1000)
  (m): Embedding(18011, 1000)
  (drop): Dropout(p=0.05, inplace=False)
  (hidden): Sequential(
    (0): Conv1d(2000, 500, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Conv1d(500, 500, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (6): Conv1d(500, 500, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
    (7): ReLU()
    (8): Dropout(p=0.25, inplace=False)
  )
  (fc): Linear(in_features=500, out_features=1, bias=True)
)

In [226]:
import csv
name = "cnn_100epochs_wo_features_100b_50f_5ker_2pad_wd1e-4_lr1e-5"
torch.save(best_weights, f"models/mgr/{name}.pt")
header = ['epoch', 'total', 'train', 'val']
rows = []

with open(f'models/mgr/{name}.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=header)
    writer.writeheader()
    writer.writerows(history)

In [62]:
print(history)
print(stats)

[{'epoch': 1, 'total': 100, 'train': 1.6522281200348508, 'val': 1.4264369673199124}, {'epoch': 2, 'total': 100, 'train': 1.6346701385522553, 'val': 1.4094229998411956}, {'epoch': 3, 'total': 100, 'train': 1.615244218417089, 'val': 1.3908839631963659}, {'epoch': 4, 'total': 100, 'train': 1.5942582345198555, 'val': 1.3702423713825367}, {'epoch': 5, 'total': 100, 'train': 1.5701569643862292, 'val': 1.3472909609476724}, {'epoch': 6, 'total': 100, 'train': 1.543974967949303, 'val': 1.320873851246304}, {'epoch': 7, 'total': 100, 'train': 1.5132552462343283, 'val': 1.2922321725774695}, {'epoch': 8, 'total': 100, 'train': 1.48048256010076, 'val': 1.2610000389593619}, {'epoch': 9, 'total': 100, 'train': 1.444400892557737, 'val': 1.228015998557762}, {'epoch': 10, 'total': 100, 'train': 1.40622381667985, 'val': 1.1942142371778135}, {'epoch': 11, 'total': 100, 'train': 1.3691254346020352, 'val': 1.1604599802582352}, {'epoch': 12, 'total': 100, 'train': 1.3303933357825217, 'val': 1.128865404482241}

In [94]:
running_loss = 0.0
n_batches = 0

for batch in model_cnn.batches(X_test, y_test, shuffle=False, bs=bs):
    x_batch, y_batch = [b.to(device) for b in batch]
    with torch.set_grad_enabled(False):
        outputs = model(x_batch[:, 0], x_batch[:, 1], minmax)
        loss = criterion(outputs, y_batch)
    running_loss += loss.item()
    
test_loss = running_loss / len(X_test)
print(f"Test loss: {test_loss}")

Test loss: 1.0153876271003128
