In [103]:
%load_ext autoreload
%autoreload 2

import recommender_utils
import numpy as np
import scipy.sparse as sp
import pandas as pd
import copy

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [93]:
movie_df, movie_feature_headers, num_feature_headers = recommender_utils.get_movies_data(filepath='data_small/movies.csv', separator=r',', movies_columns_to_drop=['genres'], genres=True, other_features=[3,4])

data_train = recommender_utils.get_ratings_data(filepath='data_small/train.csv', separator=r',', dtypes=recommender_utils.dtypes)
data_val = recommender_utils.get_ratings_data(filepath='data_small/validate.csv', separator=r',', dtypes=recommender_utils.dtypes)
data_test = recommender_utils.get_ratings_data(filepath='data_small/test.csv', separator=r',', dtypes=recommender_utils.dtypes)

data_array_train = np.array(data_train.values.tolist())
data_array_val = np.array(data_val.values.tolist())
data_array_test = np.array(data_test.values.tolist())

# add features for whole dataset TODO
train_adjacency_mx, train_labels, train_user_idx, train_item_idx, train_item_dict = recommender_utils.preprocess_data_to_graph(data_array_train, dtypes=recommender_utils.dtypes, class_values=recommender_utils.class_values)
train_item_features = sp.csr_matrix(recommender_utils.get_movies_features(movie_df, train_item_dict, movie_feature_headers, num_feature_headers))
train_user_features = sp.csr_matrix(recommender_utils.get_user_features(train_user_idx))
val_adjacency_mx, val_labels, val_user_idx, val_item_idx, val_item_dict = recommender_utils.preprocess_data_to_graph(data_array_val, dtypes=recommender_utils.dtypes, class_values=recommender_utils.class_values)
val_item_features = sp.csr_matrix(recommender_utils.get_movies_features(movie_df, val_item_dict, movie_feature_headers, num_feature_headers))
val_user_features = sp.csr_matrix(recommender_utils.get_user_features(val_user_idx))
test_adjacency_mx, test_labels, test_user_idx, test_item_idx, test_item_dict = recommender_utils.preprocess_data_to_graph(data_array_test, dtypes=recommender_utils.dtypes, class_values=recommender_utils.class_values)
test_item_features = sp.csr_matrix(recommender_utils.get_movies_features(movie_df, test_item_dict, movie_feature_headers, num_feature_headers))
test_user_features = sp.csr_matrix(recommender_utils.get_user_features(test_user_idx))

print("Train item features shape: "+str(train_item_features.shape))
print("Validation item features shape: "+str(val_item_features.shape))
print("Test item features shape: "+str(test_item_features.shape))


Train item features shape: (8500, 22)
Validation item features shape: (5536, 22)
Test item features shape: (3975, 22)


In [94]:
movie_feature_headers



array(['popularity', 'mean_unbiased', '(no genres listed)', 'Action',
       'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'], dtype=object)

In [95]:
%autoreload 2
import torch
import math
from torch.optim import Adam
import torch.nn.functional as F

import model_GNN as model_gnn
from IGMC.util_functions import *
from torch_geometric.loader import DataLoader


In [96]:
use_features = True
train_item_features_array = train_item_features.toarray() if use_features else None
train_user_features_array = train_user_features.toarray() if use_features else None
test_item_features_array = test_item_features.toarray() if use_features else None
test_user_features_array = test_user_features.toarray() if use_features else None
val_item_features_array = val_item_features.toarray() if use_features else None
val_user_features_array = val_user_features.toarray() if use_features else None

train_dataset = MyDynamicDataset(root='data_test/processed/train', A=train_adjacency_mx, 
    links=(train_user_idx, train_item_idx), labels=train_labels, h=1, sample_ratio=1.0, 
    max_nodes_per_hop=200, u_features=train_user_features_array, v_features=train_item_features_array, class_values=recommender_utils.class_values)
test_dataset = MyDynamicDataset(root='data_test/processed/test', A=test_adjacency_mx, 
    links=(test_user_idx, test_item_idx), labels=test_labels, h=1, sample_ratio=1.0, 
    max_nodes_per_hop=200, u_features=test_user_features_array, v_features=test_item_features_array, class_values=recommender_utils.class_values)
val_dataset = MyDynamicDataset(root='data_test/processed/val', A=val_adjacency_mx, 
    links=(val_user_idx, val_item_idx), labels=val_labels, h=1, sample_ratio=1.0, 
    max_nodes_per_hop=200, u_features=val_user_features_array, v_features=val_item_features_array, class_values=recommender_utils.class_values)

In [97]:
LR = 1e-3
EPOCHS = 80
BATCH_SIZE = 50
LR_DECAY_STEP = 20
LR_DECAY_VALUE = 10

train_loader = DataLoader(train_dataset, BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, BATCH_SIZE, shuffle=False, num_workers=2)
test_loader = DataLoader(test_dataset, BATCH_SIZE, shuffle=False, num_workers=2)

device = torch.device("cpu")
if use_features:
    model = model_gnn.IGMC(side_features=True, n_side_features=train_item_features_array.shape[1])
else:
    model = model_gnn.IGMC()
model.to(device)
model.reset_parameters()
optimizer = Adam(model.parameters(), lr=LR, weight_decay=0)

In [98]:
history = []
batches_per_epoch_train = len(train_loader)
batches_per_epoch_val = len(val_loader)
best_weights = None
best_loss = np.inf
patience = 5
no_improvements = 0

for epoch in range(1, EPOCHS+1):
    stats = {'epoch': epoch, 'total': EPOCHS}
    model.train()
    train_loss_all = 0
    for i, train_batch in enumerate(train_loader):
        if i % 100 == 0 or i % batches_per_epoch_train == 0:
            print(f"{i}/{batches_per_epoch_train}")
        optimizer.zero_grad()
        train_batch = train_batch.to(device)
        y_pred = model(train_batch)
        y_true = train_batch.y
        train_loss = F.mse_loss(y_pred, y_true)
        train_loss.backward()
        train_loss_all += BATCH_SIZE * float(train_loss)
        optimizer.step()
    train_loss_all = train_loss_all / len(train_loader.dataset)
    stats["train"] = train_loss_all
    print('epoch', epoch,'; train loss', train_loss_all)

    val_loss_all = 0
    for i, val_batch in enumerate(val_loader):
        if i % 100 == 0 or i % batches_per_epoch_val == 0:
            print(f"{i}/{batches_per_epoch_val}")
        optimizer.zero_grad()
        val_batch = val_batch.to(device)
        y_pred = model(val_batch)
        y_true = val_batch.y
        val_loss = F.mse_loss(y_pred, y_true)
        val_loss_all += BATCH_SIZE * float(val_loss)
    val_loss_all = val_loss_all / len(val_loader.dataset)
    stats["val"] = val_loss_all
    print('epoch', epoch,'; val loss', val_loss_all)

    history.append(stats)
    if epoch % LR_DECAY_STEP == 0:
      for param_group in optimizer.param_groups:
          param_group['lr'] = param_group['lr'] / LR_DECAY_VALUE
    
    if val_loss_all < best_loss:
        print('loss improvement on epoch: %d' % (epoch))
        best_loss = val_loss_all
        best_weights = copy.deepcopy(model.state_dict())
        no_improvements = 0
    else:
        no_improvements += 1
    
    if no_improvements >= patience:
        print('early stopping after epoch {epoch:03d}'.format(**stats))
        break

0/1370




100/1370
200/1370
300/1370
400/1370
500/1370
600/1370
700/1370
800/1370
900/1370
1000/1370
1100/1370
1200/1370
1300/1370
epoch 1 ; train loss 1.0956418637820722
0/346
100/346
200/346
300/346
epoch 1 ; val loss 0.8760156162531564
loss improvement on epoch: 1
0/1370
100/1370
200/1370
300/1370
400/1370
500/1370
600/1370
700/1370
800/1370
900/1370
1000/1370
1100/1370
1200/1370
1300/1370
epoch 2 ; train loss 0.9030332199778294
0/346
100/346
200/346
300/346
epoch 2 ; val loss 0.8104765158215607
loss improvement on epoch: 2
0/1370
100/1370
200/1370
300/1370
400/1370
500/1370
600/1370
700/1370
800/1370
900/1370
1000/1370
1100/1370
1200/1370
1300/1370
epoch 3 ; train loss 0.8599454381620982
0/346
100/346
200/346
300/346
epoch 3 ; val loss 0.7670360566461804
loss improvement on epoch: 3
0/1370
100/1370
200/1370
300/1370
400/1370
500/1370
600/1370
700/1370
800/1370
900/1370
1000/1370
1100/1370
1200/1370
1300/1370
epoch 4 ; train loss 0.8306757568512462
0/346
100/346
200/346
300/346
epoch 4 ; val 

In [99]:
import csv

name = f"gnn_genres_mean_unbiased_popularity_81m_EPOCHS_{EPOCHS}_LR_{LR}_BATCH_SIZE_{BATCH_SIZE}_LR_DECAY_STEP_{LR_DECAY_STEP}_LR_DECAY_VALUE_{LR_DECAY_VALUE}"
torch.save(best_weights, f"models/mgr/{name}.pt")
header = ['epoch', 'total', 'train', 'val']
rows = []

with open(f'models/mgr/{name}.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=header)
    writer.writeheader()
    writer.writerows(history)

In [129]:
use_features = False
movie_df, movie_feature_headers, num_feature_headers = recommender_utils.get_movies_data(filepath='data_small/movies.csv', separator=r',', movies_columns_to_drop=['genres'], genres=True, other_features=None)
print(movie_feature_headers)
data_test = recommender_utils.get_ratings_data(filepath='data_small/test.csv', separator=r',', dtypes=recommender_utils.dtypes)
data_array_test = np.array(data_test.values.tolist())
test_adjacency_mx, test_labels, test_user_idx, test_item_idx, test_item_dict = recommender_utils.preprocess_data_to_graph(data_array_test, dtypes=recommender_utils.dtypes, class_values=recommender_utils.class_values)
test_item_features = sp.csr_matrix(recommender_utils.get_movies_features(movie_df, test_item_dict, movie_feature_headers, num_feature_headers))
test_user_features = sp.csr_matrix(recommender_utils.get_user_features(test_user_idx))
test_item_features_array = test_item_features.toarray() if use_features else None
test_user_features_array = test_user_features.toarray() if use_features else None
test_dataset = MyDynamicDataset(root='data_test/processed/test', A=test_adjacency_mx, 
    links=(test_user_idx, test_item_idx), labels=test_labels, h=1, sample_ratio=1.0, 
    max_nodes_per_hop=200, u_features=test_user_features_array, v_features=test_item_features_array, class_values=recommender_utils.class_values)
test_loader = DataLoader(test_dataset, BATCH_SIZE, shuffle=False, num_workers=2)
pre_path="/Users/user/Documents/Nauka/Studia/Magisterskie/Magisterka/Recommender/models/mgr/"
path=f"{pre_path}gnn_no_features_90m_EPOCHS_80_LR_0.001_BATCH_SIZE_50_LR_DECAY_STEP_20_LR_DECAY_VALUE_10.pt"
if use_features:
    model = model_gnn.IGMC(side_features=True, n_side_features=test_item_features_array.shape[1])
else:
    model = model_gnn.IGMC()
model.load_state_dict(torch.load(path))
model.eval()
model

['(no genres listed)' 'Action' 'Adventure' 'Animation' 'Children' 'Comedy'
 'Crime' 'Documentary' 'Drama' 'Fantasy' 'Film-Noir' 'Horror' 'IMAX'
 'Musical' 'Mystery' 'Romance' 'Sci-Fi' 'Thriller' 'War' 'Western']


RuntimeError: Error(s) in loading state_dict for IGMC:
	size mismatch for linear_layer1.weight: copying a param with shape torch.Size([128, 277]) from checkpoint, the shape in current model is torch.Size([128, 256]).

In [127]:
batches_in_eval = len(test_loader)
model.eval()
test_loss = 0
for i, test_batch in enumerate(test_loader):
    print(f"{i}/{batches_in_eval}")
    test_batch = test_batch.to(device)
    with torch.no_grad():
        y_pred = model(test_batch)
    y_true = test_batch.y
    test_loss += F.mse_loss(y_pred, y_true, reduction='sum')
    torch.cuda.empty_cache()
mse_loss = float(test_loss) / len(test_loader.dataset)

print('test MSE loss', mse_loss)
print('test RMSE loss', math.sqrt(mse_loss))

0/302
1/302
2/302
3/302




4/302
5/302
6/302
7/302
8/302
9/302
10/302
11/302
12/302
13/302
14/302
15/302
16/302
17/302
18/302
19/302
20/302
21/302
22/302
23/302
24/302
25/302
26/302
27/302
28/302
29/302
30/302
31/302
32/302
33/302
34/302
35/302
36/302
37/302
38/302
39/302
40/302
41/302
42/302
43/302
44/302
45/302
46/302
47/302
48/302
49/302
50/302
51/302
52/302
53/302
54/302
55/302
56/302
57/302
58/302
59/302
60/302
61/302
62/302
63/302
64/302
65/302
66/302
67/302
68/302
69/302
70/302
71/302
72/302
73/302
74/302
75/302
76/302
77/302
78/302
79/302
80/302
81/302
82/302
83/302
84/302
85/302
86/302
87/302
88/302
89/302
90/302
91/302
92/302
93/302
94/302
95/302
96/302
97/302
98/302
99/302
100/302
101/302
102/302
103/302
104/302
105/302
106/302
107/302
108/302
109/302
110/302
111/302
112/302
113/302
114/302
115/302
116/302
117/302
118/302
119/302
120/302
121/302
122/302
123/302
124/302
125/302
126/302
127/302
128/302
129/302
130/302
131/302
132/302
133/302
134/302
135/302
136/302
137/302
138/302
139/302
140/302
141/30