In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp

In [2]:
def map_data(data):
    """
    From IGMC data_utils.py
    Map data to proper indices in case they are not in a continues [0, N) range

    Parameters
    ----------
    data : np.int32 arrays

    Returns
    -------
    mapped_data : np.int32 arrays
    n : length of mapped_data

    """
    uniq = list(set(data))

    id_dict = {old: new for new, old in enumerate(sorted(uniq))}
    data = np.array([id_dict[x] for x in data])
    n = len(uniq)

    return data, id_dict, n

In [3]:
def get_movies_data(filepath=None, separator=None, movies_columns_to_drop=None):
    movie_headers = ['item', 'title', 'genres', 'mean', 'popularity', 'mean_unbiased', 
                    '(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children',
                    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
                    'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
                    'War', 'Western']

    movie_df = pd.read_csv(filepath, sep=separator, header=None,
                           names=movie_headers, engine='python', encoding='ISO-8859-1', skiprows=1)
    for column_to_drop in movies_columns_to_drop:
        if column_to_drop in movie_df.columns:
            movie_df.drop(column_to_drop, axis=1, inplace=True)
    genre_headers = movie_df.columns.values[7:]
    num_genres = genre_headers.shape[0]
    return movie_df, genre_headers, num_genres

In [4]:
def get_ratings_data(filepath=None, separator=None, dtypes=None):
    return pd.read_csv(
        filepath, sep=separator, header=None,
        names=["user", "item", "rating", "timestamp"], dtype=dtypes, skiprows=1)

In [35]:
def preprocess_data_to_graph(data_array, testing=False, rating_map=None, post_rating_map=None, ratio=1.0, dtypes=None, class_values=None):
    """
    Loads official train/test split and uses 10% of training samples for validaiton
    For each split computes 1-of-num_classes labels. Also computes training
    adjacency matrix. Assumes flattening happens everywhere in row-major fashion.
    """
    if ratio < 1.0:
        data_array = data_array[data_array[:, -1].argsort()[:int(ratio*len(data_array))]]

    user_nodes_ratings = data_array[:, 0].astype(dtypes['user'])
    item_nodes_ratings = data_array[:, 1].astype(dtypes['item'])
    ratings = data_array[:, 2].astype(dtypes['rating'])
    if rating_map is not None:
        for i, x in enumerate(ratings):
            ratings[i] = rating_map[x]

    user_nodes_ratings, user_dict, num_users = map_data(user_nodes_ratings)
    item_nodes_ratings, item_dict, num_items = map_data(item_nodes_ratings)

    user_nodes_ratings, item_nodes_ratings, ratings = user_nodes_ratings.astype(np.int64), item_nodes_ratings.astype(np.int32), ratings.astype(np.float64)

    neutral_rating = -1  # int(np.ceil(np.float(num_classes)/2.)) - 1

    # assumes that ratings_train contains at least one example of every rating type
    rating_dict = {r: i for i, r in enumerate(class_values.tolist())}

    labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32)
    labels[user_nodes_ratings, item_nodes_ratings] = np.array([rating_dict[r] for r in ratings])

    for i in range(len(user_nodes_ratings)):
        assert(labels[user_nodes_ratings[i], item_nodes_ratings[i]] == rating_dict[ratings[i]])

    labels = labels.reshape([-1])

    # number of test and validation edges, see cf-nade code

    num_edges = data_array.shape[0]

    pairs_nonzero = np.array([[u, v] for u, v in zip(user_nodes_ratings, item_nodes_ratings)])
    idx_nonzero = np.array([u * num_items + v for u, v in pairs_nonzero])

    for i in range(len(ratings)):
        assert(labels[idx_nonzero[i]] == rating_dict[ratings[i]])

    assert(len(idx_nonzero) == num_edges)

    user_idx, item_idx = pairs_nonzero.transpose()

    # create labels
    nonzero_labels = labels[idx_nonzero]

    # make training adjacency matrix
    rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32)
    if post_rating_map is None:
        rating_mx_train[idx_nonzero] = labels[idx_nonzero].astype(np.float32) + 1.
    else:
        rating_mx_train[idx_nonzero] = np.array([post_rating_map[r] for r in class_values[labels[idx_nonzero]]]) + 1.
    rating_mx_train = sp.csr_matrix(rating_mx_train.reshape(num_users, num_items))

    return rating_mx_train, nonzero_labels, user_idx, item_idx, item_dict

In [7]:
def get_movies_features(movie_df, item_dict, genre_headers, num_genres):
    item_features = np.zeros((len(item_dict.keys()), num_genres), dtype=np.float32)
    for movie_id, g_vec in zip(movie_df['item'].values.tolist(), movie_df[genre_headers].values.tolist()):
        # check if movie_id was listed in ratings file and therefore in mapping dictionary
        if movie_id in item_dict.keys():
            item_features[item_dict[movie_id], :] = g_vec
    return item_features

In [36]:

movie_df, genre_headers, num_genres = get_movies_data(filepath='data/movies.csv', separator=r',', movies_columns_to_drop=['genres'])

dtypes = {
    'user': np.int32, 'item': np.int32,
    'rating': np.float32, 'timestamp': np.float64}

data_train = get_ratings_data(filepath='data/train.csv', separator=r',', dtypes=dtypes)
data_val = get_ratings_data(filepath='data/validate.csv', separator=r',', dtypes=dtypes)
data_test = get_ratings_data(filepath='data/test.csv', separator=r',', dtypes=dtypes)

data_array_train = np.array(data_train.values.tolist())
data_array_val = np.array(data_val.values.tolist())
data_array_test = np.array(data_test.values.tolist())

class_values = np.array([0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5], dtype=dtypes['rating'])

train_adjacency_mx, train_labels, train_user_idx, train_item_idx, train_item_dict = preprocess_data_to_graph(data_array_train, dtypes=dtypes, class_values=class_values)
train_item_features = sp.csr_matrix(get_movies_features(movie_df, train_item_dict, genre_headers, num_genres))
val_adjacency_mx, val_labels, val_user_idx, val_item_idx, val_item_dict = preprocess_data_to_graph(data_array_val, dtypes=dtypes, class_values=class_values)
val_item_features = sp.csr_matrix(get_movies_features(movie_df, val_item_dict, genre_headers, num_genres))
test_adjacency_mx, test_labels, test_user_idx, test_item_idx, test_item_dict = preprocess_data_to_graph(data_array_test, dtypes=dtypes, class_values=class_values)
test_item_features = sp.csr_matrix(get_movies_features(movie_df, test_item_dict, genre_headers, num_genres))

print("Train item features shape: "+str(train_item_features.shape))
print("Validation item features shape: "+str(val_item_features.shape))
print("Test item features shape: "+str(test_item_features.shape))


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x11cacd790>
Traceback (most recent call last):
  File "/Users/user/Documents/Nauka/Studia/Magisterskie/Magisterka/Recommender/.venv/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1466, in __del__
    self._shutdown_workers()
  File "/Users/user/Documents/Nauka/Studia/Magisterskie/Magisterka/Recommender/.venv/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1424, in _shutdown_workers
    if self._persistent_workers or self._workers_status[worker_id]:
AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status'
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python

Train item features shape: (56232, 18)
Validation item features shape: (29750, 18)
Test item features shape: (38341, 18)


In [37]:
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch.optim import Adam
from torch_geometric.data import DataLoader
from torch_geometric.nn import RGCNConv
from torch_geometric.utils import dropout_adj
from IGMC.util_functions import *
from IGMC.data_utils import *
from IGMC.preprocessing import *

In [38]:
train_dataset = eval('MyDynamicDataset')(root='data_test/processed/train', A=train_adjacency_mx, 
    links=(train_user_idx, train_item_idx), labels=train_labels, h=1, sample_ratio=1.0, 
    max_nodes_per_hop=200, u_features=None, v_features=train_item_features, class_values=class_values)
test_dataset = eval('MyDataset')(root='data_test/processed/test', A=test_adjacency_mx, 
    links=(test_user_idx, test_item_idx), labels=test_labels, h=1, sample_ratio=1.0, 
    max_nodes_per_hop=200, u_features=None, v_features=test_item_features, class_values=class_values)

In [15]:
import torch
import math
import torch.nn.functional as F
from torch.nn import Linear
from torch_geometric.nn import RGCNConv
from torch_geometric.utils import dropout_adj

class IGMC(torch.nn.Module):
    def __init__(self):
        super(IGMC, self).__init__()
        self.rel_graph_convs = torch.nn.ModuleList()
        self.rel_graph_convs.append(RGCNConv(in_channels=4, out_channels=32,\
                                             num_relations=5, num_bases=4))
        self.rel_graph_convs.append(RGCNConv(in_channels=32, out_channels=32\
                                             , num_relations=5, num_bases=4))
        self.rel_graph_convs.append(RGCNConv(in_channels=32, out_channels=32,\
                                             num_relations=5, num_bases=4))
        self.rel_graph_convs.append(RGCNConv(in_channels=32, out_channels=32,\
                                             num_relations=5, num_bases=4))
        self.linear_layer1 = Linear(256, 128)
        self.linear_layer2 = Linear(128, 1)

    def reset_parameters(self):
        self.linear_layer1.reset_parameters()
        self.linear_layer2.reset_parameters()
        for i in self.rel_graph_convs:
            i.reset_parameters()

    def forward(self, data):
        num_nodes = len(data.x)
        edge_index_dr, edge_type_dr = dropout_adj(data.edge_index, data.edge_type,\
                                p=0.2, num_nodes=num_nodes, training=self.training)

        out = data.x
        h = []
        for conv in self.rel_graph_convs:
            out = conv(out, edge_index_dr, edge_type_dr)
            out = torch.tanh(out)
            h.append(out)
        h = torch.cat(h, 1)
        h = [h[data.x[:, 0] == True], h[data.x[:, 1] == True]]
        g = torch.cat(h, 1)
        out = self.linear_layer1(g)
        out = F.relu(out)
        out = F.dropout(out, p=0.5, training=self.training)
        out = self.linear_layer2(out)
        out = out[:,0]
        return out

model = IGMC()


In [None]:
training_ratings_graph.loc[:5]

In [39]:
LR = 1e-3
EPOCHS = 5
BATCH_SIZE = 50
LR_DECAY_STEP = 20
LR_DECAY_VALUE = 10

train_loader = DataLoader(train_dataset, BATCH_SIZE, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, BATCH_SIZE, shuffle=False, num_workers=2)

device = torch.device("cpu")
model.to(device)
model.reset_parameters()
optimizer = Adam(model.parameters(), lr=LR, weight_decay=0)



In [None]:
# 1 epoka by trwala 43,5h na CPU
loss_through_epochs = []
batches_per_epoch = len(train_loader)
for epoch in range(1, EPOCHS+1):
    model.train()
    train_loss_all = 0
    for i, train_batch in enumerate(train_loader):
        print(f"{i}/{batches_per_epoch}")
        optimizer.zero_grad()
        train_batch = train_batch.to(device)
        y_pred = model(train_batch)
        y_true = train_batch.y
        train_loss = F.mse_loss(y_pred, y_true)
        train_loss.backward()
        train_loss_all += BATCH_SIZE * float(train_loss)
        optimizer.step()
        torch.cuda.empty_cache()
    train_loss_all = train_loss_all / len(train_loader.dataset)
    loss_through_epochs.append(train_loss_all)
    print('epoch', epoch,'; train loss', train_loss_all)

    if epoch % LR_DECAY_STEP == 0:
      for param_group in optimizer.param_groups:
          param_group['lr'] = param_group['lr'] / LR_DECAY_VALUE

In [41]:
a = [1.5614034219933284, 1.2698813516058196, 1.2424809017183027, 1.2205315930388208, 1.2079426253976078]
print("Loss from first 5 epochs on 1/100th the dataset and without the feature matrix\n", a)

Loss from first 5 epochs on 1/100th the dataset and without the feature matrix
 [1.5614034219933284, 1.2698813516058196, 1.2424809017183027, 1.2205315930388208, 1.2079426253976078]


In [None]:
batches_in_eval = len(test_loader)
model.eval()
test_loss = 0
for i, test_batch in enumerate(test_loader):
    print(f"{i}/{batches_in_eval}")
    test_batch = test_batch.to(device)
    with torch.no_grad():
        y_pred = model(test_batch)
    y_true = test_batch.y
    test_loss += F.mse_loss(y_pred, y_true, reduction='sum')
    # torch.cuda.empty_cache()
mse_loss = float(test_loss) / len(test_loader.dataset)

print('test MSE loss', mse_loss)
print('test RMSE loss', math.sqrt(mse_loss))