In [835]:
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import scipy.sparse as sp
from sklearn.metrics import ndcg_score
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import torch

pd.options.mode.chained_assignment = None

%config InlineBackend.figure_format='retina'

In [836]:
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f347f0e2c50>

### Метрики

In [837]:
def recall_at_k(pred_items: np.array, true_items: np.array):
    assert pred_items.shape[0] == true_items.shape[0]
    recall_list = []
    for i_pred, i_true in zip(pred_items, true_items):
        try:
            hits = len(np.intersect1d(i_pred, i_true))
            recall = hits / len(i_true)
            recall_list.append(recall)
        except ZeroDivisionError:
            print(len(recall_list), i_true)
    return np.mean(recall_list)

def precision_at_k(pred_items: np.array, true_items: np.array):
    assert pred_items.shape[0] == true_items.shape[0]
    precision_list = []
    for i_pred, i_true in zip(pred_items, true_items):
        try:
            hits = len(np.intersect1d(i_pred, i_true))
            prec = hits / len(i_pred)
            precision_list.append(prec)
        except ZeroDivisionError:
            print(len(precision_list), i_true)
    return np.mean(precision_list)

def ndcg_at_k(ranks, recs, interactions):
    assert ranks.shape[0] == recs.shape[0] == interactions.shape[0]
    scores_list = []
    for u in range(interactions.shape[0]):
        score = ndcg_score(interactions[u, recs[u]].toarray(), [ranks[u]])
        scores_list.append(score)
    return np.mean(scores_list)

In [666]:
def get_k_core(df, core=20):
    df = df.clone()
    for _ in tqdm(range(50)):
        gpb = df.groupby('user_id')['rating'].count()
        if gpb.min() < core:
            print('user min = ', gpb.min())
            core_k_users = gpb[gpb >= core].index.values
            df = df[df['user_id'].isin(core_k_users)]
            gpb = df.groupby('item_id')['rating'].count()
            core_k_items = gpb[gpb >= core].index.values
            df = df[df['item_id'].isin(core_k_items)]
        else:
            return df

# Top Popular

## Amazon Books

In [856]:
K = 20
vc = amazon._df_train['item_id'].value_counts().sort_index().values

all_ranks = torch.Tensor(np.tile(vc, (amazon.n_test_users, 1)))
all_ranks[amazon.observed_interactions.nonzero()] = 0  # exclude seen items

ranks, recs = ranks.topk(20, dim=1)

In [907]:
y_true = np.array([amazon.future_interactions[i].nonzero()[1] for i in range(amazon.n_test_users)], dtype='object')

print('====== ALS ======')
print(f'Precision = {precision_at_k(recs, y_true).round(5)}')
print(f'Recall = {recall_at_k(recs, y_true).round(5)}')
print(f'NDCG = {ndcg_at_k(ranks.numpy(), recs, amazon.future_interactions).round(5)}')

Precision = 0.02594
Recall = 0.01203
NDCG = 0.12779


## MovieLens1M

In [919]:
K = 20
vc = mov._df_train['item_id'].value_counts().sort_index().values

all_ranks = torch.Tensor(np.tile(vc, (mov.n_test_users, 1)))
all_ranks[mov.observed_interactions.nonzero()] = 0  # exclude seen items

ranks, recs = all_ranks.topk(20, dim=1)

In [924]:
y_true = np.array([mov.future_interactions[i].nonzero()[1] for i in range(mov.n_test_users)], dtype='object')

print('====== ALS ======')
print(f'Precision = {precision_at_k(recs, y_true).round(5)}')
print(f'Recall = {recall_at_k(recs, y_true).round(5)}')
print(f'NDCG = {ndcg_at_k(ranks.numpy(), recs, mov.future_interactions).round(5)}')

Precision = 0.41393
Recall = 0.08367
NDCG = 0.72713


# ALS

In [165]:
from implicit.als import AlternatingLeastSquares



## Amazon Books

In [834]:
class AlsDataset:
    def __init__(self, path, **kwargs):
        self.path = path
        self.df = self.load_dataset(**kwargs)

    def load_dataset(self, **kwargs):
        df = pd.read_csv(self.path, **kwargs)

        df.columns = ('user_id', 'item_id', 'rating')
        df['rating'] = 1
        all_possible_connections = df['user_id'].nunique() * df['item_id'].nunique()
        print(f"Sparsity = {100 * round(1 - df.shape[0] / all_possible_connections, 5)}%")
        n_users, n_items = df[['user_id', 'item_id']].nunique()
        print(f"Users: {n_users}")
        print(f"Items: {n_items}")
        return df


    def train_test_split(self, test_ratio=0.25, observed_ratio=0.1):
        df = self.df
        all_users = self.df['user_id'].unique()
        test_size = int(len(all_users) * test_ratio)

        # Формируем выборку пользователей для теста
        test_users = np.random.choice(a=all_users, size=test_size, replace=False)
        df_train = df[~df['user_id'].isin(test_users)]
        df_test = df[df['user_id'].isin(test_users)]
        df_test = df_test[df_test['item_id'].isin(df_train['item_id'].unique())]

        # Энкодим айдишники айтемов
        encoder = LabelEncoder()
        df_train['user_id'] = encoder.fit_transform(df_train['user_id'])
        df_train['item_id'] = encoder.fit_transform(df_train['item_id'])
        df_test['item_id'] = encoder.transform(df_test['item_id'])

        # Делим интеракции новых пользователей на виденные и будущие
        test_observed, test_future = train_test_split(df_test, test_size=1-observed_ratio, random_state=42)
        common_users = np.intersect1d(test_observed['user_id'], test_future['user_id'])
        test_observed = test_observed[test_observed['user_id'].isin(common_users)]
        test_future = test_future[test_future['user_id'].isin(common_users)]
        test_observed['user_id'] = encoder.fit_transform(test_observed['user_id'])
        test_future['user_id'] = encoder.transform(test_future['user_id'])

        self._df_train = df_train
        self._test_observed = test_observed
        self._test_future = test_future
        self.n_items = df_train['item_id'].nunique()
        self.n_train_users = df_train['user_id'].nunique()
        self.n_test_users = test_observed['user_id'].nunique()

    def build_sparse_interaction_matrix(self):
        # n_train_users = self.df_train['user_id'].nunique()
        self.train_interactions = sp.csr_matrix((self._df_train['rating'], (self._df_train['user_id'], self._df_train['item_id'])))
        self.observed_interactions = sp.csr_matrix((self._test_observed['rating'],
                                                    (self._test_observed['user_id'], self._test_observed['item_id'])),
                                                   shape=(self._test_observed['user_id'].nunique(), self.n_items)
                                                   )
        self.future_interactions = sp.csr_matrix((self._test_future['rating'],
                                                  (self._test_future['user_id'], self._test_future['item_id'])),
                                                 shape=(self._test_future['user_id'].nunique(), self.n_items)
                                                 )

In [167]:
amazon = AlsDataset('amazon_20_core.csv.gz', usecols=['user_id', 'item_id', 'rating'])

Sparsity = 99.856%


user_id    35736
item_id    38121
rating         1
dtype: int64

In [168]:
amazon.train_test_split(test_ratio=0.3, observed_ratio=0.1)
amazon.build_sparse_interaction_matrix()
amazon.train_interactions, amazon.observed_interactions, amazon.future_interactions

(<25016x38121 sparse matrix of type '<class 'numpy.longlong'>'
 	with 1369925 stored elements in Compressed Sparse Row format>,
 <10252x38121 sparse matrix of type '<class 'numpy.longlong'>'
 	with 59074 stored elements in Compressed Sparse Row format>,
 <10252x38121 sparse matrix of type '<class 'numpy.longlong'>'
 	with 520114 stored elements in Compressed Sparse Row format>)

In [169]:
als = AlternatingLeastSquares(256, iterations=5, regularization=0.001,
                              calculate_training_loss=True, random_state=42, num_threads=2)

als.fit(amazon.train_interactions, show_progress=True)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




### Train

In [170]:
K = 20
recs, ranks = np.array(als.recommend(userid=np.arange(amazon.n_train_users),
                              filter_already_liked_items=False,
                              user_items=amazon.train_interactions, N=K))
y_true = np.array([amazon.train_interactions[i].nonzero()[1] for i in range(amazon.n_train_users)], dtype='object')

print('====== ALS ======')
print(f'Precision = {precision_at_k(recs, y_true).round(5)}')
print(f'Recall = {recall_at_k(recs, y_true).round(5)}')
print(f'NDCG = {ndcg_at_k(ranks, recs, amazon.train_interactions).round(5)}')

Precision = 0.37797
Recall = 0.18027
NDCG = 0.81906


### Test

In [171]:
als.partial_fit_users(userids=np.arange(amazon.n_train_users, amazon.n_train_users + amazon.n_test_users),
                      user_items=amazon.observed_interactions)

In [172]:
K = 20
recs, ranks = np.array(als.recommend(userid=np.arange(amazon.n_train_users, amazon.n_train_users + amazon.n_test_users),
                                     filter_already_liked_items=True,
                                     user_items=amazon.observed_interactions, N=K))
y_true = np.array([amazon.future_interactions[i].nonzero()[1] for i in range(amazon.n_test_users)], dtype='object')

print('====== ALS ======')
print(f'Precision = {precision_at_k(recs, y_true).round(5)}')
print(f'Recall = {recall_at_k(recs, y_true).round(5)}')
print(f'NDCG = {ndcg_at_k(ranks, recs, amazon.future_interactions).round(5)}')

Precision = 0.13436
Recall = 0.06331
NDCG = 0.42584


## MovieLens1M

In [173]:
mov = AlsDataset('ml-1m/ratings.dat', usecols=[0, 1, 2], header=None, sep='::', engine='python')
mov.df.nunique()

Sparsity = 95.532%


user_id    6040
item_id    3706
rating        1
dtype: int64

In [174]:
mov.train_test_split(test_ratio=0.3, observed_ratio=0.1)
mov.build_sparse_interaction_matrix()
mov.train_interactions, mov.observed_interactions, mov.future_interactions

(<4228x3673 sparse matrix of type '<class 'numpy.longlong'>'
 	with 697548 stored elements in Compressed Sparse Row format>,
 <1786x3673 sparse matrix of type '<class 'numpy.longlong'>'
 	with 30262 stored elements in Compressed Sparse Row format>,
 <1786x3673 sparse matrix of type '<class 'numpy.longlong'>'
 	with 271672 stored elements in Compressed Sparse Row format>)

In [175]:
als = AlternatingLeastSquares(32, iterations=5, regularization=0.1,
                              calculate_training_loss=True, random_state=42, num_threads=2)

als.fit(mov.train_interactions, show_progress=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




### Train

In [176]:
K = 20
recs, ranks = np.array(als.recommend(userid=np.arange(mov.n_train_users),
                              filter_already_liked_items=False,
                              user_items=mov.train_interactions, N=K))
y_true = np.array([mov.train_interactions[i].nonzero()[1] for i in range(mov.n_train_users)], dtype='object')

print('====== ALS ======')
print(f'Precision = {precision_at_k(recs, y_true).round(5)}')
print(f'Recall = {recall_at_k(recs, y_true).round(5)}')
print(f'NDCG = {ndcg_at_k(ranks, recs, mov.train_interactions).round(5)}')

Precision = 0.75088
Recall = 0.18346
NDCG = 0.9406


### Test

In [177]:
als.partial_fit_users(userids=np.arange(mov.n_train_users, mov.n_train_users + mov.n_test_users),
                      user_items=mov.observed_interactions)

In [179]:
K = 20
recs, ranks = np.array(als.recommend(userid=np.arange(mov.n_train_users, mov.n_train_users + mov.n_test_users),
                                     filter_already_liked_items=True,
                                     user_items=mov.observed_interactions, N=K))
y_true = np.array([mov.future_interactions[i].nonzero()[1] for i in range(mov.n_test_users)], dtype='object')

print('====== ALS ======')
print(f'Precision = {precision_at_k(recs, y_true).round(5)}')
print(f'Recall = {recall_at_k(recs, y_true).round(5)}')
print(f'NDCG = {ndcg_at_k(ranks, recs, mov.future_interactions).round(5)}')

Precision = 0.5245
Recall = 0.11568
NDCG = 0.80066


# GF-CF

In [180]:
from sparsesvd import sparsesvd

In [181]:
class GF_CF(object):
    def __init__(self, train_matrix):
        self.train_matrix = train_matrix

    def fit(self, dim=16):
        R = self.train_matrix
        rowsum = np.array(R.sum(axis=1))
        d_inv = np.power(rowsum, -0.5).flatten()
        d_inv[np.isinf(d_inv)] = 0.
        d_mat = sp.diags(d_inv)
        R_norm = d_mat @ R

        colsum = np.array(R.sum(axis=0))
        d_inv = np.power(colsum, -0.5).flatten()
        d_inv[np.isinf(d_inv)] = 0.
        d_mat = sp.diags(d_inv)
        self.d_mat_i = d_mat
        self.d_mat_i_inv = sp.diags(1 / d_inv)
        R_norm = R_norm @ d_mat
        self.R_norm = R_norm.tocsc()
        ut, s, self.vt = sparsesvd(self.R_norm, dim)

    def predict(self, new_ratings: sp.coo_matrix):
        R_norm = self.R_norm
        # new_ratings = new_ratings.todense()
        U_2 = new_ratings @ (R_norm.T @ R_norm)
        U_1 = new_ratings @  (self.d_mat_i @ self.vt.T @ self.vt @ self.d_mat_i_inv)
        predict = U_2 + U_1
        predict[np.isnan(predict)] = 0
        return predict

    def recommend_top_k(self, interactions: sp.coo_matrix, k=20):
        """
        Предполагается нумерация айтемов с 0, иначе не будет работать argsort
        """
        ranks = self.predict(interactions)
        ranks[interactions.nonzero()] = -1e5  # exclude seen items
        return np.asarray(np.argsort(-ranks, axis=1)[:, :k])

### Amazon Books

In [182]:
#!c1.8
%%time
K = 20

gf_cf = GF_CF(amazon.train_interactions)
gf_cf.fit(64)

CPU times: user 2.63 s, sys: 632 ms, total: 3.26 s
Wall time: 10.8 s


In [183]:
#!c1.8
%%time
ranks = gf_cf.predict(amazon.observed_interactions)
recs = gf_cf.recommend_top_k(amazon.observed_interactions, K)

CPU times: user 4min 13s, sys: 1min 59s, total: 6min 12s
Wall time: 5min 12s


In [185]:
#!c1.8
%%time
# VALUATION
y_true = np.array([amazon.future_interactions[i].nonzero()[1] for i in range(amazon.n_test_users)], dtype='object')
print('====== GF-CF ======')
print(f'Precision = {precision_at_k(recs, y_true).round(5)}')
print(f'Recall = {recall_at_k(recs, y_true).round(5)}')
print(f'NDCG = {ndcg_score(np.array(ranks), np.array(amazon.future_interactions.todense())).round(5)}')

Precision = 0.15146
Recall = 0.07636
NDCG = 0.63409
CPU times: user 1min 48s, sys: 10.6 s, total: 1min 59s
Wall time: 1min 56s


In [191]:
#pragma async
#!c1.8

# %%time
# VALUATION
y_true = np.array([amazon.future_interactions[i].nonzero()[1] for i in range(amazon.n_test_users)], dtype='object')
print('====== GF-CF ======')
print(f'Precision = {precision_at_k(recs, y_true).round(5)}')
print(f'Recall = {recall_at_k(recs, y_true).round(5)}')
print(f'NDCG = {ndcg_score(np.array(ranks), np.array(amazon.future_interactions.todense())).round(5)}')

Background cell scheduled. Waiting for foreground cells to finish commits
Preparing c1.8 instance...
c1.8 instance is ready, running task...
Precision = 0.15146
Recall = 0.07636
NDCG = 0.63409
Task is done, waiting for foreground cells to finish...
Merging task result to the state


### MovieLens1M

In [193]:
#pragma async
#!c1.4
K = 20

gf_cf = GF_CF(mov.train_interactions)
gf_cf.fit(64)
ranks = gf_cf.predict(mov.observed_interactions)
recs = gf_cf.recommend_top_k(mov.observed_interactions, K)

# VALUATION
y_true = np.array([mov.future_interactions[i].nonzero()[1] for i in range(mov.n_test_users)], dtype='object')
# print(f'NDGC score = {ndcg_score(np.array(ranks), np.array(mov.future_interactions.todense())).round(5)}')
print('====== GF-CF ======')
print(f'Precision = {precision_at_k(recs, y_true).round(5)}')
print(f'Recall = {recall_at_k(recs, y_true).round(5)}')
print(f'NDCG = {ndcg_score(np.array(ranks), np.array(mov.future_interactions.todense())).round(5)}')

Background cell scheduled. Waiting for foreground cells to finish commits
Preparing c1.4 instance...
c1.4 instance is ready, running task...
Precision = 0.5741
Recall = 0.13183
NDCG = 0.76224
Task is done, waiting for foreground cells to finish...
Merging task result to the state


# LGCN-E

In [753]:
import torch

In [225]:
torch.version.cuda

'10.2'

In [224]:
torch.__version__

'1.11.0+cu102'

In [579]:
%pip uninstall torch_spline_conv

Found existing installation: torch-spline-conv 1.2.1
Uninstalling torch-spline-conv-1.2.1:
  Successfully uninstalled torch-spline-conv-1.2.1
[31mERROR: Exception:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/cli/base_command.py", line 164, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/commands/uninstall.py", line 102, in run
    uninstall_pathset.commit()
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/req/req_uninstall.py", line 435, in commit
    self._moved_paths.commit()
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/req/req_uninstall.py", line 288, in commit
    save_dir.cleanup()
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/utils/temp_dir.py", line 173, in cleanup
    rmtree(self._path)
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/tenacity/__init__.py", line 326, in wrapped_f
    return self(f, *args, **kw

In [572]:
%pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.11.0+cu102.html

Defaulting to user installation because normal site-packages is not writeable
Looking in links: https://data.pyg.org/whl/torch-1.11.0+cu102.html
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m


In [580]:
%pip install torch_spline_conv -f https://pytorch-geometric.com/whl/torch-1.11.0+cu102.html

Defaulting to user installation because normal site-packages is not writeable
Looking in links: https://pytorch-geometric.com/whl/torch-1.11.0+cu102.html
Collecting torch_spline_conv
  Downloading https://data.pyg.org/whl/torch-1.11.0%2Bcu102/torch_spline_conv-1.2.1-cp38-cp38-linux_x86_64.whl (672 kB)
     |████████████████████████████████| 672 kB 286 kB/s            
[?25hInstalling collected packages: torch-spline-conv
Successfully installed torch-spline-conv-1.2.1
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m


Detected that PyTorch and torch_cluster were compiled with different CUDA versions. 

PyTorch has CUDA version 10.2 and torch_cluster has CUDA version 11.3. 

Please reinstall the torch_cluster that matches your PyTorch install.

In [754]:
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import degree, negative_sampling

In [755]:
import wandb
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

In [756]:
class SimpleProp(MessagePassing):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, x, edge_index, size):
        return self.propagate(edge_index, x=x, size=size)

    def message(self, x_j):
        return x_j
    

class LGCN_U(torch.nn.Module):
    def __init__(self, n_users, emb_dim, alphas=(1, 1), normalize=False, **kwargs):
        super().__init__()
        self.m = n_users
        self.normalize = normalize
        self.alphas = alphas
        self.U_0 = torch.nn.Embedding(self.m, emb_dim)
        self.layer_1 = SimpleProp(**kwargs)  # E_1 = R.T @ U
        self.layer_2 = SimpleProp(**kwargs)  # U_2 = R @ E_1
        self.layer_3 = SimpleProp(**kwargs)  # E_3 = R @ U_2

    def get_embeddings(self, edge_index, n=None):
        """
        Propagate messages through the graph and return fused final embeddings of users ans items: U, E.

        Parameters:
        ----------
        edge_index : Tensor
            Edge tensor specifying the connectivity of the graph
        n : int (optional)
            Number of items

        Returns:
        -------
            U : Tensor
                User embeddings of shape (m, emb_dim)
            E : Tensor
                Item embeddings of shape (n, emb_dim)
        """
        if n is None:
            n = edge_index[1].max().item() + 1
        m = self.m

        if self.normalize:
            inv_sqrt_user_degrees = degree(edge_index[0]).pow(-0.5)
            inv_sqrt_item_degrees = degree(edge_index[1]).pow(-0.5)
            # inv_norm = inv_sqrt_user_degrees * inv_sqrt_item_degrees
        else:
            inv_sqrt_item_degrees = torch.Tensor(1)
            inv_sqrt_user_degrees = torch.Tensor(1)

        E_1 = self.layer_1.forward(self.U_0.weight, edge_index=edge_index, size=(m, n)) * inv_sqrt_item_degrees.view(-1, 1)
        U_2 = self.layer_2.forward(E_1, edge_index[[1, 0], :], size=(n, m)) * inv_sqrt_user_degrees.view(-1, 1)
        E_3 = self.layer_3.forward(U_2, edge_index, size=(m, n)) * inv_sqrt_item_degrees.view(-1, 1)

        E = 0.5 * (E_1 + E_3)
        U = 0.5 * (self.U_0.weight + U_2)
        return U, E

    def forward(self, edge_index, edge_label_index, n_items=None):
        """
        Computes rankings for pairs of nodes using learned user embeddings.

        Parameters
        ----------
        edge_index: Tensor
            Edge tensor specifying the connectivity of the graph
        edge_label_index: Tensor, optional
            Edge tensor specifying the node pairs for which to compute rankings or probabilities
        n_items: int (optional)
            Number of items

        Returns
        -------
        scores : Tensor
            Scores of edges of shape (edge_index_label, ).
        """
        if edge_label_index is None:
            edge_label_index = edge_index

        U, E = self.get_embeddings(edge_index, n=n_items)
        src = U[edge_label_index[0]]
        dst = E[edge_label_index[1]]
        return (src * dst).sum(dim=-1)

    def recommend_top_k(self, n):
        pass

### Amazon Books

In [757]:
class GraphDataset(AlsDataset):
    def build_interaction_graph(self):
        if not hasattr(self, '_df_train'):
            raise AttributeError('Perform train_test_split() before build_interaction_graph()')

        pos_edges = torch.LongTensor(self._df_train[['item_id', 'user_id']].values.T)

        test_observed_edges = torch.LongTensor(self._test_observed[['item_id', 'user_id']].values.T)
        test_observed_edges[1] += self.n_train_users

        test_future_edges = torch.LongTensor(self._test_future[['item_id', 'user_id']].values.T)
        test_future_edges[1] += self.n_train_users

        self.pos_edges = pos_edges
        self.test_observed_edges = test_observed_edges
        self.test_future_edges  = test_future_edges
        self.n_test_users = len(test_observed_edges[1].unique())

    def train_val_split(self, val_ratio=0.1):
        """
        Split edges so that all users presence both in train in validation splits.
        Cut specified ratio of edges for each user, providing balanced partitioning.

        Parameters:
        ----------
        val_ratio: float
            Share of validation edges in all positive edges of the given graph

        Returns:
        -------
        train_edges: LongTensor
        val_edges: LongTensor
        """
        torch.manual_seed(42)
        val_edges = []
        train_edges = []
        for user in self.pos_edges[0].unique():
            edges = self.pos_edges[:, self.pos_edges[0] == user]
            n_items = edges.shape[1]
            edges = edges[:, torch.randperm(n_items)]
            if n_items == 1:
                thr = 1
            else:
                thr = int(n_items * (1-val_ratio))
            train_edges.append(edges[:, :thr])
            val_edges.append(edges[:, thr:])

        val_edges = torch.cat(val_edges, dim=1)
        train_edges = torch.cat(train_edges, dim=1)
        # return train_edges, val_edges
        self.train_edges = train_edges
        self.val_edges = val_edges

In [812]:
def train_amazon(n_epoch):
    for i in tqdm(range(n_epoch)):
        opt.zero_grad()
        neg_train_edges = negative_sampling(train_edges, num_neg_samples=train_edges.shape[1], num_nodes=(model.m, amazon.n_train_users)).to(device)
        neg_val_edges = negative_sampling(pos_edges, num_neg_samples=val_edges.shape[1], num_nodes=(model.m, amazon.n_train_users)).to(device)

        train_scores = model.forward(edge_index=train_edges, edge_label_index=train_edges, n_items=amazon.n_train_users)
        neg_scores = model.forward(edge_index=train_edges, edge_label_index=neg_train_edges, n_items=amazon.n_train_users)

        val_scores = model.forward(edge_index=train_edges, edge_label_index=val_edges, n_items=amazon.n_train_users)
        neg_val_scores = model.forward(edge_index=train_edges, edge_label_index=neg_val_edges, n_items=amazon.n_train_users)

        # ==== TEST ====
        neg_test_edges = negative_sampling(_tfe, num_nodes=(model.m, amazon._test_observed['user_id'].nunique()))
        neg_test_edges[1] += amazon.n_train_users

        test_scores = model.forward(edge_index=torch.cat([pos_edges, test_observed_edges], dim=1),
                                    edge_label_index=test_future_edges, n_items=test_observed_edges[1].max().item() + 1)

        neg_test_scores = model.forward(edge_index=torch.cat([pos_edges, test_observed_edges], dim=1),
                                       edge_label_index=neg_test_edges, n_items=test_observed_edges[1].max().item() + 1)

        train_loss = torch.mean(F.softplus(neg_scores - train_scores))
        val_loss = torch.mean(F.softplus(neg_val_scores - val_scores))
        test_loss = torch.mean(F.softplus(neg_test_scores - test_scores))

        train_acc = (train_scores > neg_scores).float().mean()
        val_acc = (val_scores > neg_val_scores).float().mean()
        test_acc = (test_scores > neg_test_scores).float().mean()

#         if i % 3 == 0:
#             print(f'Train loss = {train_loss}, val_loss = {val_loss}, test_loss = {test_loss}, test_acc={test_acc}')
        train_loss.backward()
        opt.step()
        wandb.log({'loss': {'train': train_loss, 'val': val_loss, 'test': test_loss}, 
                   'acc': {'train': train_acc, 'val': val_acc, 'test': test_acc}})

In [759]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [760]:
amazon = GraphDataset('amazon_20_core.csv.gz', usecols=['user_id', 'item_id', 'rating'])
# print(amazon.df.nunique())

Sparsity = 99.856%
Users: 35736,    Items: 38121


In [761]:
%%time
amazon.train_test_split()
amazon.build_sparse_interaction_matrix()
amazon.build_interaction_graph()
amazon.train_val_split()

CPU times: user 2min 34s, sys: 592 ms, total: 2min 34s
Wall time: 1min 17s


In [None]:
model.m, (amazon.n_train_users, amazon.n_test_users)

In [800]:
#!g1.1
for obj in gc.get_objects():
    try:
        if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
            print(type(obj), obj.size())
    except:
        pass

<class 'torch.Tensor'> torch.Size([2, 1466055])
<class 'torch.Tensor'> torch.Size([2, 1302808])
<class 'torch.Tensor'> torch.Size([2, 163247])
<class 'torch.Tensor'> torch.Size([2, 49461])
<class 'torch.Tensor'> torch.Size([2, 436309])
<class 'torch.Tensor'> torch.Size([2, 436309])
<class 'torch.Tensor'> torch.Size([2, 1466055])
<class 'torch.Tensor'> torch.Size([2, 49461])
<class 'torch.Tensor'> torch.Size([2, 436309])
<class 'torch.Tensor'> torch.Size([2, 1302808])
<class 'torch.Tensor'> torch.Size([2, 163247])
<class 'torch.Tensor'> torch.Size([38121, 512])
<class 'torch.Tensor'> torch.Size([38121, 512])
<class 'torch.Tensor'> torch.Size([38121, 512])
<class 'torch.nn.parameter.Parameter'> torch.Size([38121, 512])
<class 'torch.Tensor'> torch.Size([19])


  if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):


In [818]:
#!g1.1
del model, new_scores, _U, _E
gc.collect()
torch.cuda.empty_cache()

In [821]:
#!g1.1
!nvidia-smi

Fri Jun 24 22:41:12 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.91.03    Driver Version: 460.91.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:8C:00.0 Off |                    0 |
| N/A   42C    P0    48W / 300W |      4MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [827]:
#!g1.1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

pos_edges = amazon.pos_edges.to(device)
train_edges = amazon.train_edges.to(device)
val_edges = amazon.val_edges.to(device)
test_observed_edges = amazon.test_observed_edges.to(device)
test_future_edges = amazon.test_future_edges.to(device)
_tfe = test_future_edges.clone().to(device)
_tfe[1] -= amazon.n_train_users

config = dict(lr = 3e-4, 
              emb_dim = 256,
              device = device
             )
wandb.init(project="course_work", entity="ilyaind", config=config, reinit=True)
model = LGCN_U(n_users=amazon.n_items, emb_dim=config['emb_dim'], normalize=True).to(device)
opt = torch.optim.Adam(model.parameters(), lr=config['lr'])
train_amazon(250)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33milyaind[0m. Use [1m`wandb login --relogin`[0m to force relogin


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=250.0), HTML(value='')))




In [None]:
#!g1.1
with torch.no_grad():
    _U, _E = model.get_embeddings(torch.cat([pos_edges, test_observed_edges], dim=1))
    # _U, _E = model.get_embeddings(pos_edges)
print(_U.shape, _E.shape)

new_scores = _E @ _U.T
new_scores[test_observed_edges[1], test_observed_edges[0]] = -1e5  # exclude seen items
new_scores = new_scores[amazon.n_train_users:]
print(new_scores.shape)

ranks, recs = new_scores.topk(20, dim=1)
print(recs.shape)

y_true = amazon._test_future.groupby('user_id')['item_id'].unique().values
# y_true = mov.train.groupby('user_id')['item_id'].unique().values
print('===== LGCN-E =====')
print(f'Precsion = {precision_at_k(recs.cpu(), y_true).round(5)}')
print(f'Recall = {recall_at_k(recs.cpu(), y_true).round(5)}')
print(f'NDCG = {ndcg_at_k(ranks.cpu().numpy(), recs.cpu(), amazon.future_interactions).round(5)}')

#!g1.1
wandb.run.summary["test_precision"] = precision_at_k(recs.cpu(), y_true)
wandb.run.summary["test_recall"] = recall_at_k(recs.cpu(), y_true)

In [829]:
#!g1.1
y_true = amazon._test_future.groupby('user_id')['item_id'].unique().values
# y_true = mov.train.groupby('user_id')['item_id'].unique().values
print('===== LGCN-E =====')
print(f'Precsion = {precision_at_k(recs.cpu(), y_true).round(5)}')
print(f'Recall = {recall_at_k(recs.cpu(), y_true).round(5)}')
print(f'NDCG = {ndcg_at_k(ranks.cpu().numpy(), recs.cpu(), amazon.future_interactions).round(5)}')

#!g1.1
wandb.run.summary["test_precision"] = precision_at_k(recs.cpu(), y_true)
wandb.run.summary["test_recall"] = recall_at_k(recs.cpu(), y_true)

===== LGCN-E =====
Precsion = 0.12641
Recall = 0.06218
NDCG = 0.41926


```
====== ALS ======
Precision = 0.13436
Recall = 0.06331
NDCG = 0.42584

====== GF-CF ======
Precision = 0.15146
Recall = 0.07636
NDCG = 0.63409
```

## MovieLens 1M

In [490]:
mov = GraphDataset('ml-1m/ratings.dat', usecols=[0, 1, 2], header=None, sep='::', engine='python')

Sparsity = 95.532%


In [491]:
mov.train_test_split()
mov.build_sparse_interaction_matrix()
mov.build_interaction_graph()
mov.train_val_split()

pos_edges = mov.pos_edges
train_edges = mov.train_edges
val_edges = mov.val_edges
test_observed_edges = mov.test_observed_edges
test_future_edges = mov.test_future_edges
_tfe = test_future_edges.clone()
_tfe[1] -= mov.n_train_users

In [567]:
#!g1.1
model = LGCN_U(n_users=mov.n_items, emb_dim=256, normalize=True).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-4)
metrics = []
acc_list = []

In [494]:
model.m, (mov.n_train_users, mov.n_test_users)

(3681, (4530, 1490))

In [566]:
#!g1.1
wandb.init(project="course_work", entity="ilyaind", reinit=True, name='emb_size_256')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33milyaind[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [516]:
#!g1.1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [568]:
#!g1.1
pos_edges = mov.pos_edges.to(device)
train_edges = mov.train_edges.to(device)
val_edges = mov.val_edges.to(device)
test_observed_edges = mov.test_observed_edges.to(device)
test_future_edges = mov.test_future_edges.to(device)
_tfe = test_future_edges.clone().to(device)
_tfe[1] -= mov.n_train_users

In [569]:
#!g1.1
val_edges.device

device(type='cuda', index=0)

In [582]:
#!g1.1
for i in tqdm(range(20)):
    opt.zero_grad()
    neg_train_edges = negative_sampling(train_edges, num_neg_samples=train_edges.shape[1], num_nodes=(model.m, mov.n_train_users)).to(device)
    neg_val_edges = negative_sampling(pos_edges, num_neg_samples=val_edges.shape[1], num_nodes=(model.m, mov.n_train_users)).to(device)

    train_scores = model.forward(edge_index=train_edges, edge_label_index=train_edges, n_items=mov.n_train_users)
    neg_scores = model.forward(edge_index=train_edges, edge_label_index=neg_train_edges, n_items=mov.n_train_users)

    val_scores = model.forward(edge_index=train_edges, edge_label_index=val_edges, n_items=mov.n_train_users)
    neg_val_scores = model.forward(edge_index=train_edges, edge_label_index=neg_val_edges, n_items=mov.n_train_users)

    #  ==== VALIDATION ====
    # _toe = test_observed_edges.clone()
    # _toe[1] -= mov.train['user_id'].nunique()
    # neg_val_edges = utils.negative_sampling(_toe, num_nodes=(model.m, mov.test_observed['user_id'].nunique()))
    # neg_val_edges[1] += mov.train['user_id'].nunique()
    # val_scores = model.forward(edge_index=torch.cat([pos_edges, test_observed_edges], dim=1),
    #                            edge_label_index=test_observed_edges, n_items=test_observed_edges[1].max().item() + 1)
    # neg_val_scores = model.forward(edge_index=torch.cat([pos_edges, test_observed_edges], dim=1),
    #                                edge_label_index=neg_val_edges, n_items=test_observed_edges[1].max().item() + 1)

    # ==== TEST ====
    neg_test_edges = negative_sampling(_tfe, num_nodes=(model.m, mov._test_observed['user_id'].nunique()))
    neg_test_edges[1] += mov.n_train_users
    
    test_scores = model.forward(edge_index=torch.cat([pos_edges, test_observed_edges], dim=1),
                                edge_label_index=test_future_edges, n_items=test_observed_edges[1].max().item() + 1)
    
    neg_test_scores = model.forward(edge_index=torch.cat([pos_edges, test_observed_edges], dim=1),
                                   edge_label_index=neg_test_edges, n_items=test_observed_edges[1].max().item() + 1)

    train_loss = torch.mean(F.softplus(neg_scores - train_scores))
    val_loss = torch.mean(F.softplus(neg_val_scores - val_scores))
    test_loss = torch.mean(F.softplus(neg_test_scores - test_scores))

    train_acc = (train_scores > neg_scores).float().mean()
    val_acc = (val_scores > neg_val_scores).float().mean()
    test_acc = (test_scores > neg_test_scores).float().mean()

    if i % 1 == 0:
        print(f'Train loss = {train_loss}, val_loss = {val_loss}, test_loss = {test_loss}, test_acc={test_acc}')
    train_loss.backward()
    opt.step()
    metrics.append([train_loss.item(), val_loss.item(), test_loss.item()])
    acc_list.append([train_acc.item(), val_acc.item(), test_acc.item()])
#     writer.add_scalars("Run", {'train_acc': train_acc.item(), 'val_acc': val_acc.item(), 'test_acc': test_acc.item()})
    wandb.log({'loss': {'train': train_loss, 'val': val_loss, 'test': test_loss}, 
               'acc': {'train': train_acc, 'val': val_acc, 'test': test_acc}})

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))

Error: You must call wandb.init() before wandb.log()

Background cell scheduled. Waiting for foreground cells to finish commits
Preparing g1.1 instance...
g1.1 instance is ready, running task...
Train loss = 2797.67529296875, val_loss = 2874.735595703125, test_loss = 1251.80810546875, test_acc=0.8893024325370789

Task is done, waiting for foreground cells to finish...
Merging task result to the state


In [744]:
#!g1.1
torch.cuda.get_device_name(0)

'Tesla V100-SXM2-32GB'

In [589]:
#!g1.1
with torch.no_grad():
    _U, _E = model.get_embeddings(torch.cat([pos_edges, test_observed_edges], dim=1))
    # _U, _E = model.get_embeddings(pos_edges)
print(_U.shape, _E.shape)

new_scores = _E @ _U.T
new_scores[test_observed_edges[1], test_observed_edges[0]] = -1e5  # exclude seen items
new_scores = new_scores[mov.n_train_users:]
print(new_scores.shape)

ranks, recs = new_scores.topk(20, dim=1)
recs.shape

torch.Size([3681, 256]) torch.Size([6020, 256])
torch.Size([1490, 3681])


torch.Size([1490, 20])

In [590]:
#!g1.1
y_true = mov._test_future.groupby('user_id')['item_id'].unique().values
# y_true = mov.train.groupby('user_id')['item_id'].unique().values
print('===== LGCN-E =====')
print(f'Precsion = {precision_at_k(recs.cpu(), y_true).round(5)}')
print(f'Recall = {recall_at_k(recs.cpu(), y_true).round(5)}')
print(f'NDCG = {ndcg_at_k(ranks.cpu().numpy(), recs.cpu(), mov.future_interactions).round(5)}')

===== LGCN-E =====
Precsion = 0.53097
Recall = 0.11643
NDCG = 0.80196


```
====== ALS ======
Precision = 0.5245
Recall = 0.11568
NDCG = 0.80066

====== GF-CF ======
Precision = 0.5741
Recall = 0.13183
NDCG = 0.76224
```

In [586]:
#pragma async
#!g1.1

model = LGCN_U(n_users=mov.n_items, emb_dim=256, normalize=True).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
metrics = []
acc_list = []

wandb.init(project="course_work", entity="ilyaind", reinit=True, name='emb_256_less_lr')

pos_edges = mov.pos_edges.to(device)
train_edges = mov.train_edges.to(device)
val_edges = mov.val_edges.to(device)
test_observed_edges = mov.test_observed_edges.to(device)
test_future_edges = mov.test_future_edges.to(device)
_tfe = test_future_edges.clone().to(device)
_tfe[1] -= mov.n_train_users

for i in tqdm(range(150)):
    opt.zero_grad()
    neg_train_edges = negative_sampling(train_edges, num_neg_samples=train_edges.shape[1], num_nodes=(model.m, mov.n_train_users)).to(device)
    neg_val_edges = negative_sampling(pos_edges, num_neg_samples=val_edges.shape[1], num_nodes=(model.m, mov.n_train_users)).to(device)

    train_scores = model.forward(edge_index=train_edges, edge_label_index=train_edges, n_items=mov.n_train_users)
    neg_scores = model.forward(edge_index=train_edges, edge_label_index=neg_train_edges, n_items=mov.n_train_users)

    val_scores = model.forward(edge_index=train_edges, edge_label_index=val_edges, n_items=mov.n_train_users)
    neg_val_scores = model.forward(edge_index=train_edges, edge_label_index=neg_val_edges, n_items=mov.n_train_users)

    # ===== TEST =====
    neg_test_edges = negative_sampling(_tfe, num_nodes=(model.m, mov._test_observed['user_id'].nunique()))
    neg_test_edges[1] += mov.n_train_users
    
    test_scores = model.forward(edge_index=torch.cat([pos_edges, test_observed_edges], dim=1),
                                edge_label_index=test_future_edges, n_items=test_observed_edges[1].max().item() + 1)
    
    neg_test_scores = model.forward(edge_index=torch.cat([pos_edges, test_observed_edges], dim=1),
                                   edge_label_index=neg_test_edges, n_items=test_observed_edges[1].max().item() + 1)

    train_loss = torch.mean(F.softplus(neg_scores - train_scores))
    val_loss = torch.mean(F.softplus(neg_val_scores - val_scores))
    test_loss = torch.mean(F.softplus(neg_test_scores - test_scores))

    train_acc = (train_scores > neg_scores).float().mean()
    val_acc = (val_scores > neg_val_scores).float().mean()
    test_acc = (test_scores > neg_test_scores).float().mean()

    train_loss.backward()
    opt.step()
    metrics.append([train_loss.item(), val_loss.item(), test_loss.item()])
    acc_list.append([train_acc.item(), val_acc.item(), test_acc.item()])
    wandb.log({'loss': {'train': train_loss, 'val': val_loss, 'test': test_loss}, 
               'acc': {'train': train_acc, 'val': val_acc, 'test': test_acc}})

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33milyaind[0m. Use [1m`wandb login --relogin`[0m to force relogin


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=150.0), HTML(value='')))

Background cell scheduled. Waiting for foreground cells to finish commits
Preparing g1.1 instance...
g1.1 instance is ready, running task...
Train loss = 2937.339599609375, val_loss = 3057.262939453125, test_loss = 1322.332275390625, test_acc=0.8892236948013306
Train loss = 2764.3154296875, val_loss = 2771.38623046875, test_loss = 1255.738525390625, test_acc=0.8895561099052429
Train loss = 2639.123046875, val_loss = 2666.694091796875, test_loss = 1166.1314697265625, test_acc=0.8902121186256409
Train loss = 2478.52392578125, val_loss = 2514.81884765625, test_loss = 1107.62939453125, test_acc=0.8897135257720947
Train loss = 2328.64404296875, val_loss = 2356.132080078125, test_loss = 1050.2127685546875, test_acc=0.8902384042739868
Train loss = 2190.9736328125, val_loss = 2220.945556640625, test_loss = 986.4285278320312, test_acc=0.8906363844871521
Train loss = 2067.589599609375, val_loss = 2094.33984375, test_loss = 937.4827270507812, test_acc=0.8909600377082825
Train loss = 1935.16662597

In [588]:
#!g1.1
%apply_state 287ee3b8-7b49-46de-aaf3-f5547ec88ade/487ebe10-581d-4000-af18-ac589822c7f4

Error: invalid apply state command: %apply_state 287ee3b8-7b49-46de-aaf3-f5547ec88ade/487ebe10-581d-4000-af18-ac589822c7f4

In [None]:
#!g1.1
