## Imports

In [1]:
import pathlib
import os
import os.path as osp
import sys
import argparse
parent_path = pathlib.Path(os.getcwd()).parent.absolute()
sys.path.append(str(parent_path))

import torch
import torch.nn.functional as F
from torch.nn import Linear
import torch_geometric.transforms as T
from torch_geometric.datasets import MovieLens
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.loader import NeighborLoader, LinkNeighborLoader

from utils.Neo4jMovieLensMetaData import Neo4jMovieLensMetaData
# from utils.gnn_simple import Model
from utils.train_test import train_test
from utils.visualize import plot_loss, plot_train, plot_val, plot_test, plot_results

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#### Read the corresponsing csv, store the dataset to the DB, preprocess it, and get it as a pytorch graph object

In [3]:
path = osp.join(osp.dirname(osp.abspath('')), '../../data/MovieLensNeo4j')
dataset = Neo4jMovieLensMetaData(
    path,
    model_name='all-MiniLM-L6-v2',
    database_url="bolt://localhost:7687",
    database_username="neo4j",
    database_password="admin",
    force_pre_process=True,
    force_db_restore=False,
    text_features=["title"],
    list_features=[],
    fastRP_features=[],
    numeric_features=[],
)
data = dataset[0].to(device)

Processing...


Movies have features...
Encoding title...


Batches: 100%|██████████| 565/565 [00:15<00:00, 35.56it/s]


[torch.Size([18062, 64])]


Done!


#### Preprocess the dataset

In [4]:
# Add user node features for message passing:
data['user'].x = torch.eye(data['user'].num_nodes, device=device)
del data['user'].num_nodes

# Add a reverse ('movie', 'rev_rates', 'user') relation for message passing:
data = T.ToUndirected()(data)
del data['movie', 'rev_rates', 'user'].edge_label  # Remove "reverse" label.

# Perform a link-level split into training, validation, and test edges:
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'rates', 'movie')],
    rev_edge_types=[('movie', 'rev_rates', 'user')],
)(data)

In [5]:
data["user", "rates", "movie"].edge_index

tensor([[    0,     0,     0,  ..., 16235, 16235, 16235],
        [ 1045,  5556,   813,  ...,  3238,  1187,  1056]])

In [6]:
train_loader = LinkNeighborLoader(
    data=train_data,
    # Sample ALL neighbors for each node and each edge type for 2 iterations:
    num_neighbors=[-1],
    # Use a batch size of 128 for sampling training nodes of type "paper":
    batch_size=256,
    edge_label_index = (("user", "movie"), None),
    edge_label = data['user', 'movie'].edge_label,
)
train_batch = next(iter(train_loader))

In [7]:
train_batch

HeteroData(
  [1mmovie[0m={ x=[5609, 64] },
  [1muser[0m={ x=[16141, 16236] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 384729],
    edge_label=[256],
    edge_label_index=[2, 256]
  },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 40685] }
)

In [8]:
data['user', 'rates', 'movie'].edge_label.shape

torch.Size([2660790])

In [9]:
torch.arange(0,data["user", "rates", "movie"].edge_index.shape[1]).shape

torch.Size([2660790])

#### Define and train-test the model

In [10]:
z_dict_test_lala = 0
edge_label_index_test_lala = 0

In [11]:
import torch
from torch import Tensor
from torch.nn import Linear, LazyLinear, Sequential, BatchNorm1d, ReLU, Dropout
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv, GATv2Conv, GCNConv, TransformerConv, GraphConv, GINConv, GINEConv, to_hetero, HeteroLinear, HeteroConv
from torch_geometric.nn.models import GIN, GraphSAGE
from torch_geometric.nn.aggr import MultiAggregation
from typing import Union
from torch_geometric.typing import Adj, OptPairTensor, Size


class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        global z_dict_test_lala, edge_label_index_test_lala
        row, col = edge_label_index
        z_dict_test_lala = z_dict
        edge_label_index_test_lala = edge_label_index
        movie = z_dict['movie'][col]
        user = z_dict['user'][row]
        z = torch.cat([user, movie], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)


class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

In [12]:
model = Model(hidden_channels=16).to(device)

In [13]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.012)

weight = torch.bincount(train_data['user', 'movie'].edge_label)
# weight = torch.bincount(train_data['user', 'rates', 'movie'].edge_label)
weight = weight.max() / weight

def weighted_mse_loss(pred, target, weight=None):
    # weight = 1. if weight is None else weight[target].to(pred.dtype)
    weight = 1. # if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()

In [14]:
def train(batch, log=False):
    model.train()
    optimizer.zero_grad()
    # pred = model(batch.x_dict, batch.edge_index_dict, batch['user', 'rates', 'movie'].edge_label_index)
    pred = model(batch.x_dict, batch.edge_index_dict, batch['user', 'movie'].edge_label_index)
    pred = pred.clamp(min=0, max=5)
    target = batch['user', 'movie'].edge_label.float()
    # target = batch['user', 'rates', 'movie'].edge_label
    # print("pred:", pred)
    # print("target:", target)
    loss = weighted_mse_loss(pred, target, weight)
    loss.backward()
    optimizer.step()
    return float(loss)

@torch.no_grad()
def test(data, log=False):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict, data['user', 'movie'].edge_label_index)
    # pred = model(data.x_dict, data.edge_index_dict, data['user', 'rates', 'movie'].edge_label_index)
    pred = pred.clamp(min=0, max=5)
    target = data['user', 'movie'].edge_label.float()
    # target = data['user', 'rates', 'movie'].edge_label.float()

    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse)

In [15]:
epochs = 2
losses = []
for epoch in range(1, epochs+1):
    batch_index = 0
    for batch in train_loader:
        print("Batch index:", batch_index)
        # loss = 0.0
        loss = train(batch, log=not(epoch%20))
        # train_rmse = test(batch)
        # val_rmse = test(val_data)
        # test_rmse = test(test_data, log=not(epoch%20))
        # losses.append((loss, train_rmse, val_rmse, test_rmse))
        losses.append((loss, 0, 0, 0))
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {0.0:.4f}, '
                f'Val: {0.0:.4f}, Test: {0.0:.4f}')
        batch_index += 1
last_losses = losses[-1]
losses = losses + [last_losses] * (epochs - len(losses))

Batch index: 0
Epoch: 001, Loss: 13.7034, Train: 0.0000, Val: 0.0000, Test: 0.0000
Batch index: 1
Epoch: 001, Loss: 14.5806, Train: 0.0000, Val: 0.0000, Test: 0.0000
Batch index: 2
Epoch: 001, Loss: 12.2936, Train: 0.0000, Val: 0.0000, Test: 0.0000
Batch index: 3
Epoch: 001, Loss: 12.1377, Train: 0.0000, Val: 0.0000, Test: 0.0000
Batch index: 4
Epoch: 001, Loss: 13.5119, Train: 0.0000, Val: 0.0000, Test: 0.0000
Batch index: 5
Epoch: 001, Loss: 13.0411, Train: 0.0000, Val: 0.0000, Test: 0.0000
Batch index: 6
Epoch: 001, Loss: 12.1706, Train: 0.0000, Val: 0.0000, Test: 0.0000
Batch index: 7
Epoch: 001, Loss: 11.8627, Train: 0.0000, Val: 0.0000, Test: 0.0000
Batch index: 8
Epoch: 001, Loss: 10.2295, Train: 0.0000, Val: 0.0000, Test: 0.0000
Batch index: 9
Epoch: 001, Loss: 8.8391, Train: 0.0000, Val: 0.0000, Test: 0.0000
Batch index: 10
Epoch: 001, Loss: 8.9862, Train: 0.0000, Val: 0.0000, Test: 0.0000
Batch index: 11
Epoch: 001, Loss: 6.8138, Train: 0.0000, Val: 0.0000, Test: 0.0000
Batch

KeyboardInterrupt: 

In [85]:
z_dict_test_lala

{'movie': tensor([[ 0.0761,  0.1135,  0.0690,  ..., -0.2256,  0.2134, -0.1368],
         [ 0.0435,  0.0875,  0.0618,  ..., -0.2305,  0.2250, -0.1418],
         [ 0.0662,  0.0826,  0.0691,  ..., -0.2216,  0.2238, -0.1587],
         ...,
         [ 0.1111,  0.1490,  0.0663,  ..., -0.2371,  0.1986, -0.0976],
         [ 0.0922,  0.1169,  0.0603,  ..., -0.2199,  0.1890, -0.1057],
         [ 0.0715,  0.1584,  0.0413,  ..., -0.2457,  0.2041, -0.1142]],
        grad_fn=<AddBackward0>),
 'user': tensor([[ 0.0488, -0.0851,  0.0284,  ...,  0.1463, -0.0836,  0.0671],
         [ 0.0509, -0.0810,  0.0300,  ...,  0.1481, -0.0808,  0.0725],
         [ 0.0518, -0.0774,  0.0278,  ...,  0.1422, -0.0780,  0.0548],
         ...,
         [ 0.0553, -0.0726,  0.0294,  ...,  0.1361, -0.0786,  0.0700],
         [ 0.0517, -0.0849,  0.0192,  ...,  0.1429, -0.0850,  0.0679],
         [ 0.0475, -0.0816,  0.0279,  ...,  0.1458, -0.0839,  0.0664]],
        grad_fn=<AddBackward0>)}

In [86]:
z_dict_test_lala["movie"]

tensor([[ 0.0761,  0.1135,  0.0690,  ..., -0.2256,  0.2134, -0.1368],
        [ 0.0435,  0.0875,  0.0618,  ..., -0.2305,  0.2250, -0.1418],
        [ 0.0662,  0.0826,  0.0691,  ..., -0.2216,  0.2238, -0.1587],
        ...,
        [ 0.1111,  0.1490,  0.0663,  ..., -0.2371,  0.1986, -0.0976],
        [ 0.0922,  0.1169,  0.0603,  ..., -0.2199,  0.1890, -0.1057],
        [ 0.0715,  0.1584,  0.0413,  ..., -0.2457,  0.2041, -0.1142]],
       grad_fn=<AddBackward0>)

In [87]:
z_dict_test_lala["user"]

tensor([[ 0.0488, -0.0851,  0.0284,  ...,  0.1463, -0.0836,  0.0671],
        [ 0.0509, -0.0810,  0.0300,  ...,  0.1481, -0.0808,  0.0725],
        [ 0.0518, -0.0774,  0.0278,  ...,  0.1422, -0.0780,  0.0548],
        ...,
        [ 0.0553, -0.0726,  0.0294,  ...,  0.1361, -0.0786,  0.0700],
        [ 0.0517, -0.0849,  0.0192,  ...,  0.1429, -0.0850,  0.0679],
        [ 0.0475, -0.0816,  0.0279,  ...,  0.1458, -0.0839,  0.0664]],
       grad_fn=<AddBackward0>)

In [88]:
row, col = edge_label_index_test_lala

In [89]:
movie = z_dict_test_lala["movie"]
user = z_dict_test_lala["user"]

In [90]:
col.shape

torch.Size([97347])

In [92]:
movie[col]

tensor([[ 0.0761,  0.1135,  0.0690,  ..., -0.2256,  0.2134, -0.1368],
        [ 0.0761,  0.1135,  0.0690,  ..., -0.2256,  0.2134, -0.1368],
        [ 0.0761,  0.1135,  0.0690,  ..., -0.2256,  0.2134, -0.1368],
        ...,
        [ 0.0567,  0.0945,  0.0843,  ..., -0.2181,  0.2255, -0.1466],
        [ 0.0567,  0.0945,  0.0843,  ..., -0.2181,  0.2255, -0.1466],
        [ 0.0567,  0.0945,  0.0843,  ..., -0.2181,  0.2255, -0.1466]],
       grad_fn=<IndexBackward0>)

In [91]:
row.shape

torch.Size([97347])

In [93]:
user[row]

IndexError: index 14980 is out of bounds for dimension 0 with size 14979