## Imports

In [1]:
import pathlib
import os
import os.path as osp
import sys
import argparse
parent_path = pathlib.Path(os.getcwd()).parent.absolute()
sys.path.append(str(parent_path))

import torch
import torch.nn.functional as F
from torch.nn import Linear
import torch_geometric.transforms as T
from torch_geometric.datasets import MovieLens
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.loader import NeighborLoader, LinkNeighborLoader

from utils.Neo4jMovieLensMetaData import Neo4jMovieLensMetaData
# from utils.gnn_simple import Model
from utils.train_test import train_test
from utils.visualize import plot_loss, plot_train, plot_val, plot_test, plot_results

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#### Read the corresponsing csv, store the dataset to the DB, preprocess it, and get it as a pytorch graph object

In [3]:
path = osp.join(osp.dirname(osp.abspath('')), '../../data/MovieLensNeo4j')
dataset = Neo4jMovieLensMetaData(
    path,
    model_name='all-MiniLM-L6-v2',
    database_url="bolt://localhost:7687",
    database_username="neo4j",
    database_password="admin",
    force_pre_process=True,
    force_db_restore=False,
    text_features=["title"],
    list_features=[],
    fastRP_features=[],
    numeric_features=[],
)
data = dataset[0].to(device)

Processing...


Movies have features...
Encoding title...


Batches: 100%|██████████| 565/565 [00:16<00:00, 34.44it/s]


[torch.Size([18062, 64])]


Done!


#### Preprocess the dataset

In [4]:
# Add user node features for message passing:
data['user'].x = torch.eye(data['user'].num_nodes, device=device)
del data['user'].num_nodes

# Add a reverse ('movie', 'rev_rates', 'user') relation for message passing:
data = T.ToUndirected()(data)
del data['movie', 'rev_rates', 'user'].edge_label  # Remove "reverse" label.

# Perform a link-level split into training, validation, and test edges:
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'rates', 'movie')],
    rev_edge_types=[('movie', 'rev_rates', 'user')],
)(data)

In [13]:
train_loader = LinkNeighborLoader(
    data=train_data,
    # Sample ALL neighbors for each node and each edge type for 2 iterations:
    num_neighbors=[-1],
    # Use a batch size of 128 for sampling training nodes of type "paper":
    batch_size=64,
    edge_label_index = (("user", "movie"), None),
    edge_label = data['user', 'movie'].edge_label,
)
train_batch = next(iter(train_loader))

In [6]:
train_batch

HeteroData(
  [1mmovie[0m={ x=[3023, 64] },
  [1muser[0m={ x=[15769, 16236] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 109532],
    edge_label=[64],
    edge_label_index=[2, 64]
  },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 10395] }
)

#### Define and train-test the model

In [7]:
z_dict_test_lala = 0
edge_label_index_test_lala = 0

In [8]:
import torch
from torch import Tensor
from torch.nn import Linear, LazyLinear, Sequential, BatchNorm1d, ReLU, Dropout
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv, GATv2Conv, GCNConv, TransformerConv, GraphConv, GINConv, GINEConv, to_hetero, HeteroLinear, HeteroConv
from torch_geometric.nn.models import GIN, GraphSAGE
from torch_geometric.nn.aggr import MultiAggregation
from typing import Union
from torch_geometric.typing import Adj, OptPairTensor, Size


class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        global z_dict_test_lala, edge_label_index_test_lala
        row, col = edge_label_index
        z_dict_test_lala = z_dict
        edge_label_index_test_lala = edge_label_index
        movie = z_dict['movie'][col]
        user = z_dict['user'][row]
        z = torch.cat([user, movie], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)


class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

In [14]:
model = Model(hidden_channels=16).to(device)

In [15]:
with torch.no_grad():
    model.encoder(train_batch.x_dict, train_batch.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.012)

weight = torch.bincount(train_data['user', 'movie'].edge_label)
# weight = torch.bincount(train_data['user', 'rates', 'movie'].edge_label)
weight = weight.max() / weight

def weighted_mse_loss(pred, target, weight=None):
    # weight = 1. if weight is None else weight[target].to(pred.dtype)
    weight = 1. # if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()

In [11]:
def train(batch, log=False):
    model.train()
    optimizer.zero_grad()

    pred = model(batch.x_dict, batch.edge_index_dict, batch['user', 'movie'].edge_label_index)
    pred = pred.clamp(min=0, max=5)
    target = batch['user', 'movie'].edge_label.float()

    loss = weighted_mse_loss(pred, target, weight)
    loss.backward()
    optimizer.step()
    return float(loss)

@torch.no_grad()
def test(batch, log=False):
    model.eval()
    pred = model(batch.x_dict, batch.edge_index_dict, batch['user', 'movie'].edge_label_index)
    pred = pred.clamp(min=0, max=5)
    target = batch['user', 'movie'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse)

In [16]:
epochs = 2
losses = []
for epoch in range(1, epochs+1):
    batch_index = 0
    for batch in train_loader:
        loss = train(batch, log=not(epoch%20))
        train_rmse = test(batch)
        # val_rmse = test(val_data)
        # test_rmse = test(test_data, log=not(epoch%20))
        # losses.append((loss, train_rmse, val_rmse, test_rmse))
        losses.append((loss, train_rmse, 0, 0))
        if not batch_index % 10:
                print(f'Epoch: {epoch:03d}, Batch {batch_index:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
                        f'Val: {0.0:.4f}, Test: {0.0:.4f}')
        batch_index += 1
last_losses = losses[-1]
losses = losses + [last_losses] * (epochs - len(losses))

Epoch: 001, Batch 000, Loss: 10.9410, Train: 3.1711, Val: 0.0000, Test: 0.0000
Epoch: 001, Batch 010, Loss: 4.4850, Train: 2.1251, Val: 0.0000, Test: 0.0000
Epoch: 001, Batch 020, Loss: 2.1264, Train: 1.4500, Val: 0.0000, Test: 0.0000
Epoch: 001, Batch 030, Loss: 1.3298, Train: 1.1170, Val: 0.0000, Test: 0.0000
Epoch: 001, Batch 040, Loss: 1.7115, Train: 1.2723, Val: 0.0000, Test: 0.0000
Epoch: 001, Batch 050, Loss: 1.1343, Train: 1.0498, Val: 0.0000, Test: 0.0000
Epoch: 001, Batch 060, Loss: 1.6298, Train: 1.2272, Val: 0.0000, Test: 0.0000
Epoch: 001, Batch 070, Loss: 1.9519, Train: 1.3550, Val: 0.0000, Test: 0.0000
Epoch: 001, Batch 080, Loss: 0.8947, Train: 0.8901, Val: 0.0000, Test: 0.0000
Epoch: 001, Batch 090, Loss: 1.3780, Train: 1.0288, Val: 0.0000, Test: 0.0000
Epoch: 001, Batch 100, Loss: 1.2808, Train: 1.2014, Val: 0.0000, Test: 0.0000
Epoch: 001, Batch 110, Loss: 0.5822, Train: 0.7226, Val: 0.0000, Test: 0.0000
Epoch: 001, Batch 120, Loss: 1.3662, Train: 1.1831, Val: 0.0000

KeyboardInterrupt: 