# First approach on GNNs

## Initialize the DB connection

In [48]:
from py2neo import Graph

graph = Graph(
    # "bolt://0.0.0.0:7687",
    "bolt://localhost:7687",
    # "http://0.0.0.0:7687",
    # "bolt+s://localhost:7687",
    auth=("neo4j", "admin"),
    # name="neo4j"
)

try:
    graph.run("Match () Return 1 Limit 1")
    print('ok')
except Exception as e:
    print(e)
    print(e.with_traceback)
    print('not ok')

ok


## Build PyG graph instance

### Imports

In [49]:
import torch
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from torch_geometric.data import HeteroData
from torch_geometric.transforms import RandomLinkSplit, ToUndirected
from torch.nn import Linear
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F

In [50]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Helper methods

In [51]:
def fetch_data(query):
    result = graph.run(query)
    return pd.DataFrame([r.values() for r in result], columns=result.keys())

In [52]:
# https://github.com/pyg-team/pytorch_geometric/blob/master/examples/hetero/load_csv.py
# https://towardsdatascience.com/integrate-neo4j-with-pytorch-geometric-to-create-recommendations-21b0b7bc9aa

def load_node(query, index_col, encoders=None):
    df = fetch_data(query)
    df.set_index(index_col, inplace=True)
    
    # define node mappings node: index
    mapping = { index: i for i, index in enumerate(df.index.unique()) }
    
    # define node features
    x = None
    if encoders is not None:
        try:
            print("items:", encoders.items())
            xs = [encoder(df[col]) for col, encoder in encoders.items()]
            x = torch.cat(xs, dim=-1)
        except Exception as err:
            print(err)
            print("------------------------------")
            print(df)
            print("------------------------------")
            raise Exception
    return x, mapping

In [53]:
# https://github.com/pyg-team/pytorch_geometric/blob/master/examples/hetero/load_csv.py
# https://towardsdatascience.com/integrate-neo4j-with-pytorch-geometric-to-create-recommendations-21b0b7bc9aa

def load_edge(query, src_index_col, src_mapping, dst_index_col, dst_mapping, encoders=None):
    df = fetch_data(query)
    
    # define edge index
    src = [src_mapping[index] for index in df[src_index_col]]
    dst = [dst_mapping[index] for index in df[dst_index_col]]
    edge_index = torch.tensor([src, dst])
       
    # define edge features
    edge_attr = None
    if encoders is not None:
        edge_attrs = [encoder(df[col]) for col, encoder in encoders.items()]
        edge_attr = torch.cat(edge_attrs, dim=-1)
    
    return edge_index, edge_attr

### Encoders

In [54]:
# https://github.com/pyg-team/pytorch_geometric/blob/master/examples/hetero/load_csv.py

class SequenceEncoder(object):
    # The 'SequenceEncoder' encodes raw column strings into embeddings.
    def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)

    @torch.no_grad()
    def __call__(self, df):
        x = self.model.encode(df.values, show_progress_bar=True,
                              convert_to_tensor=True, device=self.device)
        return x.cpu()

In [55]:
# https://github.com/pyg-team/pytorch_geometric/blob/master/examples/hetero/load_csv.py
# https://colab.research.google.com/github/tomasonjo/blogs/blob/master/pyg2neo/Movie_recommendations.ipynb#scrollTo=n1QiEzKZ348J

class IdentityEncoder(object):
    # The 'IdentityEncoder' takes the raw column values and converts them to
    # PyTorch tensors.
    def __init__(self, dtype=None, is_list=False):
        self.dtype = dtype
        self.is_list = is_list

    def __call__(self, df):
        if self.is_list:
            return torch.stack([torch.tensor(el) for el in df.values])
        return torch.from_numpy(df.values).to(self.dtype)

### Users nodes

In [56]:
users_query = """
    MATCH (u:User) RETURN u.id as userId, u.name as name
"""

user_x, user_mapping = load_node(
    users_query,
    index_col='userId',
    encoders={
        'name': SequenceEncoder(),
    }
)

items: dict_items([('name', <__main__.SequenceEncoder object at 0x0000027320A638E0>)])


Batches:   0%|          | 0/2167 [00:00<?, ?it/s]

In [57]:
user_mapping

{'A5TA1NJOC0PJ5': 0,
 'A1TJICB7VLGQKL': 1,
 'A80M2286B7STE': 2,
 'A2TM2MAA8IT34U': 3,
 'AQGWT465GTEJG': 4,
 'AA0ACUPEPT1RN': 5,
 'A1JM3NL9GSQ17N': 6,
 'A3C5DYTUXUCL0S': 7,
 'A2H3NOCLLEPOKQ': 8,
 'A3A2HRW0GR4EWU': 9,
 'A16UTJN8DODY6N': 10,
 'A2P0254ZZ42ZOJ': 11,
 'ACB9GSMY2GY0O': 12,
 'A3DNSZT2HIB30J': 13,
 'A2Q8QWVIZM213M': 14,
 'ABVND860D55AJ': 15,
 'A267CAHUU3HOPQ': 16,
 'ANNS6DSY390Z2': 17,
 'A7DS7JE3D53XW': 18,
 'ASZO5WJA6FVVH': 19,
 'A1KU7EEUGYOJYD': 20,
 'A1907XNEJPEZEI': 21,
 'A3U7B0EKSK16MQ': 22,
 'A3OCOP75PMY097': 23,
 'AS02PGU8BFX2': 24,
 'A1TYR5GO9YE60D': 25,
 'AFO1CM5HMAEA6': 26,
 'A2J6AVSVGGFEJP': 27,
 'A18WA88B995ZT7': 28,
 'A36WKQ76TJCCYW': 29,
 'A1S0WV2WXFZRKI': 30,
 'ASCTK6A1JGPM8': 31,
 'AFJE85Y0YPOVY': 32,
 'A6D30HQK1YJE8': 33,
 'AE1LNE5A2BO74': 34,
 'AOOZO1BRJRCVY': 35,
 'A1I6F8COVMNYAH': 36,
 'A1V0MU46D12JST': 37,
 'AMDPRJ5R8DSUO': 38,
 'A3UGHH4IFF5CV3': 39,
 'A3G5ST2S43IB65': 40,
 'ARNM7L2CEQZHU': 41,
 'A2T0L8BI7IEH5Q': 42,
 'A3AX4BN9YVIU5J': 43,
 'A1PHTBLZAQWWG':

### Products nodes

In [58]:
products_query = """
    MATCH (p: Product)
    return  p.asin as asin, p.title as title, p.category as category, p.brand as brand,
            p["fastrp-embedding"] as `fastrp-embedding`,
            p["node2vec-embedding"] as `node2vec-embedding`
"""

products_x, products_mapping = load_node(
    products_query,
    index_col='asin',
    encoders= {
        'category': SequenceEncoder(),
        'fastrp-embedding': IdentityEncoder(is_list=True),
        'node2vec-embedding': IdentityEncoder(is_list=True),
    }
)

items: dict_items([('category', <__main__.SequenceEncoder object at 0x0000027320AE3700>), ('fastrp-embedding', <__main__.IdentityEncoder object at 0x00000273192878B0>), ('node2vec-embedding', <__main__.IdentityEncoder object at 0x0000027320A81900>)])


Batches:   0%|          | 0/61 [00:00<?, ?it/s]

### Ratings edges

In [59]:
reviews_query = """
    MATCH (u:User)-[r:REVIEWS]-(p:Product)
    return u.id as userId, r.overall as rating, p.asin as asin
"""

edge_index, edge_label = load_edge(
    reviews_query,
    src_index_col='userId',
    src_mapping=user_mapping,
    dst_index_col='asin',
    dst_mapping=products_mapping,
    encoders={
        'rating': IdentityEncoder(dtype=torch.long),
    }
)

In [60]:
edge_index.shape

torch.Size([2, 71672])

### PyG Graph building

#### Initialize Data

In [61]:
data = HeteroData()

# users nodes
data['user'].x = user_x

# products nodes
data['product'].x = products_x
data['user', 'reviews', 'product'].edge_index = edge_index
data['user', 'reviews', 'product'].edge_label = edge_label
data.to(device, non_blocking=True)
print(data)

HeteroData(
  [1muser[0m={ x=[69328, 384] },
  [1mproduct[0m={ x=[1941, 640] },
  [1m(user, reviews, product)[0m={
    edge_index=[2, 71672],
    edge_label=[71672]
  }
)


#### Convert them to appropriate format for training a graph-based ML model

In [62]:
# 1. Add a reverse ('product', 'rev_reviews', 'user') relation for message passing.

# data = T.ToUndirected()(data)
data = ToUndirected()(data)
del data['product', 'rev_reviews', 'user'].edge_label

# 2. Perform a link-level split into training, validation, and test edges.

transform = RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'reviews', 'product')],
    rev_edge_types=[('product', 'rev_reviews', 'user')],
)
train_data, val_data, test_data = transform(data)

In [63]:
print(train_data)

HeteroData(
  [1muser[0m={ x=[69328, 384] },
  [1mproduct[0m={ x=[1941, 640] },
  [1m(user, reviews, product)[0m={
    edge_index=[2, 57338],
    edge_label=[57338],
    edge_label_index=[2, 57338]
  },
  [1m(product, rev_reviews, user)[0m={ edge_index=[2, 57338] }
)


In [64]:
print(test_data)

HeteroData(
  [1muser[0m={ x=[69328, 384] },
  [1mproduct[0m={ x=[1941, 640] },
  [1m(user, reviews, product)[0m={
    edge_index=[2, 64505],
    edge_label=[7167],
    edge_label_index=[2, 7167]
  },
  [1m(product, rev_reviews, user)[0m={ edge_index=[2, 64505] }
)


In [65]:
print(val_data)

HeteroData(
  [1muser[0m={ x=[69328, 384] },
  [1mproduct[0m={ x=[1941, 640] },
  [1m(user, reviews, product)[0m={
    edge_index=[2, 57338],
    edge_label=[7167],
    edge_label_index=[2, 7167]
  },
  [1m(product, rev_reviews, user)[0m={ edge_index=[2, 57338] }
)


#### Balancing

In [66]:
torch.bincount(edge_label)

tensor([    0,  5920,  3074,  5112, 11032, 46534])

In [67]:
# We have an unbalanced dataset with many labels for rating 3 and 4, and very
# few for 0 and 1. Therefore we use a weighted MSE loss.

weight = torch.bincount(train_data['user', 'product'].edge_label)
weight = weight.max() / weight

def weighted_mse_loss(pred, target, weight=None):
    weight = 1. if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()

## Model building

In [73]:
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x
    
class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['user'][row], z_dict['product'][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

In [69]:
model = Model(hidden_channels=32).to(device)

# Due to lazy initialization, we need to run one model step so the number
# of parameters can be inferred:
with torch.no_grad():
    model.encoder(train_data.collect('x'), train_data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [70]:
def train():
    model.train()
    optimizer.zero_grad()
    pred = model(train_data.collect('x'), train_data.edge_index_dict,
                 train_data['user', 'product'].edge_label_index)
    target = train_data['user', 'product'].edge_label
    loss = weighted_mse_loss(pred, target, weight)
    loss.backward()
    optimizer.step()
    return float(loss)

In [71]:
@torch.no_grad()
def test(data):
    model.eval()
    pred = model(data.collect('x'), data.edge_index_dict,
                 data['user', 'product'].edge_label_index)
    pred = pred.clamp(min=0, max=5)
    target = data['user', 'product'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse)

In [74]:
for epoch in range(1, 301):
    loss = train()
    train_rmse = test(train_data)
    val_rmse = test(val_data)
    test_rmse = test(test_data)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}, Test: {test_rmse:.4f}')

x_dict: {'user': tensor([[-0.0863,  0.0119, -0.0082,  ..., -0.0091, -0.0447,  0.0359],
        [-0.0401, -0.0066, -0.0254,  ...,  0.0582,  0.0009, -0.0501],
        [-0.0694,  0.0126,  0.0188,  ..., -0.0365,  0.0344, -0.0129],
        ...,
        [-0.0399,  0.0636, -0.0150,  ..., -0.0148,  0.0746,  0.0414],
        [-0.0711,  0.0623, -0.0726,  ..., -0.0267,  0.0121,  0.0176],
        [-0.0866,  0.0664,  0.0134,  ...,  0.0572,  0.0412, -0.0983]]), 'product': tensor([[-0.0697, -0.0069, -0.0289,  ..., -0.0987,  0.4094, -0.9409],
        [-0.0697, -0.0069, -0.0289,  ...,  0.0323,  0.7607, -0.3240],
        [-0.0697, -0.0069, -0.0289,  ..., -0.5236,  0.4073, -0.9528],
        ...,
        [-0.0230,  0.0385,  0.0627,  ...,  0.1958, -0.0544, -0.2548],
        [-0.0230,  0.0385,  0.0627,  ...,  0.3844, -0.0763, -0.1142],
        [-0.0230,  0.0385,  0.0627,  ..., -0.0090, -0.0380, -0.0968]])}
edge_index_dict: {('user', 'reviews', 'product'): tensor([[15328, 30369, 12649,  ..., 18690, 21984, 22