# GNN Training and Encoding

* Train a GNN based on enriched features in an unsupervised fashion, and use the resulting model to encode the input features.

## Load Data

In [None]:
# Root directory of the dataset; each site's files live in a sub-directory named after the site.
site_input_dir = "/tmp/dataset/horizontal_credit_fraud_data/"
# Site processed by this notebook; joined with site_input_dir to locate its input and output files.
site_name = "ZHSZUS33_Bank_1"

In [None]:
import os
import pandas as pd

# Splits to load; each has a normalized feature table and an edge map on disk.
dataset_names = ["train", "test"]

# Columns dropped from the feature tables as irrelevant.
irrelevant_columns = [
    "Currency_Country",
    "Beneficiary_BIC",
    "Currency",
    "Receiver_BIC",
    "Sender_BIC",
]

# All files for this site live under <site_input_dir>/<site_name>/.
site_dir = os.path.join(site_input_dir, site_name)

df_feats = {}
df_edges = {}
for ds_name in dataset_names:
    # Feature table (including the class column), indexed by the first CSV column
    feature_path = os.path.join(site_dir, f"{ds_name}_normalized.csv")
    feature_table = pd.read_csv(feature_path, index_col=0)
    df_feats[ds_name] = feature_table.drop(columns=irrelevant_columns)

    # Edge map: a header-less two-column CSV, one edge (pair of UETRs) per row
    edge_path = os.path.join(site_dir, f"{ds_name}_edgemap.csv")
    edge_map = pd.read_csv(edge_path, header=None)
    edge_map.columns = ["UETR_1", "UETR_2"]
    df_edges[ds_name] = edge_map

## Prepare Data for Unsupervised GNN Training

In [None]:
import numpy as np
import torch

# Per-split containers, all keyed by dataset name ("train" / "test").
node_ids = {}       # UETR of every node, in node-index order
node_features = {}  # float tensor of node features, one row per node
edge_indices = {}   # long tensor of shape (2, num_edges) holding node indexes
weights = {}        # float tensor with one weight (always 1) per edge
labels = {}         # class label of every node, in node-index order

for ds_name in dataset_names:
    df_edge = df_edges[ds_name]

    # Sort by UETR and reset the index so that row position == node index
    # for every per-node array built below.
    df_feat_class = df_feats[ds_name].sort_values(by="UETR").reset_index(drop=True)

    # UETR -> node index lookup
    node_id = df_feat_class["UETR"].values
    map_id = {uetr: idx for idx, uetr in enumerate(node_id)}
    node_ids[ds_name] = node_id

    # Class labels, aligned with the node indexes
    labels[ds_name] = df_feat_class["Class"].values

    # Translate both edge endpoints from UETR to node index
    edges = df_edge.copy()
    edges["UETR_1"] = edges["UETR_1"].map(map_id)
    edges["UETR_2"] = edges["UETR_2"].map(map_id)

    # A UETR missing from the feature table maps to NaN; fail with an explicit
    # message instead of an opaque NaN-to-int cast error.
    unmapped = edges.isna().any(axis=1)
    if unmapped.any():
        raise ValueError(
            f"{int(unmapped.sum())} edge(s) in the '{ds_name}' edge map reference a UETR "
            "that is not present in the feature table"
        )
    edges = edges.astype(int)

    # Shape (2, num_edges): row 0 = first endpoint, row 1 = second endpoint.
    # Edges are kept exactly as listed in the edge map; no reverse edges are added here.
    edge_index = torch.tensor(edges.to_numpy().T, dtype=torch.long).contiguous()
    edge_indices[ds_name] = edge_index
    # Uniform edge weights
    weights[ds_name] = torch.ones(edge_index.shape[1], dtype=torch.float)

    # Everything except the node id and the class label is a node feature
    node_feature = df_feat_class.drop(columns=["UETR", "Class"])
    node_features[ds_name] = torch.tensor(node_feature.to_numpy(), dtype=torch.float)


## Unsupervised GNN Training

In [None]:
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from torch_geometric.data import Data
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.nn import GraphSAGE

# Model checkpoint and TensorBoard logs are written next to the site's data.
output_dir = os.path.join(site_input_dir, site_name)
# Use the first GPU when one is available; otherwise fall back to the CPU so the notebook still runs.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
writer = SummaryWriter(output_dir)
epochs = 100

# Converting data to PyG graph data format
train_data = Data(
    x=node_features['train'], edge_index=edge_indices['train'], edge_attr=weights['train']
)

# Define the dataloader for graphsage training: mini-batches of edges with sampled
# neighborhoods (two hops, matching the two GraphSAGE layers) and sampled negative edges.
loader = LinkNeighborLoader(
    train_data,
    batch_size=2048,
    shuffle=True,
    neg_sampling_ratio=1.0,
    num_neighbors=[10, 10],
    num_workers=6,
    persistent_workers=True,
)

# Model
model = GraphSAGE(
    in_channels=node_features['train'].shape[1],
    hidden_channels=64,
    num_layers=2,
    out_channels=64,
)
# Move the model to its device before building the optimizer over its parameters.
model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(1, epochs + 1):
    model.train()
    running_loss = instance_count = 0

    for data in loader:
        # get the inputs data
        data = data.to(DEVICE)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        h = model(data.x, data.edge_index)
        h_src = h[data.edge_label_index[0]]
        h_dst = h[data.edge_label_index[1]]
        # Link score = inner product of the two endpoint embeddings, used as a logit
        link_pred = (h_src * h_dst).sum(dim=-1)
        loss = F.binary_cross_entropy_with_logits(link_pred, data.edge_label)
        loss.backward()
        optimizer.step()
        # Accumulate the loss weighted by the number of scored links, so the
        # epoch value is a per-link average regardless of batch size
        running_loss += loss.item() * link_pred.numel()
        instance_count += link_pred.numel()

    epoch_loss = running_loss / instance_count
    print(f"Epoch: {epoch:02d}, Loss: {epoch_loss:.4f}")
    writer.add_scalar("train_loss", epoch_loss, epoch)

# Flush pending events to disk and release the log file handle
writer.close()

# Save the model
torch.save(model.state_dict(), os.path.join(output_dir, "model.pt"))

## GNN Inference - Encoding the Raw Features

In [None]:
# Load the model and perform inference / encoding.
# The architecture below must match the one used for training so the saved state dict loads.
model_enc = GraphSAGE(
    in_channels=node_features['train'].shape[1],
    hidden_channels=64,
    num_layers=2,
    out_channels=64,
)
# model_enc stays on the CPU, so load the weights onto the CPU even if they were saved from a GPU model
state_dict = torch.load(os.path.join(output_dir, "model.pt"), map_location="cpu")
model_enc.load_state_dict(state_dict)
model_enc.eval()

embeds = {}
# Perform encoding; gradients are not needed for inference, so skip building the autograd graph
with torch.no_grad():
    for ds_name in dataset_names:
        h = model_enc(node_features[ds_name], edge_indices[ds_name])
        embed = pd.DataFrame(h.cpu().numpy())
        # Name the embedding columns V_0, V_1, ...
        embed.columns = [f"V_{i}" for i in range(embed.shape[1])]
        # Put the node ids and class labels in front of the encoded features
        embed.insert(0, "UETR", node_ids[ds_name])
        embed.insert(1, "Class", labels[ds_name])
        embed.to_csv(os.path.join(output_dir, f"{ds_name}_embedding.csv"), index=False)
        embeds[ds_name] = embed

In [None]:
! tree /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1

In [None]:
# Inspect the encoded training set: UETR, Class, then the embedding columns V_0, V_1, ...
embeds["train"]

Let's go back to the [XGBoost Notebook](../xgboost.ipynb)