In [None]:
# Mount Google Drive inside the Colab VM at /content/drive so that the project
# folder under MyDrive can be read and written through ordinary file paths.
# NOTE: the google.colab package is specific to the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Imports
import os
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report, recall_score, precision_score
from joblib import Parallel, delayed
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle

# Installing required packages
!pip install torch-geometric

import torch_geometric.transforms as T
from torch_geometric.nn import HeteroConv, SAGEConv, Linear
from torch_geometric.data import HeteroData
from torch.autograd import Variable

# Project root on the mounted Drive; also made the working directory.
# NOTE: `loc` ends with a trailing slash. Path templates further down such as
# '{}Graph storage/...' concatenate directly onto it and rely on that slash.
loc = "/content/drive/MyDrive/KE_GNN/"
os.chdir(loc)
os.getcwd()  # value is discarded here: only a cell's last expression is displayed

# Upper bound for the epoch counter.
# NOTE: the training loop iterates range(1, epoch_n), i.e. epochs 1..epoch_n-1
# (210 optimisation steps with this value).
epoch_n = 211
# Run identifier; used as a suffix in checkpoint and result file names.
run_number = 1

def move_to_device(obj, device):
    '''
    Recursively relocate every tensor found in a (possibly nested) container.

    Tensors are moved with ``.to(device)``. Dicts, lists, tuples and sets are
    rebuilt as a new container of the same basic kind with their elements
    processed recursively (dict keys are kept as they are). Any other object
    is returned untouched.

    Args:
        obj: A tensor, a container of tensors, or an arbitrary object.
        device: Target device, in any form accepted by ``Tensor.to``.

    Returns:
        The moved tensor, the rebuilt container, or ``obj`` itself.
    '''
    if isinstance(obj, torch.Tensor):
        return obj.to(device)

    if isinstance(obj, dict):
        moved = {}
        for key, value in obj.items():
            moved[key] = move_to_device(value, device)
        return moved

    if isinstance(obj, list):
        return [move_to_device(item, device) for item in obj]

    if isinstance(obj, tuple):
        # tuple subclasses (e.g. namedtuples) come back as plain tuples
        return tuple([move_to_device(item, device) for item in obj])

    if isinstance(obj, set):
        return set(move_to_device(item, device) for item in obj)

    # not a tensor and not a supported container: nothing to move
    return obj

# Each split consists of a serialized graph (torch.load) plus a pickled
# dictionary of clause locations for that split.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# files you produced yourself. Recent PyTorch releases also default
# torch.load to weights_only=True, which may reject full graph objects --
# confirm against the installed version.

# training split
data_train = torch.load(f'{loc}Graph storage/train_graph.pt')
with open(f'{loc}Clause Storage/train_KE_location_large.pkl', 'rb') as handle:
    train_KE_location = pickle.load(handle)

# validation split
data_valid = torch.load(f'{loc}Graph storage/valid_graph.pt')
with open(f'{loc}Clause Storage/valid_KE_location_large.pkl', 'rb') as handle:
    valid_KE_location = pickle.load(handle)

# test split
data_test = torch.load(f'{loc}Graph storage/test_graph.pt')
with open(f'{loc}Clause Storage/test_KE_location_large.pkl', 'rb') as handle:
    test_KE_location = pickle.load(handle)

# knowledge-enhancement definitions shared by all splits
with open(f'{loc}Clause Storage/Knowledge_enhancements_large.pkl', 'rb') as handle:
    KE_conditions = pickle.load(handle)

# first CUDA device when one is available, otherwise the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


def subset_dict(source, keys):
    '''
    Return a new dict holding only the requested keys of ``source``.

    This helper was called but never defined in the notebook, so a fresh
    kernel failed with NameError.
    NOTE(review): the original implementation is not visible; failing loudly
    on a missing key is an assumption -- confirm it matches the intended
    behaviour.

    Args:
        source: Dictionary to filter.
        keys: Iterable of keys to keep; the result follows this order.

    Returns:
        A new dict ``{k: source[k] for k in keys}``.

    Raises:
        KeyError: If any requested key is absent from ``source``.
    '''
    missing = [k for k in keys if k not in source]
    if missing:
        raise KeyError('keys not present in dictionary: {}'.format(missing))
    return {k: source[k] for k in keys}


# Restrict every clause dictionary to the same five rules so that the model's
# per-clause parameters and the per-split clause locations share one key set.
keys_keep = ['RULE1', 'RULE2', 'RULE3', 'RULE4', 'RULE5']
train_KE_location = subset_dict(train_KE_location, keys_keep)
valid_KE_location = subset_dict(valid_KE_location, keys_keep)
test_KE_location = subset_dict(test_KE_location, keys_keep)
KE_conditions = subset_dict(KE_conditions, keys_keep)

def f1_finder(pred, true, max_val):
    '''
    Grid-search the decision threshold that gives the highest binary F1-score.

    Args:
        pred: Array of predicted scores for the positive class.
        true: Ground-truth binary labels.
        max_val: Upper end of the threshold grid (the grid starts at 0).

    Returns:
        The grid value with the highest F1-score. On ties the smallest such
        threshold is returned, because the first maximum is taken.
    '''
    # 200 evenly spaced candidates covering [0, max_val], both ends included
    candidates = np.linspace(0, max_val, num=200, endpoint=True)

    def _score_at(cutoff):
        # a sample is labelled positive only when its score is strictly above the cutoff
        hard_labels = (pred > cutoff).astype(int)
        return f1_score(true, hard_labels, zero_division=0.0)

    # one job per candidate threshold, using every available core
    runner = Parallel(n_jobs=-1)
    scores = runner(delayed(_score_at)(cutoff) for cutoff in candidates)

    return candidates[np.argmax(scores)]

# Using the Heterogeneous Convolution Wrapper
class HeteroGNN(torch.nn.Module):
    '''
    Heterogeneous GraphSAGE network with optional clause-based output scaling.

    The network applies two stacks of HeteroConv layers (convs1, then convs2),
    each followed by a leaky ReLU, and then a linear layer plus sigmoid on the
    'transaction' node features. When knowledge enhancement is enabled, the
    sigmoid output is rescaled by one learnable scalar per clause wherever that
    clause's mask is set.

    NOTE(review): module registration order determines the state_dict keys and
    the parameter order seen by the optimizer; saved checkpoints depend on it.
    '''
    def __init__(self, hidden_channels, out_channels, num_layers1, num_layers2, KE, KE_dictionary, conditions):
        '''
        Args:
            hidden_channels: Output width of every SAGEConv layer.
            out_channels: Output width of the final linear layer.
            num_layers1: Number of HeteroConv layers in the first stack.
            num_layers2: Number of HeteroConv layers in the second stack.
            KE: Not used in the constructor.
            KE_dictionary: Dict whose keys name the clauses; one scalar
                parameter is created per key (the values are not used).
            conditions: Not used in the constructor.
        '''
        super().__init__()


        self.convs1 = torch.nn.ModuleList()
        for _ in range(num_layers1):
          # first convolution networks
          # input sizes are given as (-1, -1); presumably inferred lazily on
          # the first forward pass -- see the PyG documentation
            conv = HeteroConv({
              ('user', 'owns', 'card'): SAGEConv((-1, -1), hidden_channels),
              ('card', 'transfer', 'transaction'): SAGEConv((-1, -1), hidden_channels),
              ('location', 'rev_happend_at', 'user'): SAGEConv((-1, -1), hidden_channels),
              ('transaction', 'rev_transfer', 'merchant'): SAGEConv((-1, -1), hidden_channels),
              ('user', 'bought', 'transaction'): SAGEConv((-1, -1), hidden_channels),
              ('user', 'bought_from', 'merchant'): SAGEConv((-1, -1), hidden_channels),
              ('merchant', 'rev_bought_from', 'user'): SAGEConv((-1, -1), hidden_channels),
              ('transaction', 'bought_in', 'location'): SAGEConv((-1, -1), hidden_channels),
              ('card', 'bought_with', 'merchant'): SAGEConv((-1, -1), hidden_channels),
              ('merchant', 'rev_bought_with', 'card'): SAGEConv((-1, -1), hidden_channels),
              }, aggr='sum')
            self.convs1.append(conv)

        self.convs2 = torch.nn.ModuleList()
        for _ in range(num_layers2):
          # second convolution networks. focusing purely on transactions
          # (every edge type below has 'transaction' as source or destination)
            conv = HeteroConv({
               ('card', 'transfer', 'transaction'): SAGEConv((-1, -1), hidden_channels),
              ('transaction', 'rev_transfer', 'card'): SAGEConv((-1, -1), hidden_channels),
              ('merchant', 'transfer', 'transaction'): SAGEConv((-1, -1), hidden_channels),
              ('transaction', 'rev_transfer', 'merchant'): SAGEConv((-1, -1), hidden_channels),
              ('user', 'bought', 'transaction'): SAGEConv((-1, -1), hidden_channels),
              ('transaction', 'rev_bought', 'user'): SAGEConv((-1, -1), hidden_channels),
              ('transaction', 'bought_in', 'location'): SAGEConv((-1, -1), hidden_channels),
              ('location', 'rev_bought_in', 'transaction'): SAGEConv((-1, -1), hidden_channels),
              }, aggr='sum')
            self.convs2.append(conv)
        # creating parameter dictionary for clauses
        # one learnable scalar per clause key, initialised to 1; forward()
        # applies (weight - 1), so the initial value leaves the output unchanged
        self.params = nn.ParameterDict()
        for k, v in KE_dictionary.items():
          self.params[k] = nn.Parameter(torch.tensor([1], dtype=torch.float32))


        self.lin = Linear(hidden_channels, out_channels)
    def forward(self, x_dict, edge_index_dict, KE, KE_dictionary, conditions):
      '''
      Compute a sigmoid score for every 'transaction' node.

      Args:
          x_dict: Node features keyed by node type.
          edge_index_dict: Edge indices keyed by edge type.
          KE: When truthy, apply the clause scaling and also return the
              clause weights.
          KE_dictionary: Not used in forward.
          conditions: Dict mapping clause name to a tensor; clauses whose
              tensor sums to 0 are skipped. Every key must also exist in
              self.params.
              # assumes each tensor is a 0/1 mask that broadcasts elementwise
              # against the transaction output -- TODO confirm shapes

      Returns:
          If KE is truthy: (final_out, rule_outputs), where rule_outputs is a
          list of (clause name, current weight as float) for applied clauses.
          Otherwise: the sigmoid output tensor alone.
      '''
      # first convolution layer
      for conv in self.convs1:
        x_dict = conv(x_dict, edge_index_dict)
        x_dict = {key: F.leaky_relu(x) for key, x in x_dict.items()}
      # second convolution layer
      for conv in self.convs2:
        x_dict = conv(x_dict, edge_index_dict)
        x_dict = {key: F.leaky_relu( x) for key, x in x_dict.items()}
      # linear layer
      no_KE_output = torch.sigmoid(self.lin(x_dict['transaction']))
      if KE:
            # creating copy of output, dictionary of clauses, outputs
            # (this binds a second name, not a copy; no_KE_output is never
            # modified because all updates below are out-of-place)
            final_out = no_KE_output
            rule_outputs = []
            KE_output_dic = {}
            for k, v in conditions.items():
              #only record if there is atleast one transaction which adheres to the clause
                if torch.sum(v) > 0:
                    # additive correction (w_k - 1) * v * out: where v is 1 the
                    # score becomes w_k * out, where v is 0 it is unchanged
                    KE_output_dic[k] = ((self.params[k]-1) * v) * final_out
                    rule_outputs.append((k,self.params[k].item()))
                    # clauses are applied sequentially on the running output;
                    # clamping keeps the result a valid input for BCELoss
                    final_out = torch.clamp(KE_output_dic[k] + final_out, min = 0, max= 1)
            return final_out, rule_outputs
      else:
          return no_KE_output

# Creating model
# KE_TF toggles knowledge enhancement for every forward pass in this notebook.
# NOTE(review): the training/evaluation code unpacks `out, _ = model(...)`,
# which matches the model's return value only when KE_TF is True.
KE_TF = True
model = HeteroGNN(hidden_channels=168, out_channels=1, num_layers1=2,
              num_layers2 =2, KE = KE_TF, KE_dictionary = KE_conditions,
              conditions = train_KE_location)




# move to device (if needed)
# (re-selects the device; 'cuda' here versus 'cuda:0' where `device` was first set)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
data_test = data_test.to(device)
data_train = data_train.to(device)
# NOTE(review): result not re-assigned, unlike the two lines above; this relies
# on .to() moving the graph object in place -- confirm for the installed PyG.
data_valid.to(device)
# move dictionaries to device (if needed)
KE_conditions = move_to_device(KE_conditions, device)
train_KE_location = move_to_device(train_KE_location, device)
valid_KE_location = move_to_device(valid_KE_location, device)
test_KE_location = move_to_device(test_KE_location, device)

# initializing model
# One gradient-free forward pass before the optimizer is built. The SAGEConv
# layers are declared with (-1, -1) input sizes, so presumably their weights
# only get concrete shapes during this call -- keep it ahead of the
# model.parameters() call below. `out` itself is not used afterwards.
with torch.no_grad():
    out = model(data_train.x_dict, data_train.edge_index_dict,
                KE = KE_TF, KE_dictionary = KE_conditions, conditions = train_KE_location)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# BCELoss expects probabilities in [0, 1]; the model output is a (clamped) sigmoid
criterion = torch.nn.BCELoss()


def inductive_train():
    '''
    Perform a single full-graph optimisation step on the training graph.

    Relies on the notebook-level ``model``, ``optimizer``, ``criterion``,
    ``data_train``, ``KE_TF``, ``KE_conditions`` and ``train_KE_location``.

    Returns:
        loss: The training loss of this step as a tensor detached from the
            autograd graph, so callers can store or serialise it without
            keeping the graph alive.
        rule_outputs: List of (clause name, weight) pairs reported by the
            model; an empty list when knowledge enhancement is disabled.
    '''
    model.train()
    optimizer.zero_grad()  # clear gradients left over from the previous step
    result = model(data_train.x_dict, data_train.edge_index_dict,
                   KE=KE_TF, KE_dictionary=KE_conditions,
                   conditions=train_KE_location)
    # the model returns (output, clause_weights) only when KE is enabled;
    # otherwise it returns the bare output tensor
    out, weights = result if KE_TF else (result, [])
    loss = criterion(out, data_train['transaction'].y)
    loss.backward()
    optimizer.step()
    return loss.detach(), weights


def test():
    '''
    Score the validation graph with the current model.

    Relies on the notebook-level ``model``, ``data_valid``, ``KE_TF``,
    ``KE_conditions`` and ``valid_KE_location``.

    Returns:
        pred: numpy array of predicted scores for the validation transactions.
        true_labels: numpy array of the corresponding ground-truth labels.
    '''
    model.eval()
    # inference only: skip autograd bookkeeping for the whole validation graph
    with torch.no_grad():
        result = model(data_valid.x_dict, data_valid.edge_index_dict,
                       KE=KE_TF, KE_dictionary=KE_conditions,
                       conditions=valid_KE_location)
    # the model returns (output, clause_weights) only when KE is enabled
    out = result[0] if KE_TF else result
    pred = out.detach().cpu().numpy()
    true_labels = data_valid['transaction'].y.cpu()
    return pred, true_labels.numpy()



# Training loop. Every 10th epoch the model is scored on the validation graph;
# whenever the validation F1 (at its best threshold) improves, the model and
# optimizer state are checkpointed so the best model can be reloaded later.
f1_best = 0
f1 = 0
# stays None until a validation round beats f1_best, so the summary print
# below cannot fail with NameError
best_thresh = None
best_model_state = None  # not used below; retained so the notebook namespace is unchanged
measures = []
weights = []
checkpoint_path = '{}/Main Experiment/model_storage/KeGCN5_{}.pt'.format(loc,run_number)
# make sure the checkpoint folder exists before the first save
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
for epoch in range(1, epoch_n):
    print(epoch)
    loss, weights_raw = inductive_train()
    weights.append(weights_raw.copy())
    if epoch % 10 == 0:
        # plain float: keeps tensors (and their device) out of the results
        # table and the checkpoint
        loss_value = float(loss)
        print(f'Epoch: {epoch:03d}, Loss: {loss_value:.4f}')
        pred, truess = test()

        threshold = f1_finder(pred, truess, 1.0)
        predss_thres = (pred > threshold).astype(int)

        f1 = f1_score(truess, predss_thres, zero_division = 0.0)
        recall = recall_score(truess, predss_thres, zero_division = 0.0)
        prc = precision_score(truess, predss_thres, zero_division = 0.0)

        print('Best current threshold:', threshold, 'Best F1 score:', f1, ' number of fraud: ', np.sum(truess))
        print(classification_report(truess, predss_thres))
        measures.append([epoch, loss_value, f1, threshold, recall, prc])
        if f1 > f1_best:
            print('new best model')
            f1_best = f1  # Update the best F1 score
            best_thresh = threshold
            # save best current model
            torch.save({'epoch': epoch,'model_state_dict': model.state_dict(),
                      'optimizer_state_dict': optimizer.state_dict(),
                      'loss': loss_value, }, checkpoint_path)
print('final best f1: {}, best threshold: {}'.format(f1_best, best_thresh))


# Per-epoch clause weights: one row per epoch, one column per clause.
extracted_data = [dict(rule_set) for rule_set in weights]
df3 = pd.DataFrame(extracted_data)

# Re-create the model so the best checkpoint can be loaded into a fresh instance.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HeteroGNN(hidden_channels=168, out_channels=1, num_layers1=2,
                num_layers2 = 2, KE = KE_TF, KE_dictionary = KE_conditions,
                conditions = test_KE_location)

# Move everything this evaluation actually uses (the test split) to the device.
model.to(device)
data_test = data_test.to(device)
KE_conditions = move_to_device(KE_conditions, device)
test_KE_location = move_to_device(test_KE_location, device)

criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Load the best performing model; map_location lets a checkpoint written on
# GPU be restored on a CPU-only runtime as well.
checkpoint = torch.load('{}/Main Experiment/model_storage/KeGCN5_{}.pt'.format(loc,run_number),
                        map_location=device)

# Restore the model and optimizer state
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

# Evaluation mode, no gradient tracking
model.eval()

test_measure = []
with torch.no_grad():
    result = model(data_test.x_dict, data_test.edge_index_dict, KE_TF, KE_conditions, test_KE_location)
# the model returns (output, clause_weights) only when KE is enabled
out = result[0] if KE_TF else result
pred = out.detach().cpu().numpy()
true_labels = data_test['transaction'].y.cpu().numpy()

# Test metrics at the threshold selected on the validation set
predss_thres = (pred > best_thresh).astype(int)
best_validation_threshold = float(best_thresh)  # works for numpy scalars and plain floats
f1_val_thresh_test_set = f1_score(true_labels, predss_thres, zero_division = 0.0)
print('validation optimised test results:')
print(classification_report(true_labels, predss_thres))
recall_val_thres_test = recall_score(true_labels, predss_thres, zero_division = 0.0)
precision_val_thres_test = precision_score(true_labels, predss_thres, zero_division = 0.0)

# Test metrics at the threshold tuned on the test set itself.
# NOTE: this threshold has seen the test labels, so these numbers are an
# optimistic upper bound rather than an unbiased estimate.
threshold = f1_finder(pred, true_labels, 1.0)
test_optimised_prediction = (pred > threshold).astype(int)

f1_test_optimised = f1_score(true_labels, test_optimised_prediction, zero_division = 0.0)
recall_test_optimised = recall_score(true_labels, test_optimised_prediction, zero_division = 0.0)
precision_test_optimised = precision_score(true_labels, test_optimised_prediction, zero_division = 0.0)

test_measure.append([f1_val_thresh_test_set, best_validation_threshold, recall_val_thres_test, precision_val_thres_test,
                   f1_test_optimised, threshold, recall_test_optimised, precision_test_optimised])

# final model results (the 'test_train_*' columns hold the validation-threshold metrics)
df2 = pd.DataFrame(test_measure, columns = ['test_train_thresh_f1', 'test_train_thresh', 'test_train_recall', 'test_train_precision',
                                          'test_f1','test_thresh', 'test_recall', 'test_precision'])
# training results
df1 = pd.DataFrame(measures, columns=['epoch', 'training loss', 'optimised_f1','threshold', 'recall', 'precision'])
# store losses as plain numbers even if tensors were collected during training
df1['training loss'] = df1['training loss'].map(float)

# Save the result tables; create the output folder first so to_csv cannot
# fail on a non-existent directory.
output_dir = '{}/Main Experiment/output/KeGCN_5_clause'.format(loc)
os.makedirs(output_dir, exist_ok=True)
df1.to_csv('{}/KE_GCN_training_results_{}.csv'.format(output_dir, run_number))
df2.to_csv('{}/KE_GCN_test_results_{}.csv'.format(output_dir, run_number))
df3.to_csv('{}/KE_GCN_train_weights_{}.csv'.format(output_dir, run_number))





Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
1
2
3
4
5
6
7
8
9
10
Epoch: 010, Loss: 0.0266
Best current threshold: 0.0 Best F1 score: 0.002464221182472929  number of fraud:  3209.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00   2598056
         1.0       0.00      1.00      0.00      3209

    accuracy                           0.00   2601265
   macro avg       0.00      0.50      0.00   2601265
weighted avg       0.00      0.00      0.00   2601265

new best model
final best f1: 0.002464221182472929, best threshold: 0.0
validation optimised test results:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00   2598314
         1.0       0.00      1.00      0.00      2951

    accuracy                           0.00   2601265
   macro avg       0.00      0.50      0.00   2601265
weighted avg       0.00      0.00      0.00   2601265



In [None]:
# NOTE(review): this cell repeats the evaluation from the first cell against a
# different checkpoint file. It depends on state left behind by earlier cells
# (`weights`, `measures`, `best_thresh`, `test_measure`) and appends another
# row to `test_measure` every time it is run. The checkpoint it loads is only
# written by the torch.save cell further down, so on a top-to-bottom run the
# file may not exist yet -- consider folding both evaluations into one function.

# Per-epoch clause weights: one row per epoch, one column per clause.
extracted_data = [dict(rule_set) for rule_set in weights]
df3 = pd.DataFrame(extracted_data)

# Re-create the model so the checkpoint can be loaded into a fresh instance.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HeteroGNN(hidden_channels=168, out_channels=1, num_layers1=2,
                num_layers2 = 2, KE = KE_TF, KE_dictionary = KE_conditions,
                conditions = test_KE_location)

# Move everything this evaluation actually uses (the test split) to the device.
model.to(device)
data_test = data_test.to(device)
KE_conditions = move_to_device(KE_conditions, device)
test_KE_location = move_to_device(test_KE_location, device)

criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# map_location lets a checkpoint written on GPU be restored on a CPU-only runtime.
checkpoint = torch.load('{}/model_storage/KeGCN10_{}.pt'.format(loc,run_number),
                        map_location=device)

# Restore the model and optimizer state
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

# Evaluation mode, no gradient tracking
model.eval()

with torch.no_grad():
    result = model(data_test.x_dict, data_test.edge_index_dict, KE_TF, KE_conditions, test_KE_location)
# the model returns (output, clause_weights) only when KE is enabled
out = result[0] if KE_TF else result
pred = out.detach().cpu().numpy()
true_labels = data_test['transaction'].y.cpu().numpy()

# Test metrics at the threshold selected on the validation set
predss_thres = (pred > best_thresh).astype(int)
best_validation_threshold = float(best_thresh)  # works for numpy scalars and plain floats
f1_val_thresh_test_set = f1_score(true_labels, predss_thres, zero_division = 0.0)
print('validation optimised test results:')
print(classification_report(true_labels, predss_thres))
recall_val_thres_test = recall_score(true_labels, predss_thres, zero_division = 0.0)
precision_val_thres_test = precision_score(true_labels, predss_thres, zero_division = 0.0)

# Test metrics at the threshold tuned on the test set itself (optimistic:
# the threshold has seen the test labels).
threshold = f1_finder(pred, true_labels, 1.0)
test_optimised_prediction = (pred > threshold).astype(int)

f1_test_optimised = f1_score(true_labels, test_optimised_prediction, zero_division = 0.0)
recall_test_optimised = recall_score(true_labels, test_optimised_prediction, zero_division = 0.0)
precision_test_optimised = precision_score(true_labels, test_optimised_prediction, zero_division = 0.0)

test_measure.append([f1_val_thresh_test_set, best_validation_threshold, recall_val_thres_test, precision_val_thres_test,
                   f1_test_optimised, threshold, recall_test_optimised, precision_test_optimised])

# final model results (the 'test_train_*' columns hold the validation-threshold metrics)
df2 = pd.DataFrame(test_measure, columns = ['test_train_thresh_f1', 'test_train_thresh', 'test_train_recall', 'test_train_precision',
                                          'test_f1','test_thresh', 'test_recall', 'test_precision'])
# training results
df1 = pd.DataFrame(measures, columns=['epoch', 'training loss', 'optimised_f1','threshold', 'recall', 'precision'])
# store losses as plain numbers even if tensors were collected during training
df1['training loss'] = df1['training loss'].map(float)




validation optimised test results:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00   2598314
         1.0       0.00      1.00      0.00      2951

    accuracy                           0.00   2601265
   macro avg       0.00      0.50      0.00   2601265
weighted avg       0.00      0.00      0.00   2601265



OSError: Cannot save file into a non-existent directory: 'KE_GCN/KeGCN_10_clause'

In [None]:
# Write the model/optimizer state currently in memory to the KeGCN10
# checkpoint file, together with the `epoch` and `loss` values currently bound
# in the notebook.
# NOTE(review): nothing in this cell trains a 10-clause model; it saves
# whatever `model` happens to be live -- confirm this is intended.
checkpoint_path = '{}/model_storage/KeGCN10_{}.pt'.format(loc,run_number)
checkpoint_payload = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
}
torch.save(checkpoint_payload, checkpoint_path)

In [None]:
# NOTE(review): rebinding `loc` here changes every path built from it in cells
# run afterwards. This value has no trailing slash and already ends in
# 'Main Experiment', so templates such as '{}Graph storage/...' and
# '{}/Main Experiment/...' used in the first cell would resolve to different
# locations. Prefer a separate, clearly named variable.
loc = "/content/drive/MyDrive/KE_GNN/Main Experiment"
