In [1]:
import os
import random
import gc

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
import torch
from torch_geometric.data import DataLoader
from torch_geometric.nn import HeteroConv, GATConv, Linear, SAGEConv, to_hetero
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import Dropout
from torch.nn.functional import relu, tanh, softmax
import torch_geometric.transforms as T

In [4]:
from EnhancedGATModel import EnhancedGATModel

In [5]:
from IPython.core.debugger import set_trace

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
def LoadGraph(file_name):
    graph = torch.load(file_name)
    graph['pending_transaction'].x = graph['pending_transaction'].x.unsqueeze(0)
    return graph

In [45]:
def feature_scale(model_out):
    min = model_out.min()
    max = model_out.max()
    result = (model_out - min) / (max - min)
    return result

In [54]:
fraud = torch.tensor([[1., 0.]], dtype=torch.float)
legit = torch.tensor([[0., 1.]], dtype=torch.float)
fraud=fraud.to(device)
legit=legit.to(device)
def validate(model, valid_data, negative_bias=0):
    class_weight = torch.tensor([[negative_bias,0]])
    class_weight = class_weight.to(device)
    model.eval()
    correct = 0
    total = 0
    total_loss = 0.0
    avg_total_loss = 0.0
    true_positive, true_negative, false_positive, false_negative = 0,0,0,0
    
    with torch.no_grad():
        for file_path in valid_data:
            graph = LoadGraph(file_path)
            graph = graph.to(device)

            try:                
                out = feature_scale(model(graph.x_dict, graph.edge_index_dict))# - class_weight
                target = fraud if graph.y==1 else legit
                loss = F.binary_cross_entropy_with_logits(out, target)
                print((out,target,loss))
                #print(out.shape)
            except Exception as e:
                print(f'\nFile:{file_path}, error {e}')
                continue;
                set_trace()

            else:
                total_loss += loss.item()
                total += target.size(0)                
                
                predicted = torch.argmax(out, dim=1)
                actual = torch.argmax(target, dim=1)
                #print(f'out={out}, target={target}, predicted={predicted}, actual={actual}')
                true_positive += ((predicted == 0) & (actual == 0)).sum().item()
                true_negative += ((predicted == 1) & (actual == 1)).sum().item()
                false_positive += ((predicted == 0) & (actual == 1)).sum().item()
                false_negative += ((predicted == 1) & (actual == 0)).sum().item()

                correct += (predicted == actual).sum().item()
                
            if total%10 == 0:
                print(".", end="")
            del graph
        gc.collect()

    results_dict = {
        "True Positives": true_positive,
        "True Negatives": true_negative,
        "False Positives": false_positive,
        "False Negatives": false_negative
    }

    accuracy = correct / total
    average_loss = total_loss / total
    return accuracy, average_loss, results_dict

In [9]:
graph_root_directory = '/AiDev/data/Set4'

In [10]:
graphs = pd.read_csv('answer_key_Set4.csv')

In [11]:
fraud_set = graphs[graphs['is_fraud']==1.0]
legit_set = graphs[graphs['is_fraud']==0.0]
print(f'graphs size: {len(graphs)}, frauds: {len(fraud_set)}, legit_set: {len(legit_set)}')

graphs size: 82128, frauds: 27376, legit_set: 54752


In [12]:
tiny_fraud = fraud_set.sample(frac=.05)
tiny_legit = legit_set.sample(n=len(tiny_fraud))
tiny_set = pd.concat([tiny_fraud, tiny_legit])
#tiny_set = tiny_fraud
#print(len(tiny_fraud), len(tiny_set))
print(len(tiny_fraud), len(tiny_legit), len(tiny_set))

1369 1369 2738


In [13]:
final_set = tiny_set['file_path']

In [14]:
train_set, valid_set = train_test_split(final_set, test_size=0.20, random_state=85)

In [15]:
#model = EnhancedGATModel(hidden_channels=24, out_channels=2)
model = torch.load("model_EnhancedGAT_7007")
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001, weight_decay=.001)
model = model.to(device)

In [16]:
epochs=1
train_show = False

In [55]:
accuracy, average_loss, results_dict = validate(model, valid_set, negative_bias=1.5)
print(f'Validation Accuracy: {accuracy:.4f}, Validation Loss: {average_loss:.4f}, \n{results_dict}')

(tensor([[1., 0.]], device='cuda:0'), tensor([[1., 0.]], device='cuda:0'), tensor(0.5032, device='cuda:0'))
(tensor([[0., 1.]], device='cuda:0'), tensor([[1., 0.]], device='cuda:0'), tensor(1.0032, device='cuda:0'))
(tensor([[1., 0.]], device='cuda:0'), tensor([[1., 0.]], device='cuda:0'), tensor(0.5032, device='cuda:0'))
(tensor([[1., 0.]], device='cuda:0'), tensor([[1., 0.]], device='cuda:0'), tensor(0.5032, device='cuda:0'))
(tensor([[1., 0.]], device='cuda:0'), tensor([[1., 0.]], device='cuda:0'), tensor(0.5032, device='cuda:0'))
(tensor([[0., 1.]], device='cuda:0'), tensor([[0., 1.]], device='cuda:0'), tensor(0.5032, device='cuda:0'))
(tensor([[1., 0.]], device='cuda:0'), tensor([[1., 0.]], device='cuda:0'), tensor(0.5032, device='cuda:0'))
(tensor([[0., 1.]], device='cuda:0'), tensor([[0., 1.]], device='cuda:0'), tensor(0.5032, device='cuda:0'))
(tensor([[1., 0.]], device='cuda:0'), tensor([[1., 0.]], device='cuda:0'), tensor(0.5032, device='cuda:0'))
(tensor([[1., 0.]], device='