In [None]:
# Install PyTorch 1.12 with torchvision, torchaudio and the CUDA 11.3 toolkit
# from the `pytorch` channel (-y: no confirmation prompt, -q: quiet output).
!mamba install pytorch=1.12 torchvision torchaudio cudatoolkit=11.3 -c pytorch -y -q

In [None]:
# Install the PyTorch Geometric packages. The scatter/sparse wheels come from
# the index matching torch 1.12.0 + CUDA 11.3.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+cu113.html
%pip install torch-sparse -f https://data.pyg.org/whl/torch-1.12.0+cu113.html
%pip install torch-geometric

In [None]:
# Install pyts from conda-forge (-q: quiet, -y: no confirmation prompt);
# the notebook imports MarkovTransitionField from it.
!mamba install -c conda-forge pyts -q -y

In [None]:
# Remaining Python dependencies.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# NOTE(review): llvmpy looks obsolete (numba presumably relies on llvmlite instead) -
# confirm whether this line is still needed.
%pip install llvmpy
%pip install cython
%pip install numba
%pip install pandas
%pip install networkx
%pip install matplotlib

In [1]:
import pandas as pd
import numpy as np
import torch
import warnings

from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from pyts.image import MarkovTransitionField

from torch_geometric.utils.convert import to_networkx
import networkx as nx
from matplotlib import pyplot
import matplotlib.pyplot as plt

In [2]:
# Turn the ragged sample collection into a rectangular float array
def X_repair(X_temp, bad_index=21019):
    """Copy the samples into a 2-D float64 array, leaving out one row.

    Args:
        X_temp: Indexable collection of samples (rows). The width of the
            result is taken from the length of ``X_temp[0]``.
        bad_index: Position of the row to drop. Defaults to 21019, the row
            this notebook has always excluded. If no row has this index
            (for example the input is shorter, or ``None`` is passed) every
            row is kept.

    Returns:
        ``numpy.ndarray`` of dtype float64 with one row per kept sample and
        ``len(X_temp[0])`` columns. Rows longer than that are truncated.

    Raises:
        ValueError: If a kept row is shorter than ``X_temp[0]``.
    """
    n_cols = len(X_temp[0])
    kept = [i for i in range(len(X_temp)) if i != bad_index]

    # Size the output from the rows actually kept, so inputs that do not
    # contain `bad_index` no longer overflow the array.
    X = np.empty([len(kept), n_cols], dtype='float64')
    for out_row, src_row in enumerate(kept):
        sample = X_temp[src_row]
        if len(sample) < n_cols:
            raise ValueError(
                f"row {src_row} has {len(sample)} values, expected at least {n_cols}"
            )
        # Only the first n_cols values of each sample are used.
        X[out_row] = sample[:n_cols]
    return X

In [3]:
# Usually one sample has a length different from the others; that is why the
# raw array has a weird shape and X_repair is needed.
def anomaly_finder(X_temp):
    """Print the index of every sample whose length differs from the first one.

    Args:
        X_temp: Indexable collection of sized samples.

    Returns:
        None. The offending indices are written to standard output, one per line.
    """
    mismatched = (
        position
        for position, sample in enumerate(X_temp)
        if len(sample) != len(X_temp[0])
    )
    for position in mismatched:
        print(position)

In [4]:
# Maps every label run to its number (as assigned by anomaly_numeric) and start position
def location_of_labels(Y_temp, skip_index=21019):
    """List each run of identical consecutive labels with its number and start.

    Args:
        Y_temp: Indexable collection of labels.
        skip_index: Position that is ignored entirely (it neither starts a
            run nor advances the position counter). Defaults to 21019.

    Returns:
        ``numpy.ndarray`` of dtype ``'<U21'`` with one row per run and three
        columns: run number, label, and position of the run's first element
        counted without the skipped entry. The array has exactly as many
        rows as there are runs (shape ``(0, 3)`` for empty input).
    """
    runs = []
    previous = None
    position = 0  # index into the sequence with the skipped entry removed
    for i in range(len(Y_temp)):
        if i == skip_index:
            continue
        label = Y_temp[i]
        # `not runs` marks the very first label, so no sentinel value is
        # needed that could collide with a real label.
        if not runs or label != previous:
            runs.append((len(runs), label, position))
            previous = label
        position += 1

    # Sized from the runs actually found: no uninitialised trailing rows and
    # no overflow when there are more than 20 runs.
    range_labels = np.empty([len(runs), 3], dtype='<U21')
    for row, (run_number, label, start) in enumerate(runs):
        range_labels[row][0] = run_number
        range_labels[row][1] = label
        range_labels[row][2] = start
    return range_labels

In [5]:
# converts words to numbers
def anomaly_numeric(Y, skip_index=21019):
    """Replace each label by the number of the run it belongs to.

    The number starts at 0 and increases by one every time the label differs
    from the previous kept label, so numbering follows runs of consecutive
    equal labels (a label that reappears later gets a new number).

    Args:
        Y: Indexable collection of labels.
        skip_index: Position that is left out of the result. Defaults to
            21019. If no entry has this index, nothing is left out.

    Returns:
        1-D integer ``numpy.ndarray`` with one entry per kept label.
    """
    codes = []
    previous = None
    current = -1
    for i in range(len(Y)):
        if i == skip_index:
            # The skipped entry neither gets a code nor starts a new run.
            continue
        label = Y[i]
        if not codes or label != previous:
            current += 1
            previous = label
        codes.append(current)
    # Built from the kept entries, so short inputs no longer write past the
    # end of a pre-sized buffer.
    return np.array(codes, dtype=int)

In [6]:
# Builds one graph per sample from the data/label files
def create_graph(n_bins = 150):
    """Load the dataset and turn every sample into a torch_geometric ``Data`` graph.

    Reads the module-level names ``dataFileName`` and ``labelsFileName``.
    Besides returning the graphs it sets three module-level names:
    ``output`` (the returned list), ``lol`` (result of ``location_of_labels``)
    and ``class_weights`` (balanced class weights as a tensor).

    Args:
        n_bins: Passed to ``MarkovTransitionField`` as ``n_bins``.

    Returns:
        List of ``Data`` objects. For each sample: ``x`` holds the sample
        values as a single-column double tensor, ``edge_index`` the non-zero
        positions of the sample's Markov transition field, ``edge_attr`` the
        values at those positions, and ``y`` the numeric label.
    """
    global lol, output, class_weights

    # NOTE(review): this silences all warnings and the filter is never restored.
    warnings.filterwarnings("ignore")

    def adjToEdgidx(adj_mat):
        # Non-zero entries of the matrix become edges; their values become weights.
        nonzero_positions = torch.from_numpy(adj_mat).nonzero()
        pairs = nonzero_positions.t().contiguous()
        sources, targets = pairs
        weights = adj_mat[sources, targets]
        return pairs, weights

    # Raw samples as stored on disk.
    # NOTE(review): allow_pickle=True executes pickled content - only load trusted files.
    raw_samples = np.load(dataFileName, mmap_mode=None, allow_pickle=True)
    # Rectangular version of the samples.
    X = X_repair(raw_samples)

    # Raw labels as stored on disk, and their numeric form.
    raw_labels = np.load(labelsFileName, mmap_mode=None, allow_pickle=False)
    Y = anomaly_numeric(raw_labels)

    # Overview of which label sits where in the dataset.
    lol = location_of_labels(raw_labels)

    transformer = MarkovTransitionField(n_bins = n_bins)
    X_gaf = transformer.fit_transform(X)
    output = []

    from sklearn.utils import class_weight
    balanced = class_weight.compute_class_weight(class_weight='balanced',
                                                 classes=np.unique(Y),
                                                 y=Y)
    class_weights = torch.tensor(balanced)

    for sample_no in range(len(X_gaf)):
        edge_index, edge_weight = adjToEdgidx(X_gaf[sample_no])
        node_values = torch.tensor(X[sample_no], dtype=torch.double)
        edge_values = torch.tensor(edge_weight, dtype=torch.double)
        graph = Data(x=torch.unsqueeze(node_values, 1),
                     edge_index=edge_index,
                     edge_attr=torch.unsqueeze(edge_values, 1),
                     y=torch.tensor(Y[sample_no], dtype=torch.long))
        output.append(graph)

    return output

In [7]:
## Graph Classification

In [8]:
import torch
import torch.nn.functional as F
from torch.nn import Linear, CrossEntropyLoss
from torch_geometric.nn import global_mean_pool, global_add_pool, global_max_pool, ChebConv, global_sort_pool
from torch.nn import Sequential, BatchNorm1d, ReLU, Dropout
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GINConv, GINEConv, GATv2Conv


class GINE(torch.nn.Module):
    """Graph-level classifier built from four chained GINEConv layers.

    The node embeddings produced by each of the four layers are max-pooled
    per graph, the four pooled vectors are concatenated, and the result is
    fed through a two-layer classifier head that returns raw logits.

    Args:
        dim_h: Width of every hidden layer.
        num_classes: Number of output logits. Defaults to 20.
        edge_dim: Number of features per edge. Defaults to 1.

    NOTE(review): the first MLP expects ``dim_h`` input features while
    create_graph() supplies one feature per node - presumably this relies on
    broadcasting inside GINEConv; confirm.
    """
    def __init__(self, dim_h, num_classes=20, edge_dim=1):
        super(GINE, self).__init__()

        # Attribute names and creation order (conv1, conv2, conv4, conv3) are
        # kept as they were so state_dict keys do not change.
        self.conv1 = GINEConv(self._make_mlp(dim_h), edge_dim=edge_dim)
        self.conv2 = GINEConv(self._make_mlp(dim_h), edge_dim=edge_dim)
        self.conv4 = GINEConv(self._make_mlp(dim_h), edge_dim=edge_dim)
        self.conv3 = GINEConv(self._make_mlp(dim_h), edge_dim=edge_dim)

        self.lin1 = Linear(dim_h*4, dim_h*4)
        self.lin2 = Linear(dim_h*4, num_classes)

    @staticmethod
    def _make_mlp(dim_h):
        """Build the Linear-BatchNorm-ReLU-Linear-ReLU network used inside each layer."""
        return Sequential(Linear(dim_h, dim_h),
                          BatchNorm1d(dim_h), ReLU(),
                          Linear(dim_h, dim_h), ReLU())

    def forward(self, data):
        """Return one row of class logits per graph in the batch ``data``."""
        x, edge_index, edge_weight, batch = data.x, data.edge_index, data.edge_attr, data.batch

        # Node embeddings; layers are chained conv1 -> conv2 -> conv4 -> conv3.
        h1 = self.conv1(x, edge_index, edge_attr=edge_weight)
        h2 = self.conv2(h1, edge_index, edge_attr=edge_weight)
        # Third layer uses its own weights (conv4); previously conv2 was
        # applied a second time here and conv4 was never used.
        h4 = self.conv4(h2, edge_index, edge_attr=edge_weight)
        h3 = self.conv3(h4, edge_index, edge_attr=edge_weight)

        # Graph-level readout
        h1 = global_max_pool(h1, batch)
        h2 = global_max_pool(h2, batch)
        h3 = global_max_pool(h3, batch)
        h4 = global_max_pool(h4, batch)

        # Concatenate graph embeddings (order kept: h1, h2, h3, h4)
        h = torch.cat((h1, h2, h3, h4), dim=1)

        # Classifier
        h = self.lin1(h)
        h = h.relu()
        h = F.dropout(h, p=0.5, training=self.training)
        h = self.lin2(h)

        return h

In [9]:
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix

def trainG(model, loader, epoch, optimizer, device):
    """Train ``model`` for one pass over ``loader`` and return the mean loss.

    Uses the module-level ``class_weights`` tensor as per-class weights of the
    cross-entropy loss.

    Args:
        model: Network called as ``model(batch)``; must return class logits.
        loader: Iterable of batches exposing ``.to(device)``, ``.y`` and
            ``.num_graphs``; ``loader.dataset`` must support ``len``.
        epoch: Epoch number, only shown in the progress bar.
        optimizer: Optimizer stepping the model parameters.
        device: Device the batches and the class weights are moved to.

    Returns:
        Sum of the per-batch losses weighted by batch size, divided by
        ``len(loader.dataset)``.
    """
    model.train()
    total_loss = 0

    # The loss object and the device copy of the weights do not change
    # between batches, so build them once instead of once per batch.
    loss_function = CrossEntropyLoss(weight=class_weights.to(device))

    with tqdm(loader, unit="batch") as tepoch:
        tepoch.set_description(f"Epoch {epoch}")
        for data in tepoch:
            optimizer.zero_grad()
            data = data.to(device)
            out = model(data)
            loss = loss_function(out, data.y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * data.num_graphs
    return total_loss / len(loader.dataset)

def testG(model, loader, device):
    """Evaluate ``model`` on ``loader``, print a report and return the accuracy.

    Prints the confusion matrix and the classification report computed over
    all samples seen.

    Args:
        model: Network called as ``model(batch)``; must return class logits.
        loader: Iterable of batches exposing ``.to(device)`` and ``.y``;
            ``loader.dataset`` must support ``len``. Any batch size works.
        device: Device the batches are moved to.

    Returns:
        Number of correct predictions divided by ``len(loader.dataset)``.
    """
    model.eval()

    correct = 0
    predicted_categories = []
    true_categories = []
    # No gradients are needed for evaluation.
    with torch.no_grad(), tqdm(loader, unit="batch") as tepoch:
        for data in tepoch:
            # Iterate in batches over the training/test dataset.
            data = data.to(device)
            out = model(data)
            pred = out.argmax(dim=1)  # Use the class with highest probability.
            predicted_categories.append(pred.cpu().detach().numpy().reshape(-1))
            true_categories.append(data.y.cpu().detach().numpy().reshape(-1))
            correct += int((pred == data.y).sum())  # Check against ground-truth labels.

    # Flatten the per-batch arrays into one label vector each; passing the
    # list of arrays directly only works when every batch holds one sample.
    y_true = np.concatenate(true_categories)
    y_pred = np.concatenate(predicted_categories)
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    return correct / len(loader.dataset)  # Derive ratio of correct predictions.


In [10]:
def graph_classification(n_bins = 150, batch_size = 64, range_epoch = 10, lr=0.01):
    """Build the graphs, train a GINE model and evaluate it on a held-out split.

    The graphs are split 80/20 into train and test sets after seeding torch
    with 6406. Training uses Adam with weight decay 5e-4.

    Args:
        n_bins: Forwarded to ``create_graph``.
        batch_size: Batch size of the training loader.
        range_epoch: Number of training epochs.
        lr: Learning rate for Adam.

    Returns:
        The accuracy returned by ``testG`` on the test split.
    """
    output = create_graph(n_bins = n_bins)

    torch.manual_seed(6406)
    train_size = int(0.8 * len(output))
    test_size = len(output) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(output, [train_size, test_size])
    loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
    #________________Select model_________________________________________________
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.cuda.empty_cache()

    # Hidden width 32; .double() matches the float64 tensors built by create_graph.
    model = GINE(32).double().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

    for epoch in range(range_epoch):
        trainG(model, loader, epoch, optimizer, device)

    print("Done!")
    # Evaluation runs one graph per batch.
    score = testG(model, DataLoader(test_dataset, batch_size = 1), device)
    # Hand the accuracy back to the caller instead of discarding it.
    return score
In [None]:
# Input files, read by create_graph() through these module-level names:
# the samples array and the matching labels array.
dataFileName = 'SynD_data_60.npy'
labelsFileName = 'SynD_labels_60.npy'

In [None]:
# Run the whole pipeline: 200 Markov-transition-field bins, batch size 64,
# 10 training epochs, learning rate 0.01.
graph_classification(n_bins = 200, batch_size = 64, range_epoch = 10, lr=0.01)