In [None]:
!mamba install pytorch=1.12 torchvision torchaudio cudatoolkit=11.3 -c pytorch -y -q

In [None]:
!pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+cu113.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-1.12.0+cu113.html
!pip install torch-geometric

In [None]:
!mamba install -c conda-forge pyts -q -y

In [None]:
!pip install llvmpy
!pip install cython
!pip install numba
!pip install pandas
!pip install networkx
!pip install matplotlib

In [1]:
import pandas as pd
import numpy as np
import torch
import warnings
import torch.nn.functional as F
import sklearn
import os.path as osp
import matplotlib.pyplot as plt

from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from pyts.image import MarkovTransitionField
from sklearn.utils import class_weight

from torch.nn import Linear, CrossEntropyLoss
from torch_geometric.nn import global_mean_pool, global_add_pool, global_max_pool, ChebConv, global_sort_pool
from torch_geometric.loader import DataLoader
from torch.nn import Sequential, BatchNorm1d, ReLU, Dropout
from torch_geometric.nn import GCNConv, GINConv, GINEConv, GATv2Conv, GATConv

from tqdm import tqdm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import f1_score
from sklearn.manifold import TSNE

In [18]:
#function for graph creation
def create_graph(main_path, path_properties = None, path_mask = None, classif_type = 0):
    """Turn RSS series into a list of torch_geometric ``Data`` graphs.

    Each series becomes one graph: its samples are the nodes, and the non-zero
    entries of its Markov Transition Field (MTF) are the weighted edges.

    Two input layouts are supported:

    * CSV (``path_properties`` and ``path_mask`` both ``None``): ``main_path``
      is read with pandas and the ``'Unnamed: 0'`` column is dropped. With
      ``length_rss = (n_columns - 2) / 2``, the first ``length_rss`` columns
      are the node values, column ``length_rss + 1`` is the graph label and
      the columns from ``length_rss + 2`` onwards are the per-node mask.
    * NPZ (both paths given): ``'arr_0'`` is loaded from each of the three
      files. Node values are the negated ``main_path`` array, the label is
      column 2 of the properties array and column 0 of the properties array is
      used as ``n_bins`` of the per-series MTF.

    Parameters
    ----------
    main_path : path of the CSV file, or of the NPZ file holding the series.
    path_properties : path of the NPZ properties file, or ``None``.
    path_mask : path of the NPZ mask file, or ``None``.
    classif_type : ``0`` stores the graph label in ``y`` (graph
        classification); any other value stores the per-node mask in ``y``
        (node classification).

    Returns
    -------
    list of ``Data`` objects with ``x`` (one double feature per node),
    ``edge_index``, ``edge_attr`` (one double feature per edge) and ``y``.

    Raises
    ------
    ValueError
        If only one of ``path_properties`` / ``path_mask`` is given.

    Side effects
    ------------
    * Sets the module-level ``class_weights`` tensor (balanced class weights
      computed from the labels).
    * Installs a process-wide ``"ignore"`` warnings filter.
    """
    # Validate before touching any file: a half-specified NPZ input would
    # otherwise fail deep inside np.load(None) with an unrelated error.
    if (path_properties is None) != (path_mask is None):
        raise ValueError(
            "path_properties and path_mask must be provided together, "
            "or both left as None")

    # NOTE: this changes the global warnings filter and stays in effect after
    # the function returns.
    warnings.filterwarnings("ignore")

    # preparation for un/cut graphs
    if path_properties is None:

        df = pd.read_csv(main_path)
        del df['Unnamed: 0']
        # Renumber rows/columns so the columns can be addressed by position.
        df.index, df.columns = [range(df.index.size), range(df.columns.size)]
        length_rss = int((df.columns.stop-2)/2)

        X = df.loc[:,df.columns[:length_rss]].to_numpy()
        Y = df[length_rss+1].to_numpy(dtype=np.uint8)
        X_mask = df.loc[:,df.columns[length_rss+2:]].to_numpy()

        # All series share one length here, so a single transform handles them.
        mtf_images = MarkovTransitionField(n_bins=length_rss).fit_transform(X)

    # preparation for random graphs
    else:
        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # load files from a trusted source.
        dataset_rss = np.load(main_path, allow_pickle=True)['arr_0']
        dataset_properties = np.load(path_properties, allow_pickle=True)['arr_0']
        dataset_mask = np.load(path_mask, allow_pickle=True)['arr_0']

        X = dataset_rss * (-1)
        X_mask = dataset_mask
        Y = dataset_properties[:,2]
        Y_len = dataset_properties[:,0]

        # Each series gets its own MTF because n_bins differs per sample.
        mtf_images = []
        for i in range(len(Y_len)):
            MTF = MarkovTransitionField(n_bins=Y_len[i])
            mtf_images.append(MTF.fit_transform(X[i].reshape(1, -1))[0])

    #setting class_weights for graph
    # Published as a global because trainG reads it when building its loss.
    global class_weights
    class_weights = torch.tensor(class_weight.compute_class_weight(class_weight='balanced',
                                                                   classes=np.unique(Y),
                                                                   y=Y))

    #function for creating edge index and edge weight for a given MTF matrix
    def adjToEdgidx(adj_mat):
        # Every non-zero MTF entry becomes a directed edge (row -> col).
        edge_index = torch.from_numpy(adj_mat).nonzero().t().contiguous()
        row, col = edge_index
        edge_weight = adj_mat[row, col]
        return edge_index, edge_weight

    #output will have all graphs
    output = []
    for i, mtf in enumerate(mtf_images):
        edge_index, edge_weight = adjToEdgidx(mtf)

        if classif_type == 0: ##for graph classification
            y_mask = torch.tensor(Y[i], dtype=torch.long)
        else:                 ##for node classification
            y_mask = torch.unsqueeze(torch.tensor(X_mask[i], dtype=torch.double),1)

        #Into Data save node values "x", edge index from adjacency matrix and edge features/attributes, finally labels
        output.append(Data(x=torch.unsqueeze(torch.tensor(X[i], dtype=torch.double),1),
                           edge_index=edge_index,
                           edge_attr=torch.unsqueeze(torch.tensor(edge_weight, dtype=torch.double),1),
                           y=y_mask))

    return output

In [None]:
## Graph Classification

In [4]:
#model definition for graph classification
class GINE(torch.nn.Module):
    """Graph classifier built from five stacked ``GINEConv`` layers.

    Every layer's node embeddings are max-pooled per graph, the five pooled
    vectors are concatenated, and a two-layer classifier produces one row of
    class scores (logits) per graph.

    Parameters
    ----------
    dim_h : width of every hidden layer.
    num_classes : number of output classes (defaults to 5, the value that was
        previously hard-coded).
    """
    def __init__(self, dim_h, num_classes=5):
        super().__init__()
        edge_dim = 1

        # Attribute names and creation order are kept stable so state_dict
        # keys and seeded parameter initialisation do not change.
        # NOTE(review): create_graph builds single-feature nodes while the
        # first MLP below takes dim_h inputs; this appears to rely on
        # broadcasting inside GINEConv -- confirm this is intended.
        self.conv1 = self._make_conv(dim_h, edge_dim)
        self.conv2 = self._make_conv(dim_h, edge_dim)
        self.conv3 = self._make_conv(dim_h, edge_dim)
        self.conv4 = self._make_conv(dim_h, edge_dim)
        self.conv5 = self._make_conv(dim_h, edge_dim)

        # The classifier consumes the concatenation of the five readouts.
        self.lin1 = Linear(dim_h*5, dim_h*5)
        self.lin2 = Linear(dim_h*5, num_classes)

    @staticmethod
    def _make_conv(dim_h, edge_dim):
        """Build one GINEConv wrapping a Linear-BatchNorm-ReLU-Linear-ReLU MLP."""
        return GINEConv(
            Sequential(Linear(dim_h, dim_h), BatchNorm1d(dim_h), ReLU(),
                       Linear(dim_h, dim_h), ReLU()), edge_dim=edge_dim)

    def forward(self, data):
        """Return class logits for every graph in the batch ``data``."""
        x, edge_index, edge_weight, batch = data.x, data.edge_index, data.edge_attr, data.batch

        # Node embeddings, with a graph-level max-pool readout after each layer
        h = x
        readouts = []
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5):
            h = conv(h, edge_index, edge_attr=edge_weight)
            readouts.append(global_max_pool(h, batch))

        # Concatenate graph embeddings
        h = torch.cat(readouts, dim=1)

        # Classifier
        h = self.lin1(h)
        h = h.relu()
        h = F.dropout(h, p=0.5, training=self.training)
        h = self.lin2(h)

        return h

In [5]:
#function for training the model
def trainG(model, loader, epoch, optimizer, device):
    """Run one training epoch of the graph classifier.

    Parameters
    ----------
    model : module called as ``model(batch)`` and returning class logits.
    loader : iterable of batches; ``loader.dataset`` must support ``len``.
    epoch : epoch number, used only for the progress-bar label.
    optimizer : optimizer stepping ``model``'s parameters.
    device : device each batch (and the class weights) is moved to.

    Returns
    -------
    Mean loss per graph: each batch loss is weighted by ``num_graphs`` and the
    sum is divided by the dataset size.

    Notes
    -----
    Reads the module-level ``class_weights`` tensor, so ``create_graph`` must
    have been called first.
    """
    model.train()
    total_loss = 0

    #define loss_function
    # Built once per epoch: the weights do not change between batches, so
    # there is no reason to rebuild the loss and re-copy them for every batch.
    loss_function = CrossEntropyLoss(weight=class_weights.to(device))

    with tqdm(loader, unit="batch") as tepoch:
        tepoch.set_description(f"Epoch {epoch}")
        for data in tepoch:
            optimizer.zero_grad()
            data = data.to(device)
            out = model(data)

            loss = loss_function(out, data.y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * data.num_graphs
    return total_loss / len(loader.dataset)

#function for testing the model
def testG(model, loader, device):
    model.eval()

    correct = 0
    predicted_categories = []
    true_categories = []
    with tqdm(loader, unit="batch") as tepoch:
        for data in tepoch:
        #Iterate in batches over the training/test dataset.
            data = data.to(device)
            out = model(data)  
            
            #Use the class with highest probability.
            pred = out.argmax(dim=1)
            predicted_categories.append(pred.cpu().detach().numpy())
            true_categories.append(data.y.cpu().detach().numpy())
            
            #Check against ground-truth labels.
            correct += int((pred == data.y).sum())
    print(confusion_matrix(true_categories, predicted_categories))
    print(classification_report(true_categories, predicted_categories))  
    
    #Derive ratio of correct predictions.
    return correct / len(loader.dataset)  


In [None]:
def graph_classification(main_path, path_properties = None, path_mask = None, range_epoch = 10):
    """Train and evaluate the GINE graph classifier on one dataset.

    Builds the graphs with ``create_graph``, splits them 80/20 into train and
    test sets, trains ``GINE(32)`` in double precision for ``range_epoch``
    epochs with Adam, then evaluates on the test set.

    Parameters
    ----------
    main_path, path_properties, path_mask : forwarded to ``create_graph``.
    range_epoch : number of training epochs.

    Returns
    -------
    Test accuracy as returned by ``testG``.
    """
    #calling the graph creator function
    output = create_graph(main_path, path_properties, path_mask)
    # Seed before the random split and model creation so both are repeatable.
    torch.manual_seed(6406)

    #setting train and test sizes
    train_size = int(0.8 * len(output))
    test_size = len(output) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(output, [train_size, test_size])
    loader = DataLoader(train_dataset, batch_size = 64, shuffle = True)

    #setting device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.cuda.empty_cache()

    #creating model
    # Double precision matches the double-typed graph tensors.
    model = GINE(32).double().to(device)

    #selecting the optimiser
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

    #run for number of epochs
    for epoch in range(range_epoch):
        epoch_loss = trainG(model, loader, epoch, optimizer, device)
        # Surface the loss instead of discarding it.
        print(f'Epoch: {epoch:03d}, Loss: {epoch_loss:.4f}')

    print("Done!")
    # One graph per batch, so each evaluation step yields a single prediction.
    score = testG(model, DataLoader(test_dataset, batch_size = 1), device)
    return score

In [None]:
## Node Classification

In [8]:
class Net(torch.nn.Module):
    """Three-layer graph-attention network for per-node binary prediction.

    Each layer adds a ``GATConv`` output to a parallel ``Linear`` projection
    of the same input (a skip connection). The first two layers are followed
    by ELU; the last one returns a single raw logit per node.
    """
    def __init__(self):
        super().__init__()

        in_features = 1    # one value per node
        hidden = 16        # width of the two hidden layers
        heads = 4          # attention heads in the hidden layers (concatenated)
        out_features = 1   # one logit per node

        # Modules are registered in the order conv, lin, conv, lin, conv, lin.
        self.conv1 = GATConv(in_features, hidden // heads, heads=heads)
        self.lin1 = torch.nn.Linear(in_features, hidden)
        self.conv2 = GATConv(hidden, hidden // heads, heads=heads)
        self.lin2 = torch.nn.Linear(hidden, hidden)
        # concat=False is set on the output layer's six heads.
        self.conv3 = GATConv(hidden, out_features, heads=6,concat=False)
        self.lin3 = torch.nn.Linear(hidden, out_features)

    def forward(self, x, edge_index):
        """Return one logit per node for node features ``x`` and ``edge_index``."""
        attended = self.conv1(x, edge_index)
        hidden_1 = F.elu(attended + self.lin1(x))

        attended = self.conv2(hidden_1, edge_index)
        hidden_2 = F.elu(attended + self.lin2(hidden_1))

        attended = self.conv3(hidden_2, edge_index)
        return attended + self.lin3(hidden_2)

In [9]:
#function for training the node-classification model for one epoch
def trainN(model, epoch, optimizer, device):
    """Run one training epoch of the node classifier.

    Parameters
    ----------
    model : module called as ``model(x, edge_index)``.
    epoch : unused; kept so existing callers keep working.
    optimizer : optimizer stepping ``model``'s parameters.
    device : device each batch is moved to.

    Returns
    -------
    Mean loss per graph: each batch loss is weighted by the number of graphs
    in that batch and the sum is divided by the dataset size.

    Notes
    -----
    Reads the module-level ``train_loader`` and ``loss_op`` that
    ``node_classification`` sets up.
    """
    model.train()

    total_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        loss = loss_op(model(data.x.float(), data.edge_index), data.y)
        # Weight by the graphs in THIS batch. Multiplying by
        # len(train_loader) (the number of batches) inflated the reported
        # loss and ignored a smaller final batch.
        total_loss += loss.item() * data.num_graphs
        loss.backward()
        optimizer.step()
    return total_loss / len(train_loader.dataset)

@torch.no_grad()
#function for testing the model
def testN(model, loader, device):
    model.eval()

    ys, preds = [], []
    for data in loader:
        ys.append(data.y)
        out = model(data.x.float().to(device), data.edge_index.to(device))
        preds.append((out > 0).float().cpu())  
        
    y, pred = torch.cat(ys, dim=0).numpy(), torch.cat(preds, dim=0).numpy()
    return f1_score(y, pred, average='micro') if pred.sum() > 0 else 0, ys,preds

In [14]:
def node_classification(main_path, path_properties = None, path_mask = None, range_epoch = 10):
    """Train and evaluate the GAT node classifier on one dataset.

    Builds the graphs with ``create_graph`` (per-node masks as targets),
    splits them 60/20/20 into train, validation and test sets, trains ``Net``
    with ``BCEWithLogitsLoss`` and Adam for ``range_epoch`` epochs while
    printing loss and F1 scores, then prints a confusion matrix and a
    classification report for the whole test set.

    Parameters
    ----------
    main_path, path_properties, path_mask : forwarded to ``create_graph``.
    range_epoch : number of training epochs.

    Notes
    -----
    Publishes ``train_loader``, ``val_loader``, ``test_loader`` and
    ``loss_op`` as module-level globals because ``trainN`` reads
    ``train_loader`` and ``loss_op`` from there.
    """
    global train_loader, val_loader, test_loader, loss_op

    #calling the graph creator function
    output = create_graph(main_path, path_properties, path_mask, classif_type = 1)
    # Seed before the random split and model creation so both are repeatable.
    torch.manual_seed(6406)

    #setting train, val and test sizes
    train_size = int(0.6 * len(output))
    Temp_size = len(output) - train_size
    val_size = int(0.5*Temp_size)
    test_size = Temp_size - val_size
    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(output, [train_size, val_size, test_size])

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

    #setting device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.cuda.empty_cache()

    #creating model
    model = Net().to(device)
    #define loss
    loss_op = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

    #run for number of epochs
    for epoch in range(range_epoch):
        loss = trainN(model, epoch, optimizer, device)
        val_f1, _, _ = testN(model, val_loader, device)
        test_f1, _, _ = testN(model, test_loader, device)

        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val: {val_f1:.4f}, 'f'Test: {test_f1:.4f}')

    # Final report over the WHOLE test set. Indexing ys[0]/preds[0] covered
    # only one shuffled batch, and failed outright when range_epoch was 0.
    _, ys, preds = testN(model, test_loader, device)
    y_true = torch.cat(ys, dim=0).numpy()
    y_pred = torch.cat(preds, dim=0).numpy()
    print(sklearn.metrics.multilabel_confusion_matrix(y_true, y_pred))
    print(sklearn.metrics.classification_report(y_true, y_pred))

In [None]:
def run_pypeline(classif, main_path, path_properties = None, path_mask = None, range_epoch = 10):
    """Run the requested classification pipeline end to end.

    Parameters
    ----------
    classif : ``"graph"`` for graph classification, ``"node"`` for node
        classification.
    main_path, path_properties, path_mask : dataset paths, forwarded
        unchanged to the selected pipeline.
    range_epoch : number of training epochs, forwarded to the pipeline.

    Returns
    -------
    Whatever the selected pipeline function returns.

    Raises
    ------
    ValueError
        If ``classif`` is neither ``"graph"`` nor ``"node"``.
    """
    if classif == "graph":
        return graph_classification(main_path, path_properties, path_mask, range_epoch = range_epoch)

    if classif == "node":
        return node_classification(main_path, path_properties, path_mask, range_epoch = range_epoch)

    # Fail loudly on a typo instead of silently doing nothing.
    raise ValueError(f'classif must be "graph" or "node", got {classif!r}')

In [11]:
# After running all the cells above, there is one main function to call:

#     - run_pypeline(classif: either "graph" or "node", the kind of classification you want,
#                    main_path: path of the dataset; a .csv file (cut/uncut dataset) when path_properties and path_mask are left blank, otherwise the .npz file holding the series,
#                    path_properties: .npz file with the properties; fill this together with path_mask for the .npz (random graphs) dataset, otherwise leave blank,
#                    path_mask: .npz file with the mask; fill this together with path_properties for the .npz (random graphs) dataset, otherwise leave blank,
#                    range_epoch: number of training epochs (default 10))


In [None]:
# Node classification on the CSV dataset, trained for 50 epochs.
run_pypeline(classif = "node", main_path = "dataset_cut.csv", range_epoch = 50)