# Graph Classification

In [1]:
import pandas as pd
import numpy as np
import json
import torch
import os
from torch_geometric.data import Data, Batch
from torch_geometric.nn import GCNConv, Sequential, Linear, global_mean_pool
import pickle
from torch_geometric.data import Dataset
from torch_geometric.loader import DataLoader
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

from gcnHelpers import *


# Training on Multiple classes
#### please put the folders of the CVE's in the script's directory<br>then inside each folder put 2 subfolders, one for the nodes and the other for edges
##### example: folder 121 contains 4 subfolders (nodes_121, edges_121, feature_matrices, nodes_targets)<br>and feature_matrices and nodes_targets contain the npy files

In [2]:
CVES_to_train_on = ['121', '191', '401', '457']

#List containing all input data of type Data (Pytorch geometric datastructure that holds a single graph)
allData = []
numGraphs_List = []

torch.cuda.empty_cache()

for encClass, cve in enumerate(CVES_to_train_on):
    print(f"-----------LOADING CVE {cve}------------------")

    
    pathToEdges = f"D:/ClassWork/Guardista/4-Features_Extraction/{cve}/edges_{cve}"

    with open (f'{cve}/features_matrices/features_matrices_'+cve+'.npy', 'rb') as f:
        features_matrices_list = np.load(f,  allow_pickle=True)

    with open(f'{cve}/nodes_targets/nodes_targets_'+cve+'.npy', 'rb') as f:
        nodes_targets_list = np.load(f,  allow_pickle=True)

    
    print(f"num Graphs : {features_matrices_list.shape[0]}")
    numGraphs_List.append(features_matrices_list.shape[0])


    # Reading the adjacency list of each graph
    adj_Lists = []
    for filename in os.listdir(pathToEdges):
        f = os.path.join(pathToEdges, filename)
        df = pd.read_csv(f, header=None)
        df.dropna(inplace=True)             # <---- here is the dropping
        adj_Lists.append(df)
        


    numVulnNodes = 0

    encoded = encClass
    # Reading the label of each node
    Targets_List = []
    for target in nodes_targets_list:
        numVulnNodes += sum(target)
        Targets_List.append(encoded)
        

    numNodes = 0
    # Reading the features of each node
    node_Features_List = []
    for node_feature in features_matrices_list:
        n = np.array(node_feature,dtype='int64')
        numNodes+= n.shape[0]
        node_Features_List.append(torch.tensor(n, dtype=torch.float))


    assert (len(adj_Lists) == len(Targets_List))
    assert (len(adj_Lists) == len(node_Features_List))


    # Convert the edge list to use 0-based indices
    adj_Tensors = []
    for adj in adj_Lists:
        # the documentation strictly says to convert the adjacency list to a contiguous list
        adj_Tensors.append(torch.as_tensor(adj.to_numpy(), dtype=torch.long).t().contiguous())   


    #Iterate over each graph, make a Data object, then append to all our dataset
    cntCorruptData=0
    for i, adj in enumerate(adj_Tensors):
        d = Data(x=node_Features_List[i], edge_index=adj, y=torch.as_tensor(Targets_List[i]))  # <--- from the documentation.
        try:
            d.validate(raise_on_error=True)     # <--------- this line makes sure each input graph strictly follows the correct rules, to evade errors
        except:
            print('DIMENSION ERROR')
            print(f"We have features for {len(node_Features_List[i])} Nodes ")
            print(f"But the adjacency list contains {max(set(np.array(adj[0,:])))} Unique Nodes")
            cntCorruptData +=1
            continue
        allData.append(d)


    print(f"num nodes : {numNodes} of which {numVulnNodes} are vulnerable")
    print(f"number of corrupted files due to missing node features for certain nodes = {cntCorruptData}")
    print(f"total number of files {len(adj_Tensors)}")
    print(f"number of files to be trained on = {len(adj_Tensors) - cntCorruptData}\n\n")
   

-----------LOADING CVE 121------------------
num Graphs : 5940
num nodes : 12564642 of which 20675 are vulnerable
number of corrupted files due to missing node features for certain nodes = 0
total number of files 5940
number of files to be trained on = 5940


-----------LOADING CVE 191------------------
num Graphs : 4013
num nodes : 8937258 of which 18167 are vulnerable
number of corrupted files due to missing node features for certain nodes = 0
total number of files 4013
number of files to be trained on = 4013


-----------LOADING CVE 401------------------
num Graphs : 2261
num nodes : 4553383 of which 8766 are vulnerable
number of corrupted files due to missing node features for certain nodes = 0
total number of files 2261
number of files to be trained on = 2261


-----------LOADING CVE 457------------------
num Graphs : 914
num nodes : 1765018 of which 4138 are vulnerable
number of corrupted files due to missing node features for certain nodes = 0
total number of files 914
number of

#### Computing class weights to be used in a weighted loss function

In [None]:
total_Graphs = sum(numGraphs_List)
Class_Weights = [total_Graphs/i for i in numGraphs_List]
print(f"Weights for each class are:\n {Class_Weights}")

### Train-Val-Test-Split and data loader

In [3]:

# Train Test Split
allData_train, allData_test = train_test_split(allData, test_size=0.2, shuffle=True)

# Train Validation Split
allData_train, allData_val = train_test_split(allData_train, test_size=0.1, shuffle=True)

unique_classes = set()

for d in allData_train:
    unique_classes.add(int(d.y.numpy()))

print(f"Unique classes {unique_classes}")


#Our Data Loader
batch_size = 5
loader = DataLoader(allData_train, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(allData_test, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(allData_val, batch_size=batch_size, shuffle=True)

Unique classes {0, 1, 2, 3}


# GCN Class
#### anything related to the architecture

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

numClasses = len(list(unique_classes))




model = GCN(hidden_channels=64, numClasses=numClasses, numFeatures=33).to(device=device)
print(model)






optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-5)
penaltyWeights = torch.tensor( Class_Weights)  # <--- THOSE WEIGHTS ARE COMPUTED MANUALLY ACCORDING TO AN EQUATION
criterion = torch.nn.CrossEntropyLoss(weight=penaltyWeights.to(device=device))






for epoch in range(1, 13):
    train(model, loader, device, optimizer, criterion)
    train_acc = test(model, loader, device)
    val_acc = test(model, val_loader, device)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Validation Acc: {val_acc:.4f}')





# Saving the model
with open('GCN.pkl', 'wb') as f:
    pickle.dump(model.cpu(), f)






GCN(
  (conv1): GCNConv(33, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(64, 4, bias=True)
)
Epoch: 001, Train Acc: 0.5620, Validation Acc: 0.5557
Epoch: 002, Train Acc: 0.7463, Validation Acc: 0.7393
Epoch: 003, Train Acc: 0.7049, Validation Acc: 0.6841
Epoch: 004, Train Acc: 0.8330, Validation Acc: 0.8059
Epoch: 005, Train Acc: 0.8266, Validation Acc: 0.8126
Epoch: 006, Train Acc: 0.8606, Validation Acc: 0.8459
Epoch: 007, Train Acc: 0.8684, Validation Acc: 0.8611
Epoch: 008, Train Acc: 0.8669, Validation Acc: 0.8525
Epoch: 009, Train Acc: 0.8924, Validation Acc: 0.8858
Epoch: 010, Train Acc: 0.9027, Validation Acc: 0.8877
Epoch: 011, Train Acc: 0.9070, Validation Acc: 0.8925
Epoch: 012, Train Acc: 0.8659, Validation Acc: 0.8535


# Testing

In [5]:
from sklearn.metrics import classification_report

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device=device)

y_true = []
y_predicted = []
unique_classes = set()
for d in allData_test:
    y_true.append(d.y.cpu().numpy())
    
    pred,_ = inference(model, d.to(device=device), device=device)
    y_predicted.append(pred)


print(classification_report(y_true= y_true, y_pred= y_predicted))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1198
           1       0.98      0.69      0.81       765
           2       0.79      0.98      0.88       469
           3       0.38      0.62      0.47       194

    accuracy                           0.88      2626
   macro avg       0.79      0.82      0.79      2626
weighted avg       0.91      0.88      0.88      2626



# Inference

#### You input 2 folders, one folder containing the json of nodes, the other contains the csv for edges.<br>don't mix them in one folder, bad things happen<br>if you have a npy file, please pass its abs path as this file will make the script run much faster

In [6]:
import numpy as np

pathToUser_Nodes = r"D:\ClassWork\Guardista\4-Features_Extraction\test\457_test\nodes_457_test".replace('\\', '/')
pathToUser_Edges = r"D:\ClassWork\Guardista\4-Features_Extraction\test\457_test\edges_457_test".replace('\\', '/')
outPath = r"D:\ClassWork\Guardista\4-Features_Extraction\test\457_test".replace('\\', '/')
classification, embeddings = InferenceGCN(pathToUser_Nodes, pathToUser_Edges, outPath, multipleFiles='true', npyPath=None)

161it [00:01, 119.13it/s]


In [7]:
print(classification[0])
print('------------------------------------------------')
print(embeddings[0])
print('------------------------------------------------')

3
------------------------------------------------
[-0.03846025 -0.17173506 -0.03937334  0.02173497  0.01890503 -0.0314753
 -0.00729185  0.26403618 -0.02197982  0.08097303 -0.28724906 -0.05587701
 -0.02056482 -0.05012508 -0.03518136 -0.06395221  0.03898479  0.06709575
  0.06730913 -0.03636717 -0.02268501 -0.10186131 -0.08514097  0.04395713
 -0.01101964  0.19629902  0.06146738 -0.00851727  0.00891467  0.11484844
  0.08012139 -0.00926589 -0.04918414  0.08703671  0.21440484 -0.04170494
  0.20385553 -0.07583166  0.00930768  0.07672732  0.22144899 -0.15658973
 -0.00030579  0.02632725  0.02770882  0.0350372  -0.06439433 -0.01809283
  0.2530822   0.01152389 -0.06564894 -0.05314837  0.01734367 -0.02311965
  0.21260604 -0.2471974   0.02978747 -0.0651449  -0.05126182  0.12439404
  0.21490806  0.08118579  0.1811749   0.1640831 ]
------------------------------------------------


#### Reading the outputted CSV

In [8]:
import pandas as pd
newdf = pd.read_csv(f"{outPath}/embeddings.csv".replace('\\', '/'), header=None, index_col=None)
newdf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
0,CWE457_Use_of_Uninitialized_Variable__char_poi...,-0.03846,-0.171735,-0.039373,0.021735,0.018905,-0.031475,-0.007292,0.264036,-0.02198,...,-0.247197,0.029787,-0.065145,-0.051262,0.124394,0.214908,0.081186,0.181175,0.164083,457
1,CWE457_Use_of_Uninitialized_Variable__char_poi...,-0.06759,-0.149206,-0.067126,-0.001707,-0.007422,-0.054312,-0.03264,0.272567,-0.007405,...,-0.240176,0.05194,-0.094397,-0.036213,0.102802,0.219235,0.109437,0.194767,0.178252,457
2,CWE457_Use_of_Uninitialized_Variable__char_poi...,-0.043826,-0.166558,-0.04483,0.017338,0.014337,-0.036036,-0.012401,0.26535,-0.020655,...,-0.245437,0.034518,-0.070322,-0.048298,0.119845,0.215537,0.086092,0.183262,0.16676,457
3,CWE457_Use_of_Uninitialized_Variable__double_0...,-0.084064,-0.142215,-0.081982,-0.014864,-0.021659,-0.066296,-0.04501,0.280213,0.004113,...,-0.240803,0.061936,-0.110293,-0.030127,0.094383,0.224784,0.12638,0.205471,0.18676,457
4,CWE457_Use_of_Uninitialized_Variable__double_0...,-0.065728,-0.147436,-0.065018,-0.00507,-0.008792,-0.05579,-0.034334,0.268446,-0.003915,...,-0.23677,0.05557,-0.094504,-0.03066,0.099059,0.214458,0.10759,0.195435,0.17361,457
