# Graph Classification

In [1]:
import pandas as pd
import numpy as np
import json
import torch
import os
from torch_geometric.data import Data, Batch, Dataset
from torch_geometric.nn import GCNConv, Sequential, Linear, global_mean_pool
import pickle
from torch_geometric.loader import DataLoader
import torch.nn.functional as F
import torch_geometric.utils as ut
from sklearn.model_selection import train_test_split

from gcnHelpers import *


# Training on Multiple classes
#### Please put each CVE's folder in the notebook's directory.<br>Inside each CVE folder, create one subfolder for the nodes and another for the edges.
##### Example: folder 121 contains 4 subfolders (nodes_121, edges_121, features_matrices, nodes_targets);<br>features_matrices and nodes_targets contain the .npy files.

In [2]:
# 121   122     ok
# 23    36      not ok
# 126   127     not ok
# 194   195     not ok

In [3]:
# Alternative class groups that were trained on previously (kept for reference):
#CVES_to_train_on = {'762':4, '191':5, '134':6, '590':7}#, '23', '401', '457', '590', '690', '762', '78', '23', '36', '126', '127', '194', '195', '191']
#CVES_to_train_on = ['122_safe', '121_safe', '78_safe', '190_safe']
#CVES_to_train_on = ['762', '191', '134', '590']
#CVES_to_train_on = ['23', '126', '194', '401','690']
CVES_to_train_on = ['36', '127', '124', '195', '457']

# Every graph of every class ends up here as a torch_geometric `Data` object.
allData = []
# Graph count per class, read from the feature .npy, in CVES_to_train_on order.
numGraphs_List = []

torch.cuda.empty_cache()

# The position of a CVE in CVES_to_train_on is the integer class label of all its graphs.
for encClass, cve in enumerate(CVES_to_train_on):
    print(f"-----------LOADING CVE {cve}------------------")

    # NOTE(review): edges are read from an absolute path while the .npy files below are
    # resolved relative to the working directory -- confirm both point at the same tree.
    pathToEdges = f"D:/ClassWork/Guardista/4-Features_Extraction/{cve}/edges_{cve}"

    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files.
    with open (f'{cve}/features_matrices/features_matrices_'+cve+'.npy', 'rb') as f:
        features_matrices_list = np.load(f,  allow_pickle=True)

    with open(f'{cve}/nodes_targets/nodes_targets_'+cve+'.npy', 'rb') as f:
        nodes_targets_list = np.load(f,  allow_pickle=True)

    print(f"num Graphs : {features_matrices_list.shape[0]}")
    numGraphs_List.append(features_matrices_list.shape[0])

    # Reading the adjacency list of each graph.
    # Graph i is paired with feature matrix i purely by position, so an unreadable file
    # keeps its slot (as None) instead of shifting later graphs onto the wrong features.
    # NOTE(review): the pairing assumes os.listdir yields the files in the same order the
    # .npy files were built with -- confirm against the feature-extraction step.
    adj_Lists = []
    cntUnreadable = 0
    for filename in os.listdir(pathToEdges):
        edgeFile = os.path.join(pathToEdges, filename)
        try:
            df = pd.read_csv(edgeFile, header=None)
        except Exception as err:
            # Best effort: report which file failed and why, then carry on.
            print(f'something wrong while reading {edgeFile}: {err}')
            adj_Lists.append(None)
            cntUnreadable += 1
            continue
        # Rows with a missing endpoint are dropped.
        adj_Lists.append(df.dropna())

    # Every graph of this CVE gets the same class label; the per-node targets are only
    # summed here to report how many nodes are flagged.
    numVulnNodes = 0
    Targets_List = []
    for target in nodes_targets_list[:len(adj_Lists)]:
        numVulnNodes += sum(target)
        Targets_List.append(encClass)

    # Reading the features of each node (one matrix per graph).
    numNodes = 0
    node_Features_List = []
    for node_feature in features_matrices_list:
        n = np.array(node_feature, dtype='int64')
        numNodes += n.shape[0]
        node_Features_List.append(torch.tensor(n, dtype=torch.float))

    # Report (without aborting) when the three per-graph lists disagree in length.
    if not (len(adj_Lists) == len(Targets_List) == len(node_Features_List)):
        print(len(adj_Lists))
        print(len(Targets_List))
        print(len(node_Features_List))

    # Turn each edge table into a transposed, contiguous long tensor, as the
    # torch_geometric documentation requires for edge_index.
    adj_Tensors = []
    for adj in adj_Lists:
        if adj is None:
            adj_Tensors.append(None)
            continue
        adj_Tensors.append(torch.as_tensor(adj.to_numpy(), dtype=torch.long).t().contiguous())

    # Iterate over each graph, make a Data object, then append it to the dataset.
    cntCorruptData = 0
    for i, adj in enumerate(adj_Tensors):
        if adj is None:
            # Already counted in cntUnreadable.
            continue
        d = Data(x=node_Features_List[i], edge_index=adj, y=torch.as_tensor(Targets_List[i]))
        try:
            # Rejects graphs whose edge_index is malformed or refers to nodes without features.
            d.validate(raise_on_error=True)
        except Exception:
            maxNodeIndex = int(adj.max()) if adj.numel() > 0 else None
            print('DIMENSION ERROR')
            print(f"We have features for {len(node_Features_List[i])} Nodes ")
            print(f"But the adjacency list references node indices up to {maxNodeIndex}")
            cntCorruptData += 1
            continue
        allData.append(d)

    print(f"num nodes : {numNodes} of which {numVulnNodes} are vulnerable")
    print(f"number of corrupted files due to missing node features for certain nodes = {cntCorruptData}")
    if cntUnreadable:
        print(f"number of edge files that could not be read = {cntUnreadable}")
    print(f"total number of files {len(adj_Tensors)}")
    print(f"number of files to be trained on = {len(adj_Tensors) - cntCorruptData - cntUnreadable}\n\n")
   

-----------LOADING CVE 36------------------
num Graphs : 3314
num nodes : 9621197 of which 36603 are vulnerable
number of corrupted files due to missing node features for certain nodes = 0
total number of files 3314
number of files to be trained on = 3314


-----------LOADING CVE 127------------------
num Graphs : 2778
num nodes : 6265829 of which 11860 are vulnerable
number of corrupted files due to missing node features for certain nodes = 0
total number of files 2778
number of files to be trained on = 2778


-----------LOADING CVE 124------------------
num Graphs : 2788
num nodes : 5960770 of which 12671 are vulnerable
number of corrupted files due to missing node features for certain nodes = 0
total number of files 2788
number of files to be trained on = 2788


-----------LOADING CVE 195------------------
num Graphs : 1592
num nodes : 3372296 of which 15757 are vulnerable
number of corrupted files due to missing node features for certain nodes = 0
total number of files 1592
number 

#### Computing class weights to be used in a weighted loss function

In [4]:
# Inverse-frequency weighting: weight of a class = total graph count / graph count of that class.
total_Graphs = sum(numGraphs_List)
Class_Weights = list(map(lambda graphsInClass: total_Graphs / graphsInClass, numGraphs_List))
print(f"Weights for each class are:\n {Class_Weights}")

Weights for each class are:
 [3.435727217863609, 4.098632109431246, 4.083931133428981, 7.152010050251256, 12.457330415754923]


### Train-Val-Test-Split and data loader

In [5]:

# Fixed seed so the train/validation/test partition, and every metric computed on it,
# can be reproduced after a kernel restart.
SPLIT_SEED = 42

# Train Test Split: 20% of all graphs are held out for testing.
allData_train, allData_test = train_test_split(
    allData, test_size=0.2, shuffle=True, random_state=SPLIT_SEED
)

# Train Validation Split: 10% of the remaining graphs are used for validation.
allData_train, allData_val = train_test_split(
    allData_train, test_size=0.1, shuffle=True, random_state=SPLIT_SEED
)

# Labels actually present in the training split; the model's class count is taken from this.
unique_classes = {int(d.y) for d in allData_train}

print(f"Unique classes {unique_classes}")


# Our Data Loaders: only the training loader shuffles; evaluation order is kept fixed.
batch_size = 5
loader = DataLoader(allData_train, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(allData_test, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(allData_val, batch_size=batch_size, shuffle=False)

Unique classes {0, 1, 2, 3, 4}


# GCN Class
#### anything related to the architecture

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

numClasses = len(unique_classes)
# Width of the node-feature matrices, read from the data rather than hard-coded.
numFeatures = allData[0].num_node_features
NUM_EPOCHS = 12

model = GCN(hidden_channels=128, numClasses=numClasses, numFeatures=numFeatures).to(device=device)
print(model)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-6)
# Per-class loss weights (total graphs / graphs in that class), so that classes with
# fewer graphs are penalised more heavily when misclassified.
penaltyWeights = torch.tensor(Class_Weights)
criterion = torch.nn.CrossEntropyLoss(weight=penaltyWeights.to(device=device))

for epoch in range(1, NUM_EPOCHS + 1):
    train(model, loader, device, optimizer, criterion)
    train_acc = test(model, loader, device)
    val_acc = test(model, val_loader, device)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Validation Acc: {val_acc:.4f}')

# Saving the model.
# model.cpu() moves the model to the CPU in place, so `model` stays on the CPU afterwards.
with open('GCN_class_g4.pkl', 'wb') as f:
    pickle.dump(model.cpu(), f)


Epoch: 001, Train Acc: 0.7389, Validation Acc: 0.7398
Epoch: 002, Train Acc: 0.7222, Validation Acc: 0.7300
Epoch: 003, Train Acc: 0.7625, Validation Acc: 0.7552
Epoch: 004, Train Acc: 0.8349, Validation Acc: 0.8474
Epoch: 005, Train Acc: 0.8149, Validation Acc: 0.8079
Epoch: 006, Train Acc: 0.8621, Validation Acc: 0.8507
Epoch: 007, Train Acc: 0.8836, Validation Acc: 0.8738
Epoch: 008, Train Acc: 0.8484, Validation Acc: 0.8266
Epoch: 009, Train Acc: 0.8636, Validation Acc: 0.8474
Epoch: 010, Train Acc: 0.9040, Validation Acc: 0.8760
Epoch: 011, Train Acc: 0.8591, Validation Acc: 0.8628
Epoch: 012, Train Acc: 0.9186, Validation Acc: 0.9023


# Testing

In [10]:
from sklearn.metrics import classification_report

# NOTE: pickle.load executes code stored in the file; only load checkpoints you created.
with open('GCN_class_g4.pkl', 'rb') as f:
    model = pickle.load(f)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device=device)

# Collect the true label and the predicted label of every held-out graph.
# `unique_classes` is deliberately not touched here: the training cell reads it, and
# resetting it would make a re-run of training build a model with zero classes.
y_true = []
y_predicted = []
for d in allData_test:
    y_true.append(int(d.y))

    pred, _ = inference(model, d.to(device=device), device=device)
    y_predicted.append(pred)


print(classification_report(y_true=y_true, y_pred=y_predicted))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99       647
           1       0.86      0.76      0.81       546
           2       0.78      0.88      0.83       565
           3       0.99      1.00      1.00       336
           4       0.96      1.00      0.98       184

    accuracy                           0.91      2278
   macro avg       0.92      0.92      0.92      2278
weighted avg       0.91      0.91      0.91      2278



# Inference

### Inference on only 1 datapoint

#### Provide two folders: one containing the node JSON files and one containing the edge CSV files.<br>Do not mix them in a single folder; bad things happen.<br>If you have a .npy file, pass its absolute path, as it makes the script run much faster.

In [None]:
import numpy as np

# Node folder, edge folder and output folder for the sample under test; the Windows
# backslashes are normalised to forward slashes.
pathToUser_Nodes = r"D:\ClassWork\Guardista\4-Features_Extraction\test\457_test\nodes_457_test".replace('\\', '/')
pathToUser_Edges = r"D:\ClassWork\Guardista\4-Features_Extraction\test\457_test\edges_457_test".replace('\\', '/')
outPath = r"D:\ClassWork\Guardista\4-Features_Extraction\test\457_test".replace('\\', '/')
# NOTE(review): this loads 'GCN.pkl', whereas the training cell saves 'GCN_class_g4.pkl' -- confirm the intended model file.
# NOTE(review): multipleFiles is passed as the string 'true', not a bool -- confirm that is what InferenceGCN expects.
# npyPath=None means no precomputed feature .npy is supplied for this run.
classification, Embdding_df = InferenceGCN('GCN.pkl',pathToUser_Nodes, pathToUser_Edges, outPath, multipleFiles='true', npyPath=None)

In [None]:
# First classification result, then a preview of the embeddings, each followed by a rule.
print(
    classification[0],
    '------------------------------------------------',
    Embdding_df.head(),
    '------------------------------------------------',
    sep='\n',
)

#### Reading the outputted CSV

In [None]:
import pandas as pd

# Load the exported embeddings file (it has no header row) and preview the first rows.
newdf = pd.read_csv(
    f"{outPath}/embeddings.csv".replace('\\', '/'),
    header=None,
    index_col=None,
)
newdf.head()

In [None]:


cve = '190'
# The literals already use forward slashes, so no separator normalisation is needed.
outPath = f'D:/ClassWork/Guardista/4-Features_Extraction/{cve}'
gcnModelPath = "D:/ClassWork/Guardista/4-Features_Extraction/GCN_class_g1.pkl"
pathToUser_Edges = f"D:/ClassWork/Guardista/4-Features_Extraction/{cve}/edges_{cve}"

# Precomputed feature matrices for this CVE.
npyPath = f"{outPath}/features_matrices/features_matrices_{cve}.npy"

# Run the g1 model over the CVE's graphs, then export the returned embeddings frame.
classification_1, embeddings_df_1 = InferenceGCN(
    gcnModelPath,
    pathToUser_Edges=pathToUser_Edges,
    outputPath=outPath,
    multipleFiles='true',
    npyPath=npyPath,
    cve=cve,
)
embeddings_df_1.to_csv(f"{outPath}/embeddings_{cve}.csv", header=False, index=False)


number of corrupted files discarded = 0


100%|██████████| 4878/4878 [00:46<00:00, 104.27it/s]


In [None]:
cve = '78'
outPath = 'D:/ClassWork/Guardista/4-Features_Extraction/78'.replace('\\', '/')
# The model file name carries the .pkl extension, like every other model path in this
# notebook; it was previously missing here.
gcnModelPath = "D:/ClassWork/Guardista/4-Features_Extraction/GCN_class_g1_safe.pkl".replace('\\','/')
pathToUser_Edges = "D:/ClassWork/Guardista/4-Features_Extraction/78/edges_78".replace('\\', '/')


# Precomputed feature matrices for this CVE.
npyPath = outPath+'/features_matrices/features_matrices_' + cve + '.npy'


# Run the g1_safe model over the CVE's graphs, then export the returned embeddings frame.
classification_2, embeddings_df_2 = InferenceGCN( gcnModelPath , pathToUser_Edges=pathToUser_Edges , outputPath=outPath, multipleFiles='true', npyPath=npyPath, cve=cve)
embeddings_df_2.to_csv(f"{outPath}/embeddings_{cve}.csv", header=False, index=False)

In [None]:
# The last column of the first frame is carried over as the final column of the merged output.
# (presumably the label column -- confirm against InferenceGCN's output layout)
LabelColumn = embeddings_df_1.iloc[:, -1]

# Build trimmed copies instead of dropping in place: the source frames stay intact, so
# re-running this cell yields the same result instead of stripping further columns.
trimmed_1 = embeddings_df_1.drop(columns=embeddings_df_1.columns[-1])
trimmed_2 = embeddings_df_2.drop(columns=embeddings_df_2.columns[0])
trimmed_2 = trimmed_2.drop(columns=trimmed_2.columns[-1])

# Place both models' embeddings side by side, followed by the carried-over column.
finalDF = pd.concat([trimmed_1, trimmed_2, LabelColumn], axis=1, ignore_index=True)
print(len(finalDF.columns))
finalDF.to_csv(f"{outPath}/concatEmbeddings_{cve}.csv", header=False, index=False)

In [None]:

cve = '78_safe'
# The literals already use forward slashes, so no separator normalisation is needed.
outPath = 'D:/ClassWork/Guardista/4-Features_Extraction/78_safe'
gcnModelPath = "D:/ClassWork/Guardista/4-Features_Extraction/GCN_class_g1.pkl"
pathToUser_Edges = "D:/ClassWork/Guardista/4-Features_Extraction/78_safe/edges_78_safe"

# Precomputed feature matrices for this CVE.
npyPath = f"{outPath}/features_matrices/features_matrices_{cve}.npy"

# Run the g1 model over the CVE's graphs, then export the returned embeddings frame.
classification_1, embeddings_df_1 = InferenceGCN(
    gcnModelPath,
    pathToUser_Edges=pathToUser_Edges,
    outputPath=outPath,
    multipleFiles='true',
    npyPath=npyPath,
    cve=cve,
)
embeddings_df_1.to_csv(f"{outPath}/embeddings_{cve}.csv", header=False, index=False)

In [None]:
cve = '78_safe'
# The literals already use forward slashes, so no separator normalisation is needed.
outPath = 'D:/ClassWork/Guardista/4-Features_Extraction/78_safe'
gcnModelPath = "D:/ClassWork/Guardista/4-Features_Extraction/GCN_class_g1_safe.pkl"
pathToUser_Edges = "D:/ClassWork/Guardista/4-Features_Extraction/78_safe/edges_78_safe"

# Precomputed feature matrices for this CVE.
npyPath = f"{outPath}/features_matrices/features_matrices_{cve}.npy"

# Run the g1_safe model over the CVE's graphs, then export the returned embeddings frame.
classification_2, embeddings_df_2 = InferenceGCN(
    gcnModelPath,
    pathToUser_Edges=pathToUser_Edges,
    outputPath=outPath,
    multipleFiles='true',
    npyPath=npyPath,
    cve=cve,
)
# NOTE(review): the cell that runs GCN_class_g1.pkl for this same cve exports to the same
# file name, so this write overwrites that export -- confirm that is intended.
embeddings_df_2.to_csv(f"{outPath}/embeddings_{cve}.csv", header=False, index=False)

In [None]:
# The last column of the first frame is carried over as the final column of the merged output.
# (presumably the label column -- confirm against InferenceGCN's output layout)
LabelColumn = embeddings_df_1.iloc[:, -1]

# Build trimmed copies instead of dropping in place: the source frames stay intact, so
# re-running this cell yields the same result instead of stripping further columns.
trimmed_1 = embeddings_df_1.drop(columns=embeddings_df_1.columns[-1])
trimmed_2 = embeddings_df_2.drop(columns=embeddings_df_2.columns[0])
trimmed_2 = trimmed_2.drop(columns=trimmed_2.columns[-1])

# Place both models' embeddings side by side, followed by the carried-over column.
finalDF = pd.concat([trimmed_1, trimmed_2, LabelColumn], axis=1, ignore_index=True)
print(len(finalDF.columns))
finalDF.to_csv(f"{outPath}/concatEmbeddings_{cve}.csv", header=False, index=False)