In [1]:
import pandas as pd
import numpy as np
import json
import torch
import os
from torch_geometric.data import Data, Batch
from torch_geometric.nn import GCNConv, Sequential
import pickle
from torch_geometric.data import Dataset
from torch_geometric.loader import DataLoader
import torch.nn.functional as F




from sklearn.model_selection import train_test_split

from gcnHelpers import GCN, preProcessingOneDataPoint, train, test

# Training on Multiple classes
#### please put the folders of the CVE's in the script's directory<br>then inside each folder put 2 subfolders, one for the nodes and the other for edges

In [2]:
CVES_to_train_on = ['121', '191', '401', '457']

#List containing all input data of type Data (Pytorch geometric datastructure that holds a single graph)
allData = []

torch.cuda.empty_cache()

for encClass, cve in enumerate(CVES_to_train_on):
    print(f"-----------LOADING CVE {cve}------------------")

    
    pathToEdges = f"D:/ClassWork/Guardista/4-Features_Extraction/{cve}/edges_{cve}"

    with open (f'{cve}/features_matrices/features_matrices_'+cve+'.npy', 'rb') as f:
        features_matrices_list = np.load(f,  allow_pickle=True)

    with open(f'{cve}/nodes_targets/nodes_targets_'+cve+'.npy', 'rb') as f:
        nodes_targets_list = np.load(f,  allow_pickle=True)

    
    print(f"num Graphs : {features_matrices_list.shape[0]}")


    # Reading the adjacency list of each graph
    adj_Lists = []
    for filename in os.listdir(pathToEdges):
        f = os.path.join(pathToEdges, filename)
        df = pd.read_csv(f, header=None)
        df.dropna(inplace=True)             # <---- here is the dropping
        adj_Lists.append(df)
        


    numVulnNodes = 0

    encoded = encClass + 1
    # Reading the label of each node
    Targets_List = []
    for target in nodes_targets_list:
        numVulnNodes += sum(target)
        encodedTarget = [encoded if i==1 or i == '1' else 0 for i in target]
        Targets_List.append(np.array(encodedTarget, dtype='int64'))
        

    numNodes = 0
    # Reading the features of each node
    node_Features_List = []
    for node_feature in features_matrices_list:
        n = np.array(node_feature,dtype='int64')
        numNodes+= n.shape[0]
        node_Features_List.append(torch.tensor(n, dtype=torch.float))


    assert (len(adj_Lists) == len(Targets_List))
    assert (len(adj_Lists) == len(node_Features_List))


    # Convert the edge list to use 0-based indices
    adj_Tensors = []
    for adj in adj_Lists:
        # the documentation strictly says to convert the adjacency list to a contiguous list
        adj_Tensors.append(torch.as_tensor(adj.to_numpy(), dtype=torch.long).t().contiguous())   


    #Iterate over each graph, make a Data object, then append to all our dataset
    cntCorruptData=0
    for i, adj in enumerate(adj_Tensors):
        d = Data(x=node_Features_List[i], edge_index=adj, y=torch.as_tensor(Targets_List[i]))  # <--- from the documentation.
        try:
            d.validate(raise_on_error=True)     # <--------- this line makes sure each input graph strictly follows the correct rules, to evade errors
        except:
            print('DIMENSION ERROR')
            print(f"We have features for {len(node_Features_List[i])} Nodes ")
            print(f"But the adjacency list contains {max(set(np.array(adj[0,:])))} Unique Nodes")
            cntCorruptData +=1
            continue
        allData.append(d)


    print(f"num nodes : {numNodes} of which {numVulnNodes} are vulnerable")
    print(f"number of corrupted files due to missing node features for certain nodes = {cntCorruptData}")
    print(f"total number of files {len(adj_Tensors)}")
    print(f"number of files to be trained on = {len(adj_Tensors) - cntCorruptData}\n\n")
   

-----------LOADING CVE 121------------------
num Graphs : 5940
num nodes : 12564642 of which 20675 are vulnerable
number of corrupted files due to missing node features for certain nodes = 0
total number of files 5940
number of files to be trained on = 5940


-----------LOADING CVE 191------------------
num Graphs : 4013
num nodes : 8937258 of which 18167 are vulnerable
number of corrupted files due to missing node features for certain nodes = 0
total number of files 4013
number of files to be trained on = 4013


-----------LOADING CVE 401------------------
num Graphs : 2261
num nodes : 4553383 of which 8766 are vulnerable
number of corrupted files due to missing node features for certain nodes = 0
total number of files 2261
number of files to be trained on = 2261


-----------LOADING CVE 457------------------
num Graphs : 914
num nodes : 1765018 of which 4138 are vulnerable
number of corrupted files due to missing node features for certain nodes = 0
total number of files 914
number of

### Train-Test-Split and data loader

In [3]:

# Train Test Split
allData_train, allData_test = train_test_split(allData, test_size=0.3, shuffle=True)

unique_classes = set()

for d in allData_train:
    unique_classes.update(d.y.numpy())

print(f"Unique classes {unique_classes}")


#Our Data Loader
batch_size = 5
loader = DataLoader(allData_train, batch_size=batch_size, shuffle=True)

Unique classes {0, 1, 2, 3, 4}


# Reading CSVs and npy files
#### NOTICE that the CSV contains some nodes having edges going out to nowhere, I DROP THOSE EDGES COMPLETELY<br>this drop seems to cause some nodes to disappear forever


#### Also notice that some nodes in the edges' file don't have corresponding node features in the npy file

#### Mapping each node in the adjacency list, to its node ID in the node features<br>It is done by getting all unique nodes from the adjacency list, sorting them, then renaming each of them starting from 0

In [None]:
"""
# Determine the unique node IDs in the edge list
appender= np.concatenate([adj_Lists[i][0].to_numpy() for i in range(len(adj_Lists))])
appender2 = np.concatenate([adj_Lists[i][1].to_numpy() for i in range(len(adj_Lists))])
unique_node_ids = np.unique(np.concatenate((appender, appender2)))

# Create a dictionary that maps the old node IDs to new 0-based indices
node_id_to_index = {node_id: i for i, node_id in enumerate(unique_node_ids)}

# Convert the edge list to use 0-based indices
adj_Tensors = []
for adj in adj_Lists:
    adj2 = pd.DataFrame(data=adj)
    adj2.iloc[:,0] = adj.iloc[:,0].apply(lambda x: -1 if x==-1 else node_id_to_index [int(x)])
    adj2.iloc[:,1] = adj.iloc[:,1].apply(lambda x: -1 if x==-1 else node_id_to_index [int(x)])

    # the documentation strictly says to convert the adjacency list to a contiguous list
    adj_Tensors.append(torch.as_tensor(adj2.to_numpy(), dtype=torch.long).t().contiguous())     
    
"""
    

# GCN Class
#### If you want to change something, refer to gcnHelpers.py<br>that script contains literally anything related to the GCN architecture

In [4]:


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GCN(hidden_channels=30, feature_dimension=33, lr=0.001, num_classes=5).to(device=device)
print(model)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-5)
penaltyWeights = torch.tensor( [    1.0018,      1355.54 ,  1533.44,     3153.71,       7580.941  ])  # <--- THOSE WEIGHTS ARE COMPUTED MANUALLY ACC TO AN EQUATION
criterion = torch.nn.CrossEntropyLoss(weight=penaltyWeights.to(device=device))





# Training
for epoch in range(1, 12):
    acc_accum = 0
    for bat, batchData in enumerate(loader):
        for i in range (len(batchData)):
            loss,_ = train(model, batchData[i].to(device=device), optimizer=optimizer, criterion= criterion, device=device)
            test_acc,_,_ = test(model, batchData[i].to(device=device), device=device)
            acc_accum += test_acc
    avg_acc = acc_accum / (batch_size*len(loader))
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, epoch Accuracy: {avg_acc:.4f}')



# Saving the model
with open('GCN.pkl', 'wb') as f:
    pickle.dump(model.cpu(), f)

GCN(
  (conv1): GCNConv(33, 60)
  (conv2): GCNConv(60, 60)
  (conv3): GCNConv(60, 30)
  (conv4): GCNConv(30, 16)
  (conv5): GCNConv(16, 5)
)
Epoch: 001, Loss: 0.1857, epoch Accuracy: 0.9247
Epoch: 002, Loss: 1.6210, epoch Accuracy: 0.9496
Epoch: 003, Loss: 0.0612, epoch Accuracy: 0.9604
Epoch: 004, Loss: 0.0480, epoch Accuracy: 0.9635
Epoch: 005, Loss: 0.0402, epoch Accuracy: 0.9664
Epoch: 006, Loss: 0.4948, epoch Accuracy: 0.9680
Epoch: 007, Loss: 0.0558, epoch Accuracy: 0.9713
Epoch: 008, Loss: 0.3413, epoch Accuracy: 0.9723
Epoch: 009, Loss: 0.0649, epoch Accuracy: 0.9741
Epoch: 010, Loss: 0.1531, epoch Accuracy: 0.9750
Epoch: 011, Loss: 0.0263, epoch Accuracy: 0.9758


# Testing

In [5]:
from sklearn.metrics import classification_report

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device=device)

y_true = []
y_predicted = []
unique_classes = set()
for d in allData_test:
    y_true.append(d.y.cpu().numpy())
    unique_classes.update(list(d.y.cpu().numpy()))
    _,pred,_ = test(model, d.to(device=device), device=device)
    y_predicted.append(pred.cpu().numpy())

print(unique_classes)

y_true = np.concatenate([y_true[i] for i in range(len(y_true))])
y_predicted = np.concatenate([y_predicted[i] for i in range(len(y_predicted))])
print(classification_report(y_true= y_true, y_pred= y_predicted))

{0, 1, 2, 3, 4}
              precision    recall  f1-score   support

           0       1.00      0.98      0.99   8349182
           1       0.08      0.88      0.15      6401
           2       0.09      0.96      0.16      5363
           3       0.03      0.94      0.06      2561
           4       0.13      0.82      0.22      1304

    accuracy                           0.98   8364811
   macro avg       0.27      0.91      0.32   8364811
weighted avg       1.00      0.98      0.99   8364811



# Testing on Entire file Classification

In [7]:
y_true = []
y_predicted = []
for d in allData_test:
    lable = np.max(d.y.cpu().numpy())
    if(lable == 1):
        y_true.append(1)
    elif(lable == 2):
        y_true.append(2)
    elif(lable == 3):
        y_true.append(3)
    elif(lable == 4):
        y_true.append(4)

    
    _,pred,_ = test(model, d.to(device=device), device=device)
    counts = np.bincount(pred.cpu().numpy())

    (count_0, count_1, count_2, count_3, count_4) =    (counts[0],\
                                            0 if len(counts) < 2 else counts[1],\
                                            0 if len(counts) < 3 else counts[2],\
                                            0 if len(counts) < 4 else counts[3], 0 if len(counts) < 5 else counts[4])            #Those were sanity (size) checks
    y_predicted.append( np.argmax(np.array([count_1, count_2, count_3, count_4])) + 1)
    


print(classification_report(y_true= y_true, y_pred= y_predicted))

              precision    recall  f1-score   support

           1       0.94      0.87      0.90      1814
           2       0.95      0.86      0.90      1176
           3       0.65      0.98      0.78       663
           4       1.00      0.63      0.77       286

    accuracy                           0.87      3939
   macro avg       0.88      0.84      0.84      3939
weighted avg       0.90      0.87      0.87      3939



# Inference

### You input 2 folders, one folder containing the json of nodes, the other contains the csv for edges.<br>don't mix them in one folder, bad things happen

In [1]:
import gcnHelpers
import numpy as np

pathToUser_Nodes = r"D:\ClassWork\Guardista\4-Features_Extraction\tst\nod".replace('\\', '/')
pathToUser_Edges = "D:/ClassWork/Guardista/4-Features_Extraction/tst/edg".replace('\\', '/')
classification, embeddings, AvgPooled, MaxPooled = gcnHelpers.InferenceGCN(pathToUser_Nodes, pathToUser_Edges, pathToUser_Nodes, 'false') # <---- Notice I lazily put the output folder to user nodes

In [3]:
print(classification)
print('------------------------------------------------')
print(embeddings)
print('------------------------------------------------')
print(AvgPooled)
print('------------------------------------------------')
print(MaxPooled)


[0 0 0 ... 0 0 0]
------------------------------------------------
[[0.         0.         3.8064592  ... 3.821001   5.447873   0.        ]
 [0.35570067 0.         5.956406   ... 5.3900223  5.8631268  0.        ]
 [0.         0.         8.600882   ... 1.3311294  4.239421   0.        ]
 ...
 [0.         0.         5.048689   ... 5.5136294  5.5096493  0.        ]
 [1.3681096  0.         4.0339794  ... 4.427201   6.1081123  0.        ]
 [0.27392727 0.         5.350784   ... 5.0768337  5.245433   0.        ]]
------------------------------------------------
[0.784646   0.08142199 6.143399   5.2373586  5.606539   6.9538674
 0.01934045 9.283028   0.11684757 0.06016934 0.11146666 8.869153
 8.912067   6.3315077  9.01263    0.05021806]
------------------------------------------------
[ 25.076975    2.29146    64.17055    61.578133   50.404213   66.48347
   2.041499  166.96599     3.8428469   2.3604486   6.6041737  95.02131
 130.35931    69.392456  129.0811      3.4771945]
