# Graph Classification

In [1]:
import pandas as pd
import numpy as np
import json
import torch
import os
from torch_geometric.data import Data, Batch
from torch_geometric.nn import GCNConv, Sequential, Linear, global_mean_pool
import pickle
from torch_geometric.data import Dataset
from torch_geometric.loader import DataLoader
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

from gcnHelpers import *


# Training on Multiple classes
#### please put the folders of the CVE's in the script's directory<br>then inside each folder put 2 subfolders, one for the nodes and the other for edges
##### example: folder 121 contains 4 subfolders (nodes_121, edges_121, feature_matrices, nodes_targets)<br>and feature_matrices and nodes_targets contain the npy files

In [2]:
CVES_to_train_on = ['121', '191', '401', '457']

#List containing all input data of type Data (Pytorch geometric datastructure that holds a single graph)
allData = []
numGraphs_List = []

torch.cuda.empty_cache()

for encClass, cve in enumerate(CVES_to_train_on):
    print(f"-----------LOADING CVE {cve}------------------")

    
    pathToEdges = f"D:/ClassWork/Guardista/4-Features_Extraction/{cve}/edges_{cve}"

    with open (f'{cve}/features_matrices/features_matrices_'+cve+'.npy', 'rb') as f:
        features_matrices_list = np.load(f,  allow_pickle=True)

    with open(f'{cve}/nodes_targets/nodes_targets_'+cve+'.npy', 'rb') as f:
        nodes_targets_list = np.load(f,  allow_pickle=True)

    
    print(f"num Graphs : {features_matrices_list.shape[0]}")
    numGraphs_List.append(features_matrices_list.shape[0])


    # Reading the adjacency list of each graph
    adj_Lists = []
    for filename in os.listdir(pathToEdges):
        f = os.path.join(pathToEdges, filename)
        df = pd.read_csv(f, header=None)
        df.dropna(inplace=True)             # <---- here is the dropping
        adj_Lists.append(df)
        


    numVulnNodes = 0

    encoded = encClass
    # Reading the label of each node
    Targets_List = []
    for target in nodes_targets_list:
        numVulnNodes += sum(target)
        Targets_List.append(encoded)
        

    numNodes = 0
    # Reading the features of each node
    node_Features_List = []
    for node_feature in features_matrices_list:
        n = np.array(node_feature,dtype='int64')
        numNodes+= n.shape[0]
        node_Features_List.append(torch.tensor(n, dtype=torch.float))


    assert (len(adj_Lists) == len(Targets_List))
    assert (len(adj_Lists) == len(node_Features_List))


    # Convert the edge list to use 0-based indices
    adj_Tensors = []
    for adj in adj_Lists:
        # the documentation strictly says to convert the adjacency list to a contiguous list
        adj_Tensors.append(torch.as_tensor(adj.to_numpy(), dtype=torch.long).t().contiguous())   


    #Iterate over each graph, make a Data object, then append to all our dataset
    cntCorruptData=0
    for i, adj in enumerate(adj_Tensors):
        d = Data(x=node_Features_List[i], edge_index=adj, y=torch.as_tensor(Targets_List[i]))  # <--- from the documentation.
        try:
            d.validate(raise_on_error=True)     # <--------- this line makes sure each input graph strictly follows the correct rules, to evade errors
        except:
            print('DIMENSION ERROR')
            print(f"We have features for {len(node_Features_List[i])} Nodes ")
            print(f"But the adjacency list contains {max(set(np.array(adj[0,:])))} Unique Nodes")
            cntCorruptData +=1
            continue
        allData.append(d)


    print(f"num nodes : {numNodes} of which {numVulnNodes} are vulnerable")
    print(f"number of corrupted files due to missing node features for certain nodes = {cntCorruptData}")
    print(f"total number of files {len(adj_Tensors)}")
    print(f"number of files to be trained on = {len(adj_Tensors) - cntCorruptData}\n\n")
   

-----------LOADING CVE 121------------------
num Graphs : 5940
num nodes : 12564642 of which 20675 are vulnerable
number of corrupted files due to missing node features for certain nodes = 0
total number of files 5940
number of files to be trained on = 5940


-----------LOADING CVE 191------------------
num Graphs : 4013
num nodes : 8937258 of which 18167 are vulnerable
number of corrupted files due to missing node features for certain nodes = 0
total number of files 4013
number of files to be trained on = 4013


-----------LOADING CVE 401------------------
num Graphs : 2261
num nodes : 4553383 of which 8766 are vulnerable
number of corrupted files due to missing node features for certain nodes = 0
total number of files 2261
number of files to be trained on = 2261


-----------LOADING CVE 457------------------
num Graphs : 914
num nodes : 1765018 of which 4138 are vulnerable
number of corrupted files due to missing node features for certain nodes = 0
total number of files 914
number of

#### Computing class weights to be used in a weighted loss function

In [3]:
total_Graphs = sum(numGraphs_List)
Class_Weights = [total_Graphs/i for i in numGraphs_List]
print(f"Weights for each class are:\n {Class_Weights}")

Weights for each class are:
 [2.21010101010101, 3.2713680538250687, 5.806280406899602, 14.36323851203501]


### Train-Val-Test-Split and data loader

In [4]:

# Train Test Split
allData_train, allData_test = train_test_split(allData, test_size=0.2, shuffle=True)

# Train Validation Split
allData_train, allData_val = train_test_split(allData_train, test_size=0.1, shuffle=True)

unique_classes = set()

for d in allData_train:
    unique_classes.add(int(d.y.numpy()))

print(f"Unique classes {unique_classes}")


#Our Data Loader
batch_size = 5
loader = DataLoader(allData_train, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(allData_test, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(allData_val, batch_size=batch_size, shuffle=True)

Unique classes {0, 1, 2, 3}


# GCN Class
#### anything related to the architecture

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

numClasses = len(list(unique_classes))




model = GCN(hidden_channels=64, numClasses=numClasses, numFeatures=33).to(device=device)
print(model)






optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-5)
penaltyWeights = torch.tensor( Class_Weights)  # <--- THOSE WEIGHTS ARE COMPUTED MANUALLY ACCORDING TO AN EQUATION
criterion = torch.nn.CrossEntropyLoss(weight=penaltyWeights.to(device=device))






for epoch in range(1, 13):
    train(model, loader, device, optimizer, criterion)
    train_acc = test(model, loader, device)
    val_acc = test(model, val_loader, device)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Validation Acc: {val_acc:.4f}')





# Saving the model
with open('GCN.pkl', 'wb') as f:
    pickle.dump(model.cpu(), f)






GCN(
  (conv1): GCNConv(33, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(64, 4, bias=True)
)
Epoch: 001, Train Acc: 0.4984, Validation Acc: 0.5090
Epoch: 002, Train Acc: 0.7621, Validation Acc: 0.7536
Epoch: 003, Train Acc: 0.7828, Validation Acc: 0.7840
Epoch: 004, Train Acc: 0.7456, Validation Acc: 0.7431
Epoch: 005, Train Acc: 0.8030, Validation Acc: 0.8002
Epoch: 006, Train Acc: 0.8017, Validation Acc: 0.8021
Epoch: 007, Train Acc: 0.8574, Validation Acc: 0.8516
Epoch: 008, Train Acc: 0.8830, Validation Acc: 0.8677
Epoch: 009, Train Acc: 0.8620, Validation Acc: 0.8592
Epoch: 010, Train Acc: 0.8793, Validation Acc: 0.8839
Epoch: 011, Train Acc: 0.8926, Validation Acc: 0.8887
Epoch: 012, Train Acc: 0.9013, Validation Acc: 0.8991


# Testing

In [6]:
from sklearn.metrics import classification_report

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device=device)

y_true = []
y_predicted = []
unique_classes = set()
for d in allData_test:
    y_true.append(d.y.cpu().numpy())
    
    pred,_ = inference(model, d.to(device=device), device=device)
    y_predicted.append(pred)


print(classification_report(y_true= y_true, y_pred= y_predicted))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1188
           1       0.83      0.92      0.87       767
           2       0.83      0.93      0.88       481
           3       0.46      0.11      0.18       190

    accuracy                           0.90      2626
   macro avg       0.78      0.74      0.73      2626
weighted avg       0.88      0.90      0.88      2626



# Inference

#### You input 2 folders, one folder containing the json of nodes, the other contains the csv for edges.<br>don't mix them in one folder, bad things happen<br>if you have a npy file, please pass its abs path as this file will make the script run much faster

In [7]:
import numpy as np

pathToUser_Nodes = r"D:\ClassWork\Guardista\4-Features_Extraction\test\457_test\nodes_457_test".replace('\\', '/')
pathToUser_Edges = r"D:\ClassWork\Guardista\4-Features_Extraction\test\457_test\edges_457_test".replace('\\', '/')
outPath = r"D:\ClassWork\Guardista\4-Features_Extraction\test\457_test".replace('\\', '/')
classification, embeddings = InferenceGCN(pathToUser_Nodes, pathToUser_Edges, outPath, multipleFiles='true', npyPath=None)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160


In [8]:
print(classification[0])
print('------------------------------------------------')
print(embeddings[0])
print('------------------------------------------------')

1
------------------------------------------------
[-0.01425866  0.00964587 -0.0248956  -0.05527318  0.00654169  0.00597529
  0.04964018  0.07472066  0.2547075   0.05761394 -0.07598666  0.04683467
 -0.1981306  -0.01381772 -0.00594068  0.13489778  0.05207491 -0.04593862
 -0.06843887 -0.17021583  0.02443486 -0.03699596 -0.12537734 -0.01297841
 -0.03448276 -0.03025983  0.10159049 -0.05370921 -0.02003104  0.03488482
  0.24602173 -0.05476311 -0.2632475  -0.03785818  0.16886538  0.07249996
  0.05839108 -0.06247886  0.04957579  0.01944224  0.070255    0.02622986
  0.05911631 -0.08293047 -0.08957326  0.12523521 -0.02537146 -0.03732213
  0.21424401  0.00181731  0.01555901 -0.0749732  -0.02018991 -0.0464328
 -0.19937828  0.03486762 -0.18280306 -0.01232024  0.07163822 -0.11567902
  0.09055584 -0.1092463   0.18654798  0.08254305]
------------------------------------------------


#### Reading the outputted CSV

In [9]:
import pandas as pd
newdf = pd.read_csv(f"{outPath}/embeddings.csv".replace('\\', '/'), header=None, index_col=None)
newdf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
0,CWE457_Use_of_Uninitialized_Variable__char_poi...,-0.014259,0.009646,-0.024896,-0.055273,0.006542,0.005975,0.04964,0.074721,0.254707,...,0.034868,-0.182803,-0.01232,0.071638,-0.115679,0.090556,-0.109246,0.186548,0.082543,CWE457
1,CWE457_Use_of_Uninitialized_Variable__char_poi...,-0.041305,0.023144,-0.03934,-0.073968,-0.017634,-0.016639,0.057879,0.096288,0.250582,...,0.014195,-0.174585,-0.034068,0.086569,-0.122454,0.103462,-0.094001,0.190281,0.103685,CWE457
2,CWE457_Use_of_Uninitialized_Variable__char_poi...,-0.026269,0.013434,-0.03216,-0.061153,-0.002854,-0.003262,0.053972,0.083537,0.255258,...,0.026661,-0.181065,-0.020981,0.076601,-0.119521,0.096819,-0.104968,0.189877,0.092368,CWE457
3,CWE457_Use_of_Uninitialized_Variable__double_0...,-0.06421,0.033937,-0.048949,-0.092147,-0.039621,-0.039377,0.065252,0.116362,0.250451,...,-0.003604,-0.172618,-0.054271,0.099797,-0.127058,0.118339,-0.085054,0.19735,0.123417,CWE457
4,CWE457_Use_of_Uninitialized_Variable__double_0...,-0.039051,0.022689,-0.038575,-0.072574,-0.015784,-0.01441,0.057055,0.094325,0.249551,...,0.015565,-0.173711,-0.032124,0.085513,-0.121968,0.101585,-0.093938,0.188728,0.1016,CWE457
