# Binary Graph Classification
#### Done on the smallest class 457<br>using the vulnerable and safe file of this class only

In [1]:
import pandas as pd
import numpy as np
import json
import torch
import os
from torch_geometric.data import Data, Batch
from torch_geometric.nn import GCNConv, Sequential, Linear, global_mean_pool
import pickle
from torch_geometric.data import Dataset
from torch_geometric.loader import DataLoader
import torch.nn.functional as F




from sklearn.model_selection import train_test_split

#from gcnHelpersGraph import GCN, preProcessingOneDataPoint, train, test

# Training on Binary class
#### please put the folders of the CVE's in the script's directory<br>then inside each folder put 2 subfolders, one for the nodes and the other for edges
##### also for the safe class, name the folder 457_SAFE, and inside it nodes_457_SAFE and edges_457_SAFE

In [2]:

#List containing all input data of type Data (Pytorch geometric datastructure that holds a single graph)
allData = []

torch.cuda.empty_cache()

cve = '457'
print(f"-----------LOADING CVE {cve} Vulnerable------------------")


pathToEdges = f"D:/ClassWork/Guardista/4-Features_Extraction/{cve}/edges_{cve}"

with open (f'{cve}/features_matrices/features_matrices_'+cve+'.npy', 'rb') as f:
    features_matrices_list = np.load(f,  allow_pickle=True)

with open(f'{cve}/nodes_targets/nodes_targets_'+cve+'.npy', 'rb') as f:
    nodes_targets_list = np.load(f,  allow_pickle=True)


print(f"num Graphs : {features_matrices_list.shape[0]}")


# Reading the adjacency list of each graph
adj_Lists = []
for filename in os.listdir(pathToEdges):
    f = os.path.join(pathToEdges, filename)
    df = pd.read_csv(f, header=None)
    df.dropna(inplace=True)             # <---- here is the dropping
    adj_Lists.append(df)
    


numVulnNodes = 0


# Reading the label of each node
Targets_List = []
for target in nodes_targets_list:
    numVulnNodes += sum(target)
    Targets_List.append(1)
    

numNodes = 0
# Reading the features of each node
node_Features_List = []
for node_feature in features_matrices_list:
    n = np.array(node_feature,dtype='int64')
    numNodes+= n.shape[0]
    node_Features_List.append(torch.tensor(n, dtype=torch.float))


assert (len(adj_Lists) == len(Targets_List))
assert (len(adj_Lists) == len(node_Features_List))


# Convert the edge list to use 0-based indices
adj_Tensors = []
for adj in adj_Lists:
    # the documentation strictly says to convert the adjacency list to a contiguous list
    adj_Tensors.append(torch.as_tensor(adj.to_numpy(), dtype=torch.long).t().contiguous())   


#Iterate over each graph, make a Data object, then append to all our dataset
cntCorruptData=0
for i, adj in enumerate(adj_Tensors):
    d = Data(x=node_Features_List[i], edge_index=adj, y=torch.as_tensor([1]))  # <--- from the documentation.
    try:
        d.validate(raise_on_error=True)     # <--------- this line makes sure each input graph strictly follows the correct rules, to evade errors
    except:
        print('DIMENSION ERROR')
        print(f"We have features for {len(node_Features_List[i])} Nodes ")
        print(f"But the adjacency list contains {max(set(np.array(adj[0,:])))} Unique Nodes")
        cntCorruptData +=1
        continue
    allData.append(d)


print(f"number of corrupted files due to missing node features for certain nodes = {cntCorruptData}")
   

-----------LOADING CVE 457 Vulnerable------------------
num Graphs : 914
number of corrupted files due to missing node features for certain nodes = 0


### Loading the 457 SAFE dataset

In [3]:
print(f"-----------LOADING CVE {cve} SAFE------------------")


pathToEdges = f"D:/ClassWork/Guardista/4-Features_Extraction/{cve}_SAFE/edges_{cve}_SAFE"

with open (f'{cve}_SAFE/features_matrices/features_matrices_'+cve+'.npy', 'rb') as f:
    features_matrices_list = np.load(f,  allow_pickle=True)

with open(f'{cve}_SAFE/nodes_targets/nodes_targets_'+cve+'.npy', 'rb') as f:
    nodes_targets_list = np.load(f,  allow_pickle=True)


print(f"num Graphs : {features_matrices_list.shape[0]}")


# Reading the adjacency list of each graph
adj_Lists = []
for filename in os.listdir(pathToEdges):
    f = os.path.join(pathToEdges, filename)
    df = pd.read_csv(f, header=None)
    df.dropna(inplace=True)             # <---- here is the dropping
    adj_Lists.append(df)
    


numVulnNodes = 0


# Reading the label of each node
Targets_List = []
for target in nodes_targets_list:
    numVulnNodes += sum(target)
    Targets_List.append(0)
    

numNodes = 0
# Reading the features of each node
node_Features_List = []
for node_feature in features_matrices_list:
    n = np.array(node_feature,dtype='int64')
    numNodes+= n.shape[0]
    node_Features_List.append(torch.tensor(n, dtype=torch.float))


assert (len(adj_Lists) == len(Targets_List))
assert (len(adj_Lists) == len(node_Features_List))


# Convert the edge list to use 0-based indices
adj_Tensors = []
for adj in adj_Lists:
    # the documentation strictly says to convert the adjacency list to a contiguous list
    adj_Tensors.append(torch.as_tensor(adj.to_numpy(), dtype=torch.long).t().contiguous())   


#Iterate over each graph, make a Data object, then append to all our dataset
cntCorruptData=0
for i, adj in enumerate(adj_Tensors):
    d = Data(x=node_Features_List[i], edge_index=adj, y=torch.as_tensor([0]))  # <--- from the documentation.
    try:
        d.validate(raise_on_error=True)     # <--------- this line makes sure each input graph strictly follows the correct rules, to evade errors
    except:
        print('DIMENSION ERROR')
        print(f"We have features for {len(node_Features_List[i])} Nodes ")
        print(f"But the adjacency list contains {max(set(np.array(adj[0,:])))} Unique Nodes")
        cntCorruptData +=1
        continue
    allData.append(d)


print(f"number of corrupted files due to missing node features for certain nodes = {cntCorruptData}")

-----------LOADING CVE 457 SAFE------------------
num Graphs : 913
number of corrupted files due to missing node features for certain nodes = 0


### Train-Test-Split and data loader

In [4]:

# Train Test Split
allData_train, allData_test = train_test_split(allData, test_size=0.3, shuffle=True)


unique_classes = set()

for d in allData_train:
    unique_classes.add(int(d.y.numpy()))

print(f"Unique classes {unique_classes}")


#Our Data Loader
batch_size = 5
loader = DataLoader(allData_train, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(allData_test, batch_size=batch_size, shuffle=True)

Unique classes {0, 1}


# GCN Class

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

numClasses = 2



class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(33, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, numClasses)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        
        return x

model = GCN(hidden_channels=64).to(device=device)
print(model)






optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-5)
criterion = torch.nn.CrossEntropyLoss()





def train():
    model.train()

    for data in loader:  # Iterate in batches over the training dataset.
         out = model(data.x.to(device=device), data.edge_index.to(device=device), data.batch.to(device=device))  # Perform a single forward pass.
         loss = criterion(out, data.y.to(device=device))  # Compute the loss.
         loss.backward()  # Derive gradients.
         optimizer.step()  # Update parameters based on gradients.
         optimizer.zero_grad()  # Clear gradients.



def test(loader):
     model.eval()

     correct = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         out = model(data.x.to(device=device), data.edge_index.to(device=device), data.batch.to(device=device))  
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         correct += int((pred == data.y.to(device=device)).sum())  # Check against ground-truth labels.
     return correct / len(loader.dataset)  # Derive ratio of correct predictions.

def inference(model, data_point):
    model.eval().to(device=device)
    with torch.no_grad():
        x = data_point.x.to(device=device)
        edge_index = data_point.edge_index.to(device=device)
        batch = data_point.batch
        out = model(x, edge_index, batch)
        pred = out.argmax(dim=1).cpu().item()
    return pred


for epoch in range(1, 20):
    train()
    train_acc = test(loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}')





# Saving the model
#with open('GCN.pkl', 'wb') as f:
#    pickle.dump(model.cpu(), f)






GCN(
  (conv1): GCNConv(33, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(64, 2, bias=True)
)
Epoch: 001, Train Acc: 0.5016
Epoch: 002, Train Acc: 0.4984
Epoch: 003, Train Acc: 0.5016
Epoch: 004, Train Acc: 0.4984
Epoch: 005, Train Acc: 0.5016
Epoch: 006, Train Acc: 0.4984
Epoch: 007, Train Acc: 0.4984
Epoch: 008, Train Acc: 0.4984
Epoch: 009, Train Acc: 0.4984
Epoch: 010, Train Acc: 0.5016
Epoch: 011, Train Acc: 0.4984
Epoch: 012, Train Acc: 0.5016
Epoch: 013, Train Acc: 0.5016
Epoch: 014, Train Acc: 0.5016
Epoch: 015, Train Acc: 0.4984
Epoch: 016, Train Acc: 0.5016
Epoch: 017, Train Acc: 0.5016
Epoch: 018, Train Acc: 0.5016
Epoch: 019, Train Acc: 0.4984


# Testing

In [8]:
from sklearn.metrics import classification_report

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device=device)

y_true = []
y_predicted = []
unique_classes = set()
for d in allData_test:
    y_true.append(d.y.cpu().numpy())
    
    pred = inference(model, d.to(device=device))
    y_predicted.append(pred)


print(classification_report(y_true= y_true, y_pred= y_predicted))

              precision    recall  f1-score   support

           0       0.50      1.00      0.67       276
           1       0.00      0.00      0.00       273

    accuracy                           0.50       549
   macro avg       0.25      0.50      0.33       549
weighted avg       0.25      0.50      0.34       549



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
