In [1]:
import pandas as pd
import numpy as np
import json
import torch
import os
from torch_geometric.data import Data, Batch
from torch_geometric.nn import GCNConv, Sequential
import pickle
from torch_geometric.data import Dataset
from torch_geometric.loader import DataLoader
import torch.nn.functional as F


(4013,)
(2986, 33)


In [None]:


# Input the Folders to train on
cve = '191'
pathToEdges = "D:/ClassWork/Guardista/4-Features_Extraction/191/edges_191"

with open ('191/features_matrices/features_matrices_'+cve+'.npy', 'rb') as f:
    features_matrices_list = np.load(f,  allow_pickle=True)

with open('191/nodes_targets/nodes_targets_'+cve+'.npy', 'rb') as f:
    nodes_targets_list = np.load(f,  allow_pickle=True)

print(features_matrices_list.shape)
#print(features_matrices_list)
print(features_matrices_list[0].shape)


torch.cuda.empty_cache()



# Reading the adjacency list of each graph
adj_Lists = []
for filename in os.listdir(pathToEdges):
    f = os.path.join(pathToEdges, filename)
    df = pd.read_csv(f, header=None)
    df.dropna(inplace=True)             # <---- here is the dropping
    adj_Lists.append(df)
    
# Reading the label of each node
Targets_List = []
for target in nodes_targets_list:
    Targets_List.append(np.array(target, dtype='int64'))
    
# Reading the features of each node
node_Features_List = []
for node_feature in features_matrices_list:
    n = np.array(node_feature,dtype='int64')
    node_Features_List.append(torch.tensor(n, dtype=torch.float))




assert (len(adj_Lists) == len(Targets_List))
assert (len(adj_Lists) == len(node_Features_List))


# Convert the edge list to use 0-based indices
adj_Tensors = []
for adj in adj_Lists:
    # the documentation strictly says to convert the adjacency list to a contiguous list
    adj_Tensors.append(torch.as_tensor(adj.to_numpy(), dtype=torch.long).t().contiguous())   




from sklearn.model_selection import train_test_split

#List containing all input data of type Data (Pytorch geometric datastructure that holds a single graph)
allData = []

#A set to know the number of classes
numClasses = set()

#Iterate over each graph, make a Data object, then append to all our dataset
cntCorruptData=0
for i, adj in enumerate(adj_Tensors):
    numClasses.update(nodes_targets_list[i])
    d = Data(x=node_Features_List[i], edge_index=adj, y=torch.as_tensor(nodes_targets_list[i]))  # <--- from the documentation.
    try:
        d.validate(raise_on_error=True)     # <--------- this line makes sure each input graph strictly follows the correct rules, to evade errors
    except:
        print('DIMENSION ERROR')
        print(f"We have features for {len(node_Features_List[i])} Nodes ")
        print(f"But the adjacency list contains {max(set(np.array(adj[0,:])))} Unique Nodes")
        cntCorruptData +=1
        continue
    allData.append(d)


numClasses = len(numClasses)


print(f"number of corrupted files due to missing node features for certain nodes = {cntCorruptData}")
print(f"total number of files {len(adj_Tensors)}")
print(f"number of files to be trained on = {len(adj_Tensors) - cntCorruptData}")
print(f"we Have {numClasses} Classes")




# 457 CVE

In [None]:


# Input the Folders to train on
cve = '457'
pathToEdges = "D:/ClassWork/Guardista/4-Features_Extraction/457/edges_457"

with open ('457/features_matrices/features_matrices_'+cve+'.npy', 'rb') as f:
    features_matrices_list = np.load(f,  allow_pickle=True)

with open('457/nodes_targets/nodes_targets_'+cve+'.npy', 'rb') as f:
    nodes_targets_list = np.load(f,  allow_pickle=True)

print(features_matrices_list.shape)
#print(features_matrices_list)
print(features_matrices_list[0].shape)


torch.cuda.empty_cache()



# Reading the adjacency list of each graph
adj_Lists = []
for filename in os.listdir(pathToEdges):
    f = os.path.join(pathToEdges, filename)
    df = pd.read_csv(f, header=None)
    df.dropna(inplace=True)             # <---- here is the dropping
    adj_Lists.append(df)
    
# Reading the label of each node
Targets_List = []
for target in nodes_targets_list:
    encodedTarget = [2 if i else 0 for i in target]
    Targets_List.append(np.array(encodedTarget, dtype='int64'))
    
# Reading the features of each node
node_Features_List = []
for node_feature in features_matrices_list:
    n = np.array(node_feature,dtype='int64')
    node_Features_List.append(torch.tensor(n, dtype=torch.float))




assert (len(adj_Lists) == len(Targets_List))
assert (len(adj_Lists) == len(node_Features_List))


# Convert the edge list to use 0-based indices
adj_Tensors = []
for adj in adj_Lists:
    # the documentation strictly says to convert the adjacency list to a contiguous list
    adj_Tensors.append(torch.as_tensor(adj.to_numpy(), dtype=torch.long).t().contiguous())   




from sklearn.model_selection import train_test_split

#List containing all input data of type Data (Pytorch geometric datastructure that holds a single graph)


#A set to know the number of classes
numClasses = set()

#Iterate over each graph, make a Data object, then append to all our dataset
cntCorruptData=0
for i, adj in enumerate(adj_Tensors):
    numClasses.update(nodes_targets_list[i])
    d = Data(x=node_Features_List[i], edge_index=adj, y=torch.as_tensor(nodes_targets_list[i]))  # <--- from the documentation.
    try:
        d.validate(raise_on_error=True)     # <--------- this line makes sure each input graph strictly follows the correct rules, to evade errors
    except:
        print('DIMENSION ERROR')
        print(f"We have features for {len(node_Features_List[i])} Nodes ")
        print(f"But the adjacency list contains {max(set(np.array(adj[0,:])))} Unique Nodes")
        cntCorruptData +=1
        continue
    allData.append(d)


numClasses = 3


print(f"number of corrupted files due to missing node features for certain nodes = {cntCorruptData}")
print(f"total number of files {len(adj_Tensors)}")
print(f"number of files to be trained on = {len(adj_Tensors) - cntCorruptData}")
print(f"we Have {numClasses} Classes")


# Train Test Split
allData_train, allData_test = train_test_split(allData, test_size=0.3)

#Our Data Loader
batch_size = 5
loader = DataLoader(allData_train, batch_size=batch_size, shuffle=True)

# Reading CSVs and npy files
#### NOTICE that the CSV contains some nodes having edges going out to nowhere, I DROP THOSE EDGES COMPLETELY<br>this drop seems to cause some nodes to disappear forever


#### Also notice that some nodes in the edges' file don't have corresponding node features in the npy file

#### Mapping each node in the adjacency list, to its node ID in the node features<br>It is done by getting all unique nodes from the adjacency list, sorting them, then renaming each of them starting from 0

In [3]:
"""
# Determine the unique node IDs in the edge list
appender= np.concatenate([adj_Lists[i][0].to_numpy() for i in range(len(adj_Lists))])
appender2 = np.concatenate([adj_Lists[i][1].to_numpy() for i in range(len(adj_Lists))])
unique_node_ids = np.unique(np.concatenate((appender, appender2)))

# Create a dictionary that maps the old node IDs to new 0-based indices
node_id_to_index = {node_id: i for i, node_id in enumerate(unique_node_ids)}

# Convert the edge list to use 0-based indices
adj_Tensors = []
for adj in adj_Lists:
    adj2 = pd.DataFrame(data=adj)
    adj2.iloc[:,0] = adj.iloc[:,0].apply(lambda x: -1 if x==-1 else node_id_to_index [int(x)])
    adj2.iloc[:,1] = adj.iloc[:,1].apply(lambda x: -1 if x==-1 else node_id_to_index [int(x)])

    # the documentation strictly says to convert the adjacency list to a contiguous list
    adj_Tensors.append(torch.as_tensor(adj2.to_numpy(), dtype=torch.long).t().contiguous())     
    
"""
    

'\n# Determine the unique node IDs in the edge list\nappender= np.concatenate([adj_Lists[i][0].to_numpy() for i in range(len(adj_Lists))])\nappender2 = np.concatenate([adj_Lists[i][1].to_numpy() for i in range(len(adj_Lists))])\nunique_node_ids = np.unique(np.concatenate((appender, appender2)))\n\n# Create a dictionary that maps the old node IDs to new 0-based indices\nnode_id_to_index = {node_id: i for i, node_id in enumerate(unique_node_ids)}\n\n# Convert the edge list to use 0-based indices\nadj_Tensors = []\nfor adj in adj_Lists:\n    adj2 = pd.DataFrame(data=adj)\n    adj2.iloc[:,0] = adj.iloc[:,0].apply(lambda x: -1 if x==-1 else node_id_to_index [int(x)])\n    adj2.iloc[:,1] = adj.iloc[:,1].apply(lambda x: -1 if x==-1 else node_id_to_index [int(x)])\n\n    # the documentation strictly says to convert the adjacency list to a contiguous list\n    adj_Tensors.append(torch.as_tensor(adj2.to_numpy(), dtype=torch.long).t().contiguous())     \n    \n'

### Finalizing our input dataset

number of corrupted files due to missing node features for certain nodes = 0
total number of files 4013
number of files to be trained on = 4013
we Have 2 Classes


# GCN Class

In [6]:

# Customize this as you wish, just make sure the first conv layer takes feature_dimension, and the last hidden conv layer outputs num_classes
class GCN(torch.nn.Module):
    def __init__(self, hidden_channels , feature_dimension, num_classes):
        super().__init__()
        self.conv1 = GCNConv(feature_dimension, 60)
        self.conv2 = GCNConv(60, 50)
        self.conv3 = GCNConv(50, 40)
        self.conv4 = GCNConv(40, 16)
        self.conv5 = GCNConv(16, num_classes)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=0.005, weight_decay=5e-4)
    
    def forward(self, x, edge_index):
        # x: Node feature matrix 
        # edge_index: Graph connectivity matrix 
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.conv3(x, edge_index)
        x = F.relu(x)
        x = self.conv4(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        Embedding = x
        x = self.conv5(x, edge_index)
        return x, F.log_softmax(x, dim=1), Embedding   # X is used for the loss computation,  F.log_softmax is the classification, Embedding is the emb

    
    
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GCN(hidden_channels=30, feature_dimension=33, num_classes=2).to(device=device)
print(model)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-5)
penaltyWeights = torch.tensor([0.50102, 1000.5859])
criterion = torch.nn.CrossEntropyLoss(weight=penaltyWeights.to(device=device))

def train(model, data):
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    
    out = model(data.x.to(device=device), data.edge_index.to(device=device))  # Perform a single forward pass.
    loss = criterion(out[1], data.y.to(device=device))  # Compute the loss solely based on the training nodes.
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    embOut = out[2]
    return loss, embOut



def test(model, data):
    model.eval()
    out = model(data.x.to(device=device), data.edge_index.to(device=device))
    pred = out[1].argmax(dim=1)  # Use the class with highest probability.
    test_correct = pred == data.y  # Check against ground-truth labels for test nodes.
    test_acc = int(test_correct.sum()) / int(len(data.x))  # Derive ratio of correct predictions.

    EmbOutput = out[2]
    return test_acc, pred, EmbOutput


# Training
for epoch in range(1, 2):
    acc_accum = 0
    for bat, batchData in enumerate(loader):
        for i in range (len(batchData)):
            loss,_ = train(model, batchData[i].to(device=device))
            test_acc,_,_ = test(model, batchData[i].to(device=device))
            acc_accum += test_acc
    avg_acc = acc_accum / (batch_size*len(loader))
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, epoch Accuracy: {avg_acc:.4f}')



# Saving the model
#with open('GCN.pkl', 'wb') as f:
#    pickle.dump(model, f)

GCN(
  (conv1): GCNConv(33, 60)
  (conv2): GCNConv(60, 50)
  (conv3): GCNConv(50, 40)
  (conv4): GCNConv(40, 16)
  (conv5): GCNConv(16, 2)
)
Epoch: 001, Loss: 0.0403, epoch Accuracy: 0.9170


# Testing

In [9]:
from sklearn.metrics import classification_report

y_true = []
y_predicted = []
for d in allData_test:
    y_true.append(d.y.cpu().numpy())
    _,pred,_ = test(model, d.to(device=device))
    y_predicted.append(pred.cpu().numpy())


y_true = np.concatenate([y_true[i] for i in range(len(y_true))])
y_predicted = np.concatenate([y_predicted[i] for i in range(len(y_predicted))])
print(classification_report(y_true= y_true, y_pred= y_predicted))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99   2682307
           1       0.13      1.00      0.23      5459

    accuracy                           0.99   2687766
   macro avg       0.57      0.99      0.61   2687766
weighted avg       1.00      0.99      0.99   2687766



# Testing on Entire file Classification

In [10]:
y_true = []
y_predicted = []
for d in allData_test:
    if(np.sum(d.y.cpu().numpy(), dtype=np.int32) > 0):
        y_true.append(1)
    else:
        y_true.append(0)
    
    _,pred,_ = test(model, d.to(device=device))
    if(np.sum(pred.cpu().numpy(), dtype=np.int32) > 0):
        y_predicted.append(1)
    else:
        y_predicted.append(0)
    


print(classification_report(y_true= y_true, y_pred= y_predicted))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00      1204

    accuracy                           1.00      1204
   macro avg       1.00      1.00      1.00      1204
weighted avg       1.00      1.00      1.00      1204



# Inference
### First, Let's define some a function encapsulating all the preprocessing steps done<br>FOR A SINGLE GRAPH ONLY

In [None]:
from statFeaturesUtil import features_per_graph_per_node

def preProcessingOneDataPoint(pathToUserNodes, pathToUserEdges):

    print(pathToUserNodes)
    features_matrices_list = features_per_graph_per_node(pathToUserNodes)


    for filename in os.listdir(pathToUserEdges):
        f = str(os.path.join(pathToUserEdges, filename)).replace('\\', '/')
        pddf = pd.read_csv(f, header=None)
        
        pddf.dropna(inplace=True)
        #df.rename(index={0: "source", 1: "target"})
        adj_Lists = pddf
        print(pddf.head())

        
    
    n = np.array(features_matrices_list[0],dtype='int64')
    node_Features_List = torch.tensor(n, dtype=torch.float)



    # Determine the unique node IDs in the edge list
    unique_node_ids=  np.unique(np.concatenate((adj_Lists.iloc[:, 0].to_numpy(), adj_Lists.iloc[:, 1].to_numpy())  ))

    # Create a dictionary that maps the old node IDs to new 0-based indices
    node_id_to_index = {node_id: i for i, node_id in enumerate(unique_node_ids)}

    # Convert the edge list to use 0-based indices
    
        
    #print(df[df.columns[1]].isna().sum())
    if(adj_Lists.loc[:,1].isna().any()):
        print(adj_Lists[adj_Lists[adj_Lists.columns[1]].isna()])
    #print(adj[:,0])
    
    adj2 = pd.DataFrame(data=adj_Lists)
    adj2.iloc[:,0] = adj_Lists.iloc[:,0].apply(lambda x: -1 if x==-1 else node_id_to_index [int(x)])
    adj2.iloc[:,1] = adj_Lists.iloc[:,1].apply(lambda x: -1 if x==-1 else node_id_to_index [int(x)])

    adj_Tensors = torch.as_tensor(adj2.to_numpy(), dtype=torch.long).t().contiguous()


        

    d = Data(x=node_Features_List, edge_index=adj_Tensors, y=0)
    try:
        d.validate(raise_on_error=True)
    except:
        print('DIMENSION ERROR')
        print(f"We have features for {len(node_Features_List[i])} Nodes ")
        print(f"But the adjacency list contains {max(set(np.array(adj[0,:])))} Nodes")
        return False
    

    return d

### You input 2 folders, one folder containing the json of nodes, the other contains the csv for edges.<br>don't mix them in one folder, bad things happen

In [None]:


pathToUser_Nodes = r"D:\ClassWork\Guardista\4-Features_Extraction\tst\nod".replace('\\', '/')
pathToUser_Edges = "D:/ClassWork/Guardista/4-Features_Extraction/tst/edg".replace('\\', '/')

inputPoint = preProcessingOneDataPoint(pathToUser_Nodes, pathToUser_Edges)
with open('GCN.pkl', 'rb') as f:
    model = pickle.load(f)
_,classification,Embedding = test(model, inputPoint)

In [None]:
print(classification)

print(Embedding)

print(Embedding.shape)