In [1]:
import torch
from torch_geometric.data import InMemoryDataset
from tqdm import tqdm
from torch_geometric.data import Data
from sklearn.metrics import confusion_matrix, f1_score, \
    accuracy_score, precision_score, recall_score
from torch_geometric.data import Dataset
from sklearn.metrics import roc_auc_score

import clang.cindex 
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc

from sys import platform

import torch
from torch.nn import Linear, Dropout
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool as gap, global_max_pool as gmp
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

In [2]:
# Configure libclang path.
# Maps sys.platform to (label printed for the user, libclang shared-library file).
_LIBCLANG_BY_PLATFORM = {
    "linux": ("Linux", '/usr/lib/llvm-14/lib/libclang.so'),
    "linux2": ("Linux", '/usr/lib/llvm-14/lib/libclang.so'),
    "darwin": ("OS X", '/Library/Developer/CommandLineTools/usr/lib/libclang.dylib'),
    "win32": ("Windows", 'D:/Project/LLVM/bin/libclang.dll'),
}

_libclang_entry = _LIBCLANG_BY_PLATFORM.get(platform)
if _libclang_entry is not None:
    _platform_label, _libclang_file = _libclang_entry
    print(_platform_label)
    # set_library_file() raises once libclang has already been loaded, so skip
    # it when this cell is re-run after the library was used (e.g. after a parse).
    if not clang.cindex.Config.loaded:
        clang.cindex.Config.set_library_file(_libclang_file)

# Config.loaded only becomes True once libclang is actually used for the first
# time (for example by Index.create()), so `False` is the expected value here on
# a fresh kernel; it does not mean the configured path is wrong.
print(clang.cindex.Config.loaded)

Linux
False


In [3]:
def save_ast(node):
    """Cache the children of every node in the subtree on a ``children`` attribute."""
    pending = [node]
    while pending:
        current = pending.pop()
        current.children = list(current.get_children())
        # Reversed push keeps the visit order pre-order, left to right.
        pending.extend(reversed(current.children))

def number_ast_nodes(node, counter=1):
    """Label every node of the subtree with a sequential ``identifier``.

    Numbering is pre-order (a parent before its children, children from left
    to right) and starts at ``counter``. Each visited node also gets its
    ``children`` list refreshed from ``get_children()``.

    Returns:
        The next unused identifier, i.e. ``counter`` plus the number of nodes
        that were labelled.
    """
    next_id = counter
    worklist = [node]
    while worklist:
        current = worklist.pop()
        current.identifier = next_id
        next_id += 1

        current.children = list(current.get_children())
        # Reversed push so the leftmost child is popped (and numbered) first.
        worklist.extend(reversed(current.children))

    return next_id

def generate_edgelist(ast_root):
    """Build the parent -> child edge index of an annotated AST.

    Every node must already carry ``children`` and a 1-based ``identifier``;
    identifiers are shifted to 0-based indices in the result.

    Returns:
        A ``torch.long`` tensor with two rows: source (parent) indices and
        target (child) indices, in depth-first pre-order of the child nodes.
    """
    sources, targets = [], []

    # Stack of (parent, child) pairs; reversed pushes keep left-to-right order.
    pending = [(ast_root, child) for child in reversed(ast_root.children)]
    while pending:
        parent, child = pending.pop()
        sources.append(parent.identifier - 1)
        targets.append(child.identifier - 1)
        pending.extend((child, grandchild) for grandchild in reversed(child.children))

    return torch.tensor([sources, targets], dtype=torch.long)

def generate_features(ast_root):
    """Build the node feature matrix of an annotated AST.

    Every node must already carry ``children`` and ``identifier``. Nodes are
    visited in depth-first pre-order and each contributes one row
    ``[identifier, out_degree]``, where ``out_degree`` is its number of children.

    Returns:
        A ``torch.float`` tensor with one row per node and two columns.
    """
    rows = []
    stack = [ast_root]
    while stack:
        current = stack.pop()
        rows.append([current.identifier, len(current.children)])
        # Reversed push so children are emitted left to right.
        stack.extend(reversed(current.children))

    return torch.tensor(rows, dtype=torch.float)

def clang_process(testcase):
    """Parse one test case with libclang and convert its AST into a graph.

    Args:
        testcase: Object exposing ``filename`` and ``code`` attributes and,
            optionally, a ``vuln`` label (for example a ``TestCase`` instance
            or a DataFrame row with those columns).

    Returns:
        Data: graph whose ``x`` holds one ``[identifier, out_degree]`` row per
        AST node, whose ``edge_index`` holds the parent -> child edges, and
        whose ``y`` is a one-element int64 label tensor, or ``None`` when the
        test case carries no label.
    """
    # The source text is handed to Clang through ``unsaved_files`` so the
    # in-memory code is what gets parsed under the given file name.
    parse_list = [(testcase.filename, testcase.code)]

    index = clang.cindex.Index.create()
    translation_unit = index.parse(path=testcase.filename, unsaved_files=parse_list)
    ast_root = translation_unit.cursor

    # Annotate the tree (children + identifiers) before deriving tensors from it.
    save_ast(ast_root)
    number_ast_nodes(ast_root)

    edges_embedding = generate_edgelist(ast_root)
    nodes_embedding = generate_features(ast_root)

    # A missing label, or a label left at None, yields an unlabeled graph.
    # Previously the missing case left ``y`` unbound and crashed at the return,
    # and a None label crashed inside torch.tensor().
    label = getattr(testcase, "vuln", None)
    if label is None:
        print("Attribute 'vuln' not found in testcase.")
        y = None
    else:
        y = torch.tensor([label], dtype=torch.int64)

    # Clean up Clang objects
    del translation_unit, ast_root, index

    return Data(x=nodes_embedding, edge_index=edges_embedding, y=y)

class GenDataset(Dataset):
    """Graph dataset built from a CSV file, one graph file per CSV row.

    Each row is converted by ``clang_process`` and stored as ``data_<index>.pt``
    inside the processed directory; graphs are loaded back from disk on demand.
    """

    def __init__(self, root, csv_path, transform=None, pre_transform=None):
        """
        Args:
            root (str): Root directory where processed data will be stored.
            csv_path (str): Path to the CSV file containing the dataset.
            transform: Optional transform to be applied to the data.
            pre_transform: Optional pre-transform to be applied to the data.
        """
        # Stored before the base initializer runs, since the properties and
        # process() below read it (presumably during base-class setup).
        self.csv_path = csv_path
        super().__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        """Empty list: no raw files are needed."""
        return []

    @property
    def processed_file_names(self):
        """File names of all processed graphs, one per CSV row.

        The CSV is read on first access and kept on ``self.data``.
        """
        if not hasattr(self, 'data'):
            self.data = pd.read_csv(self.csv_path)
        row_count = len(self.data)
        return [f'data_{i}.pt' for i in range(row_count)]

    def download(self):
        """Nothing to download: the dataset comes from a local CSV file."""
        pass

    def process(self):
        """Re-read the CSV and write one graph file per row."""
        self.data = pd.read_csv(self.csv_path)
        total_rows = len(self.data)
        for row_label, row in tqdm(self.data.iterrows(), total=total_rows):
            graph = clang_process(row)
            target_path = os.path.join(self.processed_dir, f'data_{row_label}.pt')
            torch.save(graph, target_path)

    def len(self):
        """Number of graphs, taken from the processed file name list."""
        return len(self.processed_file_names)

    def get(self, idx):
        """Load the graph stored as ``data_<idx>.pt`` from the processed directory."""
        graph_path = os.path.join(self.processed_dir, f'data_{idx}.pt')
        return torch.load(graph_path, weights_only=False)

In [4]:
import torch
from torch.nn import Linear, Dropout
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool as gap, global_max_pool as gmp
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

# Hyperparameters
# Only `embedding_size` is read by the GCN class defined in this cell (as a
# module-level global); the others are not referenced in this section and are
# presumably consumed by training code elsewhere — confirm before changing.
embedding_size = 128  # width of every GCNConv layer in GCN
dropout_rate = 0.3  # presumably a dropout probability; GCN's dropout lines are commented out
learning_rate = 0.001  # presumably the optimizer learning rate
patience = 5  # presumably early-stopping patience, in epochs
num_epochs = 100  # presumably the maximum number of training epochs

class GCN(torch.nn.Module):
    """Graph-level classifier: three GCN layers, max+mean pooling, three-layer MLP.

    The layer width comes from the module-level ``embedding_size``. The forward
    pass returns one sigmoid-squashed score per graph in the batch.
    """

    def __init__(self, num_features):
        """
        Args:
            num_features: Number of input features per node.
        """
        super().__init__()
        # Seed the global RNG so the layers created below start from the same
        # weights every time the model is constructed.
        torch.manual_seed(42)

        # Graph convolutions: the first maps node features to the embedding
        # width, the following two keep that width.
        self.initial_conv = GCNConv(num_features, embedding_size)
        self.conv1 = GCNConv(embedding_size, embedding_size)
        self.conv2 = GCNConv(embedding_size, embedding_size)

        # Classifier head. Its input is twice the embedding width because the
        # max-pooled and mean-pooled graph vectors are concatenated.
        pooled_width = embedding_size * 2
        self.lin1 = Linear(pooled_width, 128)
        self.lin2 = Linear(128, 128)
        self.lin3 = Linear(128, 1)

        self.act1 = torch.nn.ReLU()
        self.act2 = torch.nn.ReLU()

    def forward(self, x, edge_index, batch_index):
        """Score every graph in the batch.

        Args:
            x: Node feature matrix.
            edge_index: Edge index passed to every GCN layer.
            batch_index: Graph assignment of each node, used by the pooling ops.

        Returns:
            Tensor with one sigmoid output per graph.
        """
        # Message passing: each convolution is followed by a ReLU.
        hidden = x
        for conv in (self.initial_conv, self.conv1, self.conv2):
            hidden = F.relu(conv(hidden, edge_index))

        # Graph readout: max and mean aggregations side by side.
        max_pooled = gmp(hidden, batch_index)
        mean_pooled = gap(hidden, batch_index)
        graph_embedding = torch.cat([max_pooled, mean_pooled], dim=1)

        # MLP head ending in a sigmoid.
        out = self.act1(self.lin1(graph_embedding))
        out = self.act2(self.lin2(out))
        return torch.sigmoid(self.lin3(out))

In [5]:
class TestCase:
    """Plain container describing one source file to analyse.

    Attributes:
        filename: File name associated with the code.
        code: Source text of the file.
        vuln: Vulnerability label; ``None`` when no label is given.
    """

    def __init__(self, filename, code, vuln=None):
        self.filename, self.code, self.vuln = filename, code, vuln

# Load the source code files.
# NOTE: the variable names do not match the files they hold:
# `clear_area_code` is read from no_vuln.c and `hello_world_code` from cwe_119_1.c.
with open('Sourcecode/no_vuln.c', 'r') as f:
    clear_area_code = f.read()

with open('Sourcecode/cwe_119_1.c', 'r') as f:
    hello_world_code = f.read()

# Create labelled test cases: the first is marked non-vulnerable (vuln=False),
# the second vulnerable (vuln=True). The `filename` given here is the name
# handed to clang_process; for the first sample it differs from the path read above.
clear_area_testcase = TestCase(filename='clear_area.c', code=clear_area_code, vuln=False)
vuln_testcase = TestCase(filename='cwe_119_1.c', code=hello_world_code, vuln=True)

# Preprocess the source code files into graph Data objects (x, edge_index, y).
clear_area_data = clang_process(clear_area_testcase)
vuln_data = clang_process(vuln_testcase)

In [6]:
# Show the node feature matrix of each sample; every row is
# [node identifier, out-degree] as produced by generate_features.
for _caption, _graph in (
    ("clear_area.c features:", clear_area_data),
    ("cwe_119_1.c features:", vuln_data),
):
    print(_caption, _graph.x)

clear_area.c features: tensor([[1.0000e+00, 6.8800e+02],
        [2.0000e+00, 0.0000e+00],
        [3.0000e+00, 1.0000e+00],
        ...,
        [2.5800e+03, 0.0000e+00],
        [2.5810e+03, 1.0000e+00],
        [2.5820e+03, 0.0000e+00]])
cwe_119_1.c features: tensor([[ 1.,  1.],
        [ 2.,  2.],
        [ 3.,  0.],
        [ 4.,  4.],
        [ 5.,  2.],
        [ 6.,  0.],
        [ 7.,  1.],
        [ 8.,  0.],
        [ 9.,  1.],
        [10.,  1.],
        [11.,  0.],
        [12.,  2.],
        [13.,  1.],
        [14.,  0.],
        [15.,  1.],
        [16.,  0.],
        [17.,  3.],
        [18.,  0.],
        [19.,  0.],
        [20.,  1.],
        [21.,  0.]])


In [9]:
# Define the model parameters (must match the saved model)
num_features = 2  # one [identifier, out_degree] pair per AST node (see generate_features)
embedding_size = 128  # read by GCN as a module-level global; must match training
# The next three values are not consumed by the GCN class shown in this
# notebook; they are kept as a record of the training configuration.
hidden_size = 128  # Replace with the hidden size used during training
num_gcn_layers = 3  # Replace with the number of GCN layers used during training
dropout_rate = 0.3  # Replace with the dropout rate used during training


# Set device. Device type strings are lowercase: "CPU" is rejected by
# torch.device, which made this cell fail on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Create an instance of the model
model_pred = GCN(num_features)
# Load the saved state dictionary. map_location remaps the stored tensors onto
# `device`, so a checkpoint saved on a GPU also loads on a CPU-only machine.
model_pred.load_state_dict(torch.load("gcn_cwe_119_test_lr_0005.pth", map_location=device))
# Move the model to the appropriate device (CPU or GPU)
model_pred = model_pred.to(device)

# Set the model to evaluation mode (eval() returns the model, so the cell displays it)
model_pred.eval()

GCN(
  (initial_conv): GCNConv(2, 128)
  (conv1): GCNConv(128, 128)
  (conv2): GCNConv(128, 128)
  (lin1): Linear(in_features=256, out_features=128, bias=True)
  (lin2): Linear(in_features=128, out_features=128, bias=True)
  (lin3): Linear(in_features=128, out_features=1, bias=True)
  (act1): ReLU()
  (act2): ReLU()
)