In [None]:
# Install the Python dependencies used by this notebook.
# `%pip` installs into the environment of the running kernel, whereas `!pip3`
# runs whichever pip3 is first on PATH and may target a different interpreter.
# NOTE(review): later cells also import pandas, tqdm and clang.cindex, which are
# not installed here — confirm the target environment already provides them.
%pip install scikit-learn
%pip install torch
%pip install torch_geometric
%pip install matplotlib
%pip install tensorflow

In [None]:
# Install clang through apt (a no-op when it is already installed).
# -y answers the confirmation prompt, which a notebook cell has no way to answer.
!apt-get install -y clang

In [None]:
%%bash
# Stop at the first failing command: without this, a failed clone makes `cd`
# fail too, and the sparse-checkout below would then run against whatever
# repository the current directory happens to belong to.
set -euo pipefail

# Shallow, blob-less, sparse clone; only the directories selected below are checked out.
git clone --depth=1 --filter=blob:none --sparse https://github.com/LunarArtemis/VulScanner.git
cd VulScanner
git sparse-checkout set Data Model

Cloning into 'VulScanner'...
fetch-pack: unexpected disconnect while reading sideband packet


Process is interrupted.


In [None]:
# Move the checked-out folders next to the notebook.
# `mv` moves directories as-is and has no -r option (passing it makes mv fail).
!mv VulScanner/Data VulScanner/Model .

In [None]:
import torch
from torch_geometric.data import InMemoryDataset
from tqdm import tqdm
from torch_geometric.data import Data
from sklearn.metrics import confusion_matrix, f1_score, \
    accuracy_score, precision_score, recall_score
from torch_geometric.data import Dataset
from sklearn.metrics import roc_auc_score

import clang.cindex 
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc

from sys import platform

import torch
from torch.nn import Linear, Dropout
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool as gap, global_max_pool as gmp
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

In [None]:
# Configure libclang path
# Config.set_library_file() raises once libclang has been loaded, so the path is
# only applied while the library is still unloaded. This keeps the cell safe to
# re-run after later cells have already parsed code.
if clang.cindex.Config.loaded:
    print("libclang already loaded; keeping the current configuration")
elif platform == "linux" or platform == "linux2":
    print("Linux")
    clang.cindex.Config.set_library_file('/usr/lib/llvm-14/lib/libclang.so') # SWU Server
    # clang.cindex.Config.set_library_file('/usr/local/lib/python3.11/dist-packages/clang/native/libclang.so') # Google Colab
elif platform == "darwin":
    print("OS X")
    clang.cindex.Config.set_library_file('/Library/Developer/CommandLineTools/usr/lib/libclang.dylib') # Mac
elif platform == "win32":
    print("Windows")
    clang.cindex.Config.set_library_file('D:/Project/LLVM/bin/libclang.dll') # Windows

# Report whether libclang has been loaded so far.
# NOTE(review): the bindings appear to load the shared library lazily on first
# use, so this can print False here even when the configured path is valid.
print(clang.cindex.Config.loaded)

In [None]:
def save_ast(node):
    """Cache the children of every node in the AST on a ``children`` attribute.

    The tree is walked with an explicit stack rather than recursion, so a very
    deep AST cannot exceed Python's recursion limit.

    Args:
        node: Root of the (sub)tree. Every visited node must provide
            ``get_children()``; its result is stored as a list in ``children``.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        current.children = list(current.get_children())
        # Reversed so the left-most child is popped first (pre-order, like the
        # recursive formulation).
        pending.extend(reversed(current.children))

def number_ast_nodes(node, counter=1):
    """Assign consecutive identifiers to every node of the AST in pre-order.

    Each visited node receives an ``identifier`` attribute and a ``children``
    attribute (a fresh list built from ``get_children()``). The traversal uses
    an explicit stack instead of recursion, so deep ASTs cannot exceed
    Python's recursion limit.

    Args:
        node: Root of the (sub)tree to number.
        counter: Identifier given to ``node``; descendants get the following
            integers.

    Returns:
        The next unused identifier, i.e. ``counter`` plus the number of nodes
        visited.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        current.identifier = counter
        counter += 1

        current.children = list(current.get_children())
        # Reversed so the left-most child is numbered first, which keeps the
        # same pre-order numbering as a recursive walk.
        pending.extend(reversed(current.children))

    return counter

def generate_edgelist(ast_root):
    """Build the parent -> child edge list of an already numbered AST.

    Every node reachable from ``ast_root`` must carry ``identifier`` and
    ``children`` attributes. Identifiers are shifted by one so that they become
    zero-based indices. The walk uses an explicit stack instead of recursion,
    so deep ASTs cannot exceed Python's recursion limit.

    Args:
        ast_root: Root node of the numbered AST.

    Returns:
        A ``torch.long`` tensor with two rows: source indices and target
        indices, one column per edge, in depth-first pre-order.
    """
    sources, targets = [], []

    # Stack of (parent, child) pairs; reversed so the left-most child is
    # handled first and the edge order matches a recursive pre-order walk.
    pending = [(ast_root, child) for child in reversed(ast_root.children)]
    while pending:
        parent, child = pending.pop()
        sources.append(parent.identifier - 1)
        targets.append(child.identifier - 1)
        pending.extend((child, grandchild) for grandchild in reversed(child.children))

    return torch.tensor([sources, targets], dtype=torch.long)

def generate_features(ast_root):
    """Build the node feature matrix of an already numbered AST.

    Every node reachable from ``ast_root`` must carry ``identifier`` and
    ``children`` attributes. Each node contributes one row
    ``[identifier, number_of_children]``; rows are emitted in depth-first
    pre-order. The walk uses an explicit stack instead of recursion, so deep
    ASTs cannot exceed Python's recursion limit.

    Args:
        ast_root: Root node of the numbered AST.

    Returns:
        A ``torch.float`` tensor with one row per node.
    """
    features = []

    pending = [ast_root]
    while pending:
        current = pending.pop()
        features.append([current.identifier, len(current.children)])
        # Reversed so the left-most child is emitted first (pre-order).
        pending.extend(reversed(current.children))

    return torch.tensor(features, dtype=torch.float)

def clang_process(testcase):
    """Parse a test case with Clang and convert its AST into a graph.

    Args:
        testcase: Object exposing ``filename`` and ``code`` attributes and,
            optionally, a ``vuln`` label. The source text is handed to libclang
            through ``unsaved_files`` under the name ``filename``.

    Returns:
        A ``Data`` object whose ``x`` holds the node features, ``edge_index``
        the parent -> child edges and ``y`` the label as a one-element int64
        tensor. ``y`` is ``None`` when the test case carries no ``vuln``
        attribute.
    """
    parse_list = [(testcase.filename, testcase.code)]

    index = clang.cindex.Index.create()
    translation_unit = index.parse(path=testcase.filename, unsaved_files=parse_list)
    ast_root = translation_unit.cursor

    save_ast(ast_root)
    number_ast_nodes(ast_root)

    edges_embedding = generate_edgelist(ast_root)
    nodes_embedding = generate_features(ast_root)

    # Attach the label when there is one. An unlabelled test case still yields
    # a usable graph (y=None) instead of failing on an unbound variable.
    if hasattr(testcase, "vuln"):
        y = torch.tensor([testcase.vuln], dtype=torch.int64)
    else:
        print("Attribute 'vuln' not found in testcase.")
        y = None

    # Clean up Clang objects
    del translation_unit, ast_root, index

    return Data(x=nodes_embedding, edge_index=edges_embedding, y=y)

class GenDataset(Dataset):
    """Dataset that stores one processed graph file per row of a CSV file."""

    def __init__(self, root, csv_path, transform=None, pre_transform=None):
        """
        Args:
            root (str): Root directory where processed data will be stored.
            csv_path (str): Path to the CSV file containing the dataset.
            transform: Optional transform to be applied to the data.
            pre_transform: Optional pre-transform to be applied to the data.
        """
        # Assigned before the parent constructor runs; presumably the base
        # class consults the properties below during initialisation — confirm.
        self.csv_path = csv_path
        super().__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        """Return an empty list: this dataset has no raw files."""
        return []

    @property
    def processed_file_names(self):
        """Return one ``data_<i>.pt`` name per CSV row, reading the CSV on first use."""
        if not hasattr(self, 'data'):
            self.data = pd.read_csv(self.csv_path)
        row_count = len(self.data)
        return [f'data_{i}.pt' for i in range(row_count)]

    def download(self):
        """Do nothing; the data comes from a local CSV file."""
        pass

    def process(self):
        """Re-read the CSV and save one graph per row into the processed directory."""
        self.data = pd.read_csv(self.csv_path)
        rows = tqdm(self.data.iterrows(), total=len(self.data))
        for index, vuln in rows:
            graph = clang_process(vuln)
            target_path = os.path.join(self.processed_dir, f'data_{index}.pt')
            torch.save(graph, target_path)

    def len(self):
        """Return the number of processed graph files."""
        return len(self.processed_file_names)

    def get(self, idx):
        """Load the graph stored under index ``idx`` from disk."""
        graph_path = os.path.join(self.processed_dir, f'data_{idx}.pt')
        return torch.load(graph_path, weights_only=False)

In [None]:
import torch
from torch.nn import Linear, Dropout
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool as gap, global_max_pool as gmp
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

# Hyperparameters
# embedding_size is the output width of every GCNConv layer in GCN; the first
# linear layer takes twice this width.
embedding_size = 128
# NOTE(review): dropout_rate, learning_rate, patience and num_epochs are not
# referenced by any cell visible here — presumably used by training code
# elsewhere; confirm before removing.
dropout_rate = 0.3
learning_rate = 0.001
patience = 5
num_epochs = 100

class GCN(torch.nn.Module):
    """Graph-level binary classifier: three GCN layers, max+mean pooling, MLP head.

    NOTE(review): a whole model object is loaded with ``torch.load`` elsewhere in
    this notebook; presumably it is an instance of this class, in which case the
    attribute names below must stay as they are — confirm before renaming.
    """

    def __init__(self,num_features):
        """Create the layers.

        Args:
            num_features: Number of input features per node.
        """
        super().__init__()
        # Seed before creating any layer so the initial weights are reproducible.
        torch.manual_seed(42)

        # Graph convolutions, all producing `embedding_size` channels.
        self.initial_conv = GCNConv(num_features, embedding_size)
        self.conv1 = GCNConv(embedding_size, embedding_size)
        self.conv2 = GCNConv(embedding_size, embedding_size)

        # Classifier head; its input is the max-pooled and mean-pooled
        # embeddings concatenated, hence twice the embedding width.
        self.lin1 = Linear(embedding_size*2, 128)
        self.lin2 = Linear(128, 128)
        self.lin3 = Linear(128, 1)

        self.act1 = torch.nn.ReLU()
        self.act2 = torch.nn.ReLU()

    def forward(self, x, edge_index, batch_index):
        """Return the sigmoid output of the classifier head.

        Args:
            x: Node feature matrix.
            edge_index: Edge list passed to every GCN layer.
            batch_index: Batch vector passed to both global pooling operators.
        """
        # Three graph convolutions, each followed by ReLU.
        node_state = x
        for conv in (self.initial_conv, self.conv1, self.conv2):
            node_state = F.relu(conv(node_state, edge_index))

        # Global pooling: max-pool first, mean-pool second, concatenated per graph.
        pooled_max = gmp(node_state, batch_index)
        pooled_mean = gap(node_state, batch_index)
        graph_state = torch.cat([pooled_max, pooled_mean], dim=1)

        # MLP head followed by a sigmoid.
        head = self.act1(self.lin1(graph_state))
        head = self.act2(self.lin2(head))
        return torch.sigmoid(self.lin3(head))

In [None]:
import torch
import time

# Set device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the model
# map_location remaps the stored tensors onto `device`, so a checkpoint saved on
# a GPU still loads on a CPU-only machine.
# NOTE: weights_only=False unpickles arbitrary Python objects; only load
# checkpoint files that come from a trusted source.
model_pred = torch.load(
    "Model/full_gcn_cwe_119_test_lr_0001_p5_best.pth",
    map_location=device,
    weights_only=False,
)
model_pred.to(device)
model_pred.eval()  # Set the model to evaluation mode

class TestCase:
    """Plain record describing one source file to analyse.

    Attributes are ``filename`` (name the code is parsed under), ``code``
    (the source text) and ``vuln`` (label, ``False`` unless given).
    """

    def __init__(self, filename, code, vuln=False): # Default value for vuln is False
        self.filename, self.code, self.vuln = filename, code, vuln

# Load the source code files
with open('Data/nonvuln/no_vuln.c', 'r') as f:
    clear_area_code = f.read()

with open('Data/vuln/cwe_119_1.c', 'r') as f:
    cwe_119_1_code = f.read()

with open('Data/vuln/cwe_119_2.c', 'r') as f:
    cwe_119_2_code = f.read()

with open('Data/vuln/cwe_119_3.c', 'r') as f:
    cwe_119_3_code = f.read()

with open('Data/vuln/cwe_119_4.c', 'r') as f:
    cwe_119_4_code = f.read()

# Create test case objects
# The samples read from Data/vuln are labelled vuln=True so the ground-truth
# `y` stored on their graphs matches the folder they come from; only the
# Data/nonvuln sample is labelled False.
# NOTE(review): the non-vulnerable sample is parsed under the name
# 'clear_area.c' although it is read from no_vuln.c — confirm this is intended.
clear_area_testcase = TestCase(filename='clear_area.c', code=clear_area_code, vuln=False)
vuln_testcase1 = TestCase(filename='cwe_119_1.c', code=cwe_119_1_code, vuln=True)
vuln_testcase2 = TestCase(filename='cwe_119_2.c', code=cwe_119_2_code, vuln=True)
vuln_testcase3 = TestCase(filename='cwe_119_3.c', code=cwe_119_3_code, vuln=True)
vuln_testcase4 = TestCase(filename='cwe_119_4.c', code=cwe_119_4_code, vuln=True)

# Preprocess the source code files
clear_area_data = clang_process(clear_area_testcase)
vuln1_data = clang_process(vuln_testcase1)
vuln2_data = clang_process(vuln_testcase2)
vuln3_data = clang_process(vuln_testcase3)
vuln4_data = clang_process(vuln_testcase4)

# print("clear_area.c features:", clear_area_data.x)
# print("cwe_119_1.c features:", vuln1_data.x)
# print("cwe_119_2.c features:", vuln2_data.x)
# print("cwe_119_3.c features:", vuln3_data.x)

def predict_vulnerability(data, num_runs=5):
    """Run the loaded model on one graph several times and time each forward pass.

    Args:
        data: Graph to classify; must provide ``x``, ``edge_index`` and
            ``batch`` and support ``.to(device)``.
        num_runs: Number of timed forward passes (at least 1).

    Returns:
        Tuple ``(probability, prediction, avg_time_ms)``: the model output of
        the last run as a float, that output rounded to 0 or 1, and the mean
        forward-pass time in milliseconds.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    data = data.to(device)
    # CUDA kernels are launched asynchronously, so the clock is only read after
    # the device has finished its pending work.
    on_gpu = device.type == "cuda"

    times = []
    with torch.no_grad():
        for _ in range(num_runs):
            if on_gpu:
                torch.cuda.synchronize(device)
            # perf_counter is a monotonic, high-resolution clock meant for
            # measuring short durations.
            start_time = time.perf_counter()

            output = model_pred(data.x.float(), data.edge_index, data.batch) # Saved train model

            if on_gpu:
                torch.cuda.synchronize(device)
            end_time = time.perf_counter()

            elapsed_time_ms = (end_time - start_time) * 1000  # Convert to milliseconds
            times.append(elapsed_time_ms)

        probability = output.item()  # Get probability
        prediction = torch.round(output).item()  # Round to 0 or 1

    avg_time_ms = sum(times) / num_runs  # Compute average inference time in ms
    return probability, prediction, avg_time_ms

# Perform inference with time measurement in milliseconds
# One (display name, graph) pair per sample, reported in this order.
labelled_samples = [
    ("clear_area.c", clear_area_data),
    ("cwe_119_1.c", vuln1_data),
    ("cwe_119_2.c", vuln2_data),
    ("cwe_119_3.c", vuln3_data),
    ("cwe_119_4.c", vuln4_data),
]

for sample_name, sample_data in labelled_samples:
    probability, prediction, avg_time_ms = predict_vulnerability(sample_data)
    verdict = 'Vulnerable' if prediction == 1 else 'Safe'
    print(f"{sample_name} | Probability: {probability:.4f} | Prediction: {verdict} | Avg. Time: {avg_time_ms:.4f} ms")

In [None]:
import time

def measure_avg_time(data_samples, num_runs=10):
    """Measure the average wall-clock time of one prediction call.

    Each timed iteration performs exactly one model inference
    (``predict_vulnerability(..., num_runs=1)``), so the result is a per-inference
    figure rather than the time of a whole batch of repeated inferences. The
    timing includes the overhead of the call itself, not only the forward pass.

    Args:
        data_samples: Graph passed unchanged to ``predict_vulnerability``.
        num_runs: Number of timed iterations (at least 1).

    Returns:
        Mean duration of one call in milliseconds.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    total_time = 0.0
    for _ in range(num_runs):
        start = time.perf_counter()
        predict_vulnerability(data_samples, num_runs=1)  # We ignore output for timing
        end = time.perf_counter()
        total_time += (end - start) * 1000  # Convert to milliseconds
    return total_time / num_runs

# Time the non-vulnerable sample first, then the first CWE-119 sample.
avg_time_clear, avg_time_vuln = (
    measure_avg_time(sample) for sample in (clear_area_data, vuln1_data)
)

# Mean of the two per-sample averages.
overall_avg_time = (avg_time_clear + avg_time_vuln) / 2

print(f"clear_area.c - Avg Time: {avg_time_clear:.3f} ms (10 runs)")
print(f"cwe_119_1.c - Avg Time: {avg_time_vuln:.3f} ms (10 runs)")
print(f"\nOverall Average Inference Time: {overall_avg_time:.3f} ms")