In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dropout, BatchNormalization, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [2]:
#get labels
data = pd.read_csv("Datasets/Normalized_CWE-469.csv")
#get embeddings 
x= pd.read_csv("graph2vec/features/embledding_cwe_469.csv")

In [4]:
data["vuln"] = data["vuln"].astype(int)

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,testID,filename,code,vuln,type
0,0,0,cwe469_0.c,"gretl_list_build (const char *s, const DATASET...",0,CWE-469
1,1,1,cwe469_1.c,rd_meta_is_broken(FILE *fp)\n{\n char buf[M...,1,CWE-469
2,2,2,cwe469_2.c,"load( f_ck_query query_func, t_CKBOOL lazy )\n...",0,CWE-469
3,3,3,cwe469_3.c,checkSupGroups (LDAP * ld)\n{\n LDAPMessage *...,1,CWE-469
4,4,4,cwe469_4.c,"dht_getxattr_unwind (call_frame_t *frame,\n ...",0,CWE-469


In [6]:
indices = x['type'].values
y = data.iloc[:, 4]
y_labels = y.iloc[indices]
y_labels.head()

0       0
1       1
10      0
100     0
1000    1
Name: vuln, dtype: int64

In [7]:
x = x.drop(columns="type")
x.head()

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_118,x_119,x_120,x_121,x_122,x_123,x_124,x_125,x_126,x_127
0,-0.08622,-0.183489,-0.311828,0.210561,-0.011794,0.046493,-0.247115,0.002444,-0.054876,0.163939,...,0.136654,0.208798,0.11881,0.194054,0.098417,-0.000896,-0.060261,0.012516,0.089105,-0.105244
1,0.050206,-0.533095,-0.311225,0.058918,-0.018652,-0.024216,-0.350888,-0.090321,0.005125,0.20089,...,0.236584,0.052393,-0.022136,0.186589,0.007961,-0.023393,0.154599,-0.031407,-0.102472,-0.18172
2,-0.02395,-0.167754,-0.20061,0.1418,-0.017827,-0.013925,-0.161392,-0.095699,0.004724,0.130106,...,0.096194,0.076208,0.018431,0.087224,0.073983,-0.024665,-0.001303,-0.029866,0.06274,-0.076197
3,-0.008526,-0.158457,-0.380494,0.172831,0.048501,-0.012732,-0.286329,-0.010031,-0.268125,0.163671,...,0.191623,0.287033,-0.020448,0.148487,0.04043,-0.068902,-0.027892,0.025348,0.106796,-0.069266
4,0.005788,-0.437232,-0.307672,0.062768,-0.005194,-0.015667,-0.310069,-0.05755,-0.023266,0.175896,...,0.232327,0.029678,-0.008981,0.134089,0.028003,-0.011975,0.156055,-0.036535,-0.023201,-0.182217


In [8]:
dataset = pd.concat([x, y_labels], axis=1)

In [9]:
dataset.head()

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_119,x_120,x_121,x_122,x_123,x_124,x_125,x_126,x_127,vuln
0,-0.08622,-0.183489,-0.311828,0.210561,-0.011794,0.046493,-0.247115,0.002444,-0.054876,0.163939,...,0.208798,0.11881,0.194054,0.098417,-0.000896,-0.060261,0.012516,0.089105,-0.105244,0
1,0.050206,-0.533095,-0.311225,0.058918,-0.018652,-0.024216,-0.350888,-0.090321,0.005125,0.20089,...,0.052393,-0.022136,0.186589,0.007961,-0.023393,0.154599,-0.031407,-0.102472,-0.18172,1
2,-0.02395,-0.167754,-0.20061,0.1418,-0.017827,-0.013925,-0.161392,-0.095699,0.004724,0.130106,...,0.076208,0.018431,0.087224,0.073983,-0.024665,-0.001303,-0.029866,0.06274,-0.076197,0
3,-0.008526,-0.158457,-0.380494,0.172831,0.048501,-0.012732,-0.286329,-0.010031,-0.268125,0.163671,...,0.287033,-0.020448,0.148487,0.04043,-0.068902,-0.027892,0.025348,0.106796,-0.069266,1
4,0.005788,-0.437232,-0.307672,0.062768,-0.005194,-0.015667,-0.310069,-0.05755,-0.023266,0.175896,...,0.029678,-0.008981,0.134089,0.028003,-0.011975,0.156055,-0.036535,-0.023201,-0.182217,0


In [12]:
example = dataset.iloc[0]
print(example.vuln)

0.0


In [13]:
y = dataset['vuln']

In [14]:
# Splitting the dataset into the Training set and Test set
# Into 80% training and 10% testing and 10% validation
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=0)

In [16]:
print("X_train length: ", len(X_train))
print("X_test length: ", len(X_test))
print("X_val length: ", len(X_val))

X_train length:  4252
X_test length:  525
X_val length:  473


In [20]:
# Convert to tensor
X_train_tensors = torch.FloatTensor(X_train.values)
X_test_tensors = torch.FloatTensor(X_test.values)
X_val_tensors = torch.FloatTensor(X_val.values)

In [21]:
example = X_train_tensors[0]
print(example)

tensor([ 0.0962, -0.2087, -0.1513,  0.1960,  0.0422,  0.0698, -0.2391, -0.3496,
         0.0520,  0.1334, -0.2362,  0.1998,  0.1078, -0.3431,  0.0169, -0.0645,
         0.1341,  0.1785, -0.1672, -0.1491, -0.2454,  0.1709, -0.1057,  0.0595,
         0.0792, -0.0384, -0.3708, -0.1085,  0.0331, -0.3148, -0.1245,  0.1213,
        -0.0486, -0.0577,  0.2229,  0.1514,  0.0426, -0.0026, -0.0393, -0.2105,
         0.0204, -0.1162, -0.2300, -0.2593, -0.0649,  0.0251,  0.1276, -0.0036,
        -0.0604,  0.0435,  0.0796, -0.0301, -0.3028,  0.1963,  0.0399,  0.2024,
        -0.0506, -0.0663,  0.0906, -0.2888, -0.2172,  0.2024, -0.0976, -0.0941,
         0.3093,  0.2353, -0.1213,  0.0080, -0.1196, -0.2319, -0.1507,  0.0929,
        -0.2444, -0.2500, -0.1284, -0.0671, -0.1139,  0.0286, -0.2387, -0.0669,
        -0.2107,  0.0153, -0.1803,  0.1660, -0.0395, -0.0891,  0.1080,  0.0607,
         0.0361,  0.0348, -0.2703,  0.4720, -0.3695, -0.0428, -0.2394,  0.1188,
        -0.0222, -0.0077,  0.0362,  0.16

In [26]:
y_train_tensors = torch.tensor(y_train.values)
y_test_tensors = torch.tensor(y_test.values)
y_val_tensors = torch.tensor(y_val.values)

In [27]:
example = y_train_tensors[399]
print(example)

tensor(1)


In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Fix target tensor type
y_train_tensors = torch.tensor(y_train.values, dtype=torch.float32)
y_val_tensors = torch.tensor(y_val.values, dtype=torch.float32)
y_test_tensors = torch.tensor(y_test.values, dtype=torch.float32)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensors, y_train_tensors.unsqueeze(1))
val_dataset = TensorDataset(X_val_tensors, y_val_tensors.unsqueeze(1))
test_dataset = TensorDataset(X_test_tensors, y_test_tensors.unsqueeze(1))

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# **Improved MLP Model**
class MLP(nn.Module):
    def __init__(self, input_dim):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),  # Normalize input
            nn.ReLU(),
            nn.Dropout(0.3),  # Increase dropout
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()  # Output for binary classification
        )

    def forward(self, x):
        return self.model(x)

# Initialize Model
input_dim = X_train_tensors.shape[1]
mlp_model = MLP(input_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(mlp_model.parameters(), lr=0.001, weight_decay=1e-4)  # L2 Regularization

# **Training Function**
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        val_loss = evaluate_model(model, val_loader, criterion)
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}")

# **Evaluation Function**
def evaluate_model(model, loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            total_loss += loss.item()
    return total_loss / len(loader)

# **Train & Test Model**
train_model(mlp_model, train_loader, val_loader, criterion, optimizer, epochs=20)
test_loss = evaluate_model(mlp_model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f}")

Epoch [1/20], Loss: 0.6949, Val Loss: 0.6992
Epoch [2/20], Loss: 0.6926, Val Loss: 0.6976
Epoch [3/20], Loss: 0.6895, Val Loss: 0.6970
Epoch [4/20], Loss: 0.6882, Val Loss: 0.6986
Epoch [5/20], Loss: 0.6876, Val Loss: 0.6941
Epoch [6/20], Loss: 0.6858, Val Loss: 0.7051
Epoch [7/20], Loss: 0.6844, Val Loss: 0.7050
Epoch [8/20], Loss: 0.6813, Val Loss: 0.7042
Epoch [9/20], Loss: 0.6823, Val Loss: 0.6990
Epoch [10/20], Loss: 0.6791, Val Loss: 0.7076
Epoch [11/20], Loss: 0.6780, Val Loss: 0.7107
Epoch [12/20], Loss: 0.6758, Val Loss: 0.7070
Epoch [13/20], Loss: 0.6727, Val Loss: 0.7128
Epoch [14/20], Loss: 0.6724, Val Loss: 0.7147
Epoch [15/20], Loss: 0.6671, Val Loss: 0.7159
Epoch [16/20], Loss: 0.6740, Val Loss: 0.7131
Epoch [17/20], Loss: 0.6668, Val Loss: 0.7152
Epoch [18/20], Loss: 0.6657, Val Loss: 0.7082
Epoch [19/20], Loss: 0.6628, Val Loss: 0.7149
Epoch [20/20], Loss: 0.6631, Val Loss: 0.7181
Test Loss: 0.7241
