# NEW: BLSTM-based XSS detection (training and evaluation)

In [2]:
from __future__ import print_function

import warnings
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Bidirectional, LeakyReLU
from keras.optimizers import Adamax
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
import os
import pickle

# Silence library warnings for the remainder of the session.
warnings.filterwarnings("ignore")

# Working directory; the dataset and sample-file paths below are built from it.
current_path = os.getcwd()
print(current_path)

/tf-knugs/werrayuth/VulScanner


In [3]:
# Load dataset
# df = pd.read_csv('/work/JS/cwe-79.csv', encoding='UTF-8', usecols=['code', 'label'])

In [10]:
# Read the XSS dataset that lives under the current working directory.
dataset_path = current_path + '/Datasets/XSS_dataset.csv'
df = pd.read_csv(dataset_path)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13686 entries, 0 to 13685
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  13686 non-null  int64 
 1   Sentence    13686 non-null  object
 2   Label       13686 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 320.9+ KB


In [12]:
# Clean dataset
# Remove the saved index column ('Unnamed: 0', the first column of the CSV).
# It is dropped by name, and a missing column is ignored, so re-running this
# cell is a no-op instead of discarding one more column on every execution.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head(5)

Unnamed: 0,Sentence,Label
0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,"\t </span> <span class=""reference-text"">Steeri...",0
3,"\t </span> <span class=""reference-text""><cite ...",0
4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0


In [13]:
# Optionally keep only the first 2000 records (disabled)
# df = df.head(2000)

In [14]:
# Rename columns
# Map by name rather than by position: the cell is then idempotent and does
# not depend on how many columns the frame has when it is (re-)executed.
df = df.rename(columns={'Sentence': 'code', 'Label': 'label'})
df.head(5)

Unnamed: 0,code,label
0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,"\t </span> <span class=""reference-text"">Steeri...",0
3,"\t </span> <span class=""reference-text""><cite ...",0
4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0


In [15]:
class BLSTM:
    """Bidirectional-LSTM classifier for labelled code snippets.

    Constructing an instance immediately preprocesses ``data`` (label
    encoding, tokenisation, padding, one-hot targets, 80/20 train/test split)
    and then builds and compiles the network.

    Parameters:
    data (pandas.DataFrame): Must provide a 'code' column (text) and a
        'label' column. The frame is copied; the caller's object is not
        modified.
    name (str): Prefix of the files written by ``train()``
        (``<name>.weights.h5`` and ``<name>_model.h5``).
    batch_size (int): Batch size passed to ``model.fit``.
    epochs (int): Number of epochs passed to ``model.fit``.

    Attributes set during preprocessing:
    label_encoder, tokenizer: the fitted preprocessing objects (also pickled
        to 'label_encoder.pkl' / 'tokenizer.pkl' in the working directory).
    max_sequence_length (int): padded sequence length, i.e. ``X.shape[1]``;
        inference code must pad to this same length.
    num_classes, X, y, X_train, X_test, y_train, y_test.
    """

    def __init__(self, data, name="", batch_size=64, epochs=4):
        # Keep a private copy: _preprocess_data() overwrites the 'label'
        # column, and that must not leak into the caller's DataFrame.
        self.data = data.copy()
        self.name = name
        self.batch_size = batch_size
        self.epochs = epochs

        # Preprocess data
        self._preprocess_data()

        # Build and compile model
        self.model = self._build_model()

    def _preprocess_data(self):
        """Encode labels, tokenise and pad the code, split, and persist the fitted objects."""
        # Preprocess labels
        self.label_encoder = LabelEncoder()
        self.data['label'] = self.label_encoder.fit_transform(self.data['label'])
        self.num_classes = len(self.data['label'].unique())  # Number of classes

        # Tokenize code snippets
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(self.data['code'])
        sequences = self.tokenizer.texts_to_sequences(self.data['code'])
        self.X = pad_sequences(sequences)

        # Remember the padded length so callers do not have to hard-code it
        # when they later pad inputs for prediction.
        self.max_sequence_length = self.X.shape[1]

        # Convert labels to one-hot encoding
        self.y = np.eye(self.num_classes)[self.data['label'].values]

        # Split dataset
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42
        )

        # Save tokenizer and label encoder
        with open('tokenizer.pkl', 'wb') as file:
            pickle.dump(self.tokenizer, file)
        with open('label_encoder.pkl', 'wb') as file:
            pickle.dump(self.label_encoder, file)

        # Check data shapes
        print(f'Vocabulary size: {len(self.tokenizer.word_index) + 1}')
        print('Padded sequences shape:', self.X.shape)
        print(f'Number of classes: {self.num_classes}')
        print(f'X_train shape: {self.X_train.shape}')
        print(f'X_test shape: {self.X_test.shape}')
        print(f'y_train shape: {self.y_train.shape}')
        print(f'y_test shape: {self.y_test.shape}')

    def _build_model(self):
        """Return the compiled network: BiLSTM -> LSTM -> Dense/LeakyReLU -> softmax."""
        model = Sequential()
        # Each padded token id is treated as a single feature per timestep.
        model.add(Bidirectional(LSTM(300, return_sequences=True), input_shape=(self.X_train.shape[1], 1)))
        model.add(Dropout(0.5))
        model.add(LSTM(300))
        model.add(Dropout(0.5))
        model.add(Dense(300))
        model.add(LeakyReLU())
        model.add(Dense(self.num_classes, activation='softmax'))

        model.compile(optimizer=Adamax(learning_rate=0.002),
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        return model

    def train(self):
        """Fit on the training split, then save the weights and the full model."""
        # Fit model
        self.model.fit(
            self.X_train, self.y_train,
            batch_size=self.batch_size,
            epochs=self.epochs
        )
        # Both files are only written once fit() has completed.
        self.model.save_weights(self.name + ".weights.h5")
        self.model.save(self.name + "_model.h5")

    def test(self):
        """Reload the saved weights and report metrics on the test split."""
        # Load model weights
        self.model.load_weights(self.name + ".weights.h5")
        results = self.model.evaluate(self.X_test, self.y_test)
        print("Test loss:", results[0])
        print("Test accuracy:", results[1])

        # Predict and evaluate
        predictions = self.model.predict(self.X_test)
        y_pred = np.argmax(predictions, axis=1)
        y_true = np.argmax(self.y_test, axis=1)

        print('Confusion Matrix:\n', confusion_matrix(y_true, y_pred))
        print('Classification Report:\n', classification_report(y_true, y_pred))

# Example usage:
# df = pd.read_csv('your_dataset.csv')  # Load your dataset
# blstm = BLSTM(df, name="test_blstm_model")
# blstm.train()
# blstm.test()

In [16]:
# Build the classifier from the prepared frame and train it; the files
# written by train() use the "test_blstm" prefix.
blstm = BLSTM(
    df,
    name="test_blstm",
    batch_size=16,
    epochs=20,
)
blstm.train()

Vocabulary size: 9240
Padded sequences shape: (13686, 838)
Number of classes: 2
X_train shape: (10948, 838)
X_test shape: (2738, 838)
y_train shape: (10948, 2)
y_test shape: (2738, 2)


I0000 00:00:1728439924.449777   13537 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1728439924.785335   13537 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1728439924.785379   13537 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1728439924.812312   13537 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1728439924.812378   13537 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

Epoch 1/20


2024-10-09 02:12:13.048031: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m685/685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 171ms/step - accuracy: 0.9249 - loss: 0.1721
Epoch 2/20
[1m685/685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 172ms/step - accuracy: 0.9872 - loss: 0.0378
Epoch 3/20
[1m 70/685[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1:50[0m 180ms/step - accuracy: 0.9853 - loss: 0.0329

KeyboardInterrupt: 

In [11]:
 # Evaluate the model on the held-out test split.
 blstm.test()

[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 247ms/step - accuracy: 0.9791 - loss: 0.0490
Test loss: 0.04779582843184471
Test accuracy: 0.9799123406410217
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 213ms/step
Confusion Matrix:
 [[1254    6]
 [  49 1429]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1260
           1       1.00      0.97      0.98      1478

    accuracy                           0.98      2738
   macro avg       0.98      0.98      0.98      2738
weighted avg       0.98      0.98      0.98      2738



# Cleaner

In [None]:
from bs4 import BeautifulSoup
import re

def clean_html_content(html_content):
    """
    Strip markup from an HTML document and return its textual content.

    The text of every <script> element is collected first, followed by the
    text that BeautifulSoup extracts from the whole document. The pieces are
    joined with single spaces and all runs of whitespace are collapsed.

    Parameters:
    html_content (str): The raw HTML content as a string.

    Returns:
    str: Script text followed by document text, whitespace-normalized.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')

    # Script bodies come first, in document order.
    pieces = []
    for script_tag in parsed.find_all('script'):
        pieces.append(script_tag.get_text())

    # Then the text of the document as a whole.
    pieces.append(parsed.get_text())

    # Collapse every whitespace run to one space and trim the ends.
    return re.sub(r'\s+', ' ', ' '.join(pieces)).strip()

# Predict

In [None]:
class VulnerabilityPredictor:
    """Classify source files with a previously trained BLSTM.

    Rebuilds the network architecture, loads trained weights into it, and
    loads the pickled tokenizer and label encoder used to turn a file into
    model input and a class index back into a label.

    Parameters:
    model_architecture (dict): ``{'input_shape': (timesteps, features),
        'num_classes': int}``.
    weights_path (str): Weights file to load into the rebuilt model.
    tokenizer_path (str): Pickled tokenizer file.
    label_encoder_path (str): Pickled label encoder file.
    max_sequence_length (int): Length inputs are padded/truncated to.
    """

    def __init__(self, model_architecture, weights_path, tokenizer_path, label_encoder_path, max_sequence_length):
        # Create the model with the specified architecture
        self.model = self._create_model(model_architecture)

        # Build model by calling it with a dummy input
        dummy_input = np.zeros((1, model_architecture['input_shape'][0], model_architecture['input_shape'][1]))
        self.model(dummy_input)

        # Load model weights
        self.model.load_weights(weights_path)

        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # these paths at files produced by your own training run.
        # Load the tokenizer
        with open(tokenizer_path, 'rb') as file:
            self.tokenizer = pickle.load(file)

        # Load the label encoder
        with open(label_encoder_path, 'rb') as file:
            self.label_encoder = pickle.load(file)

        self.max_sequence_length = max_sequence_length

    def _create_model(self, architecture):
        """Rebuild the training-time layer stack so saved weights can be loaded."""
        model = Sequential()
        # input_shape belongs on the Bidirectional wrapper (the first layer of
        # the Sequential model), matching BLSTM._build_model.
        model.add(Bidirectional(LSTM(300, return_sequences=True), input_shape=architecture['input_shape']))
        model.add(Dropout(0.5))
        model.add(LSTM(300))
        model.add(Dropout(0.5))
        model.add(Dense(300))
        model.add(LeakyReLU())
        model.add(Dense(architecture['num_classes'], activation='softmax'))
        model.compile(optimizer='adamax', loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    def preprocess_source_code(self, file_path, clean_html=True):
        """Read a file and convert it into a padded token sequence.

        Parameters:
        file_path (str): Path of the source file to read.
        clean_html (bool): When True (default), run the text through
            clean_html_content() before tokenising. Pass False to tokenise
            the raw file content.
            NOTE(review): the training cells in this notebook tokenise raw
            HTML snippets (tags and attributes included), so stripping the
            markup here may not match what the model was trained on.

        Returns:
        numpy.ndarray: Array of shape (1, max_sequence_length).
        """
        # Read the source code file. Decode explicitly so the result does not
        # depend on the platform's default encoding; undecodable bytes are
        # replaced rather than aborting the prediction.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            code = file.read()

        if clean_html:
            # Clean the HTML content
            code = clean_html_content(code)
            print(f"Cleaned code: {code}")

        # Tokenize and pad the code
        sequences = self.tokenizer.texts_to_sequences([code])
        padded_sequences = pad_sequences(sequences, maxlen=self.max_sequence_length)

        return padded_sequences

    def predict(self, file_path, clean_html=True):
        """Return the predicted label for the file at ``file_path``.

        Parameters:
        file_path (str): Path of the source file to classify.
        clean_html (bool): Forwarded to preprocess_source_code().

        Returns:
        The label obtained by inverse-transforming the highest-scoring class
        index with the loaded label encoder.
        """
        # Preprocess source code file
        padded_sequences = self.preprocess_source_code(file_path, clean_html=clean_html)

        # Predict vulnerabilities
        prediction = self.model.predict(padded_sequences)
        print(f"Raw prediction scores: {prediction}")

        predicted_class = np.argmax(prediction, axis=1)[0]

        # Map class index to label
        predicted_label = self.label_encoder.inverse_transform([predicted_class])[0]

        return predicted_label

# Example usage:
# These settings must agree with the training run in this notebook:
# BLSTM(df, name="test_blstm", ...) reported padded sequences of length 838
# and 2 classes, and BLSTM.train() writes its weights to name + ".weights.h5".
max_sequence_length = 838  # Padded sequence length used during training
model_architecture = {
    'input_shape': (max_sequence_length, 1),  # (timesteps, features)
    'num_classes': 2                          # Number of classes
}
weights_path = "test_blstm.weights.h5"  # Weights file written by BLSTM.train()
tokenizer_path = "tokenizer.pkl"  # Path to your tokenizer file
label_encoder_path = "label_encoder.pkl"  # Path to your label encoder file

predictor = VulnerabilityPredictor(model_architecture, weights_path, tokenizer_path, label_encoder_path, max_sequence_length)

# Predict vulnerabilities in a source code file
file_path = current_path +  "/SourceCode/vuln.html"
prediction = predictor.predict(file_path)
print("Predicted vulnerability:", prediction)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f5982bd9-4186-4df6-a44f-a0eeec8cd72d' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>