In [41]:
from google.colab import drive
drive.mount('/content/drive/')


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Step 1: Install and Import Libraries


In [42]:
!pip install transformers

import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch




# **Step2: Load SQL Injection and Other Vulnerabilities Datasets**

In [43]:
import pandas as pd
from sklearn.utils import shuffle

# Load the SQL Injection dataset
sql_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Train/sql_injection2.0.csv')
# Load the Other Vulnerabilities dataset (for XSS, CSRF, RCE)
other_vuln_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Train/Others_vuln2.0.csv')

# Shuffle the datasets
sql_data_shuffled = shuffle(sql_data)
other_vuln_data_shuffled = shuffle(other_vuln_data)

# Verify the datasets
print("SQL Injection Dataset:")
print(sql_data_shuffled.head())

print("\nOther Vulnerabilities Dataset:")
print(other_vuln_data_shuffled.head())

# Print dataset columns
print("\nSQL Injection Dataset Columns:", sql_data_shuffled.columns)
print("Other Vulnerabilities Dataset Columns:", other_vuln_data_shuffled.columns)


SQL Injection Dataset:
      ID                                          SQL Query Injection Payload  \
521  522         UPDATE accounts SET balance=500 WHERE id=1               NaN   
737  738  DELETE FROM orders WHERE order_id = john.doe@e...               NaN   
740  741         UPDATE accounts SET balance=500 WHERE id=2               NaN   
660  661     UPDATE accounts SET balance=500 WHERE id=apple               NaN   
411  412  SELECT email FROM customers WHERE email = '' O...           1=1;--'   

    Injection Type Vulnerability Status     Injection Result Attack Type  
521            NaN                   No                  NaN      Normal  
737            NaN                   No                  NaN      Normal  
740            NaN                   No                  NaN      Normal  
660            NaN                   No                  NaN      Normal  
411    Union-based                  Yes  Unauthorized access      Attack  

Other Vulnerabilities Dataset:
     Unn

# Step 3: Tokenize Both Datasets Using CodeBERT

In [44]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')

# Function to tokenize data from multiple columns and combine input_ids
def tokenize_data(dataframe, text_columns):
    # Assuming concatenation of texts from different columns for tokenization
    concatenated_texts = dataframe[text_columns].apply(lambda x: ' '.join(x.dropna().values), axis=1)
    return tokenizer(concatenated_texts.tolist(), padding="max_length", truncation=True, max_length=512, return_tensors='pt')

# Columns to tokenize - choose how you concatenate based on your model's capacity to handle input
sql_columns = ['SQL Query', 'Vulnerability Status']
other_vuln_columns = ['Attack Vector', 'Vulnerability Status']

# Tokenize data
sql_encoded = tokenize_data(sql_data, sql_columns)
other_vuln_encoded = tokenize_data(other_vuln_data, other_vuln_columns)

# Printing to verify the outputs
print("SQL Encoded Data:")
print(f"Input IDs: {sql_encoded['input_ids'].shape}")
print(f"Attention Masks: {sql_encoded['attention_mask'].shape}")

print("\nOther Vulnerabilities Encoded Data:")
print(f"Input IDs: {other_vuln_encoded['input_ids'].shape}")
print(f"Attention Masks: {other_vuln_encoded['attention_mask'].shape}")



SQL Encoded Data:
Input IDs: torch.Size([1000, 512])
Attention Masks: torch.Size([1000, 512])

Other Vulnerabilities Encoded Data:
Input IDs: torch.Size([738, 512])
Attention Masks: torch.Size([738, 512])


# Step 5: Create a PyTorch Dataset Class and Split Both Datasets for Training and Validation
This class converts the tokenized data into a format compatible with PyTorch’s Dataset.

In [45]:
import torch
from torch.utils.data import Dataset, DataLoader, ConcatDataset, random_split

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}  # Correct way to handle tensor copying
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

class CodeDatasetOther(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = [float(label) for label in labels]

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

# Assuming sql_encoded and other_vuln_encoded are already prepared using your tokenization function
sql_labels = [1 if x == 'Yes' else 0 for x in sql_data['Vulnerability Status']]
other_labels = [1 if x == 'Yes' else 0 for x in other_vuln_data['Vulnerability Status']]

# Create dataset instances
sql_dataset = CustomDataset(sql_encoded, sql_labels)
other_vuln_dataset = CodeDatasetOther(other_vuln_encoded, other_labels)

# Combine the two datasets
combined_dataset = ConcatDataset([sql_dataset, other_vuln_dataset])

# Define the split sizes
train_size = int(0.8 * len(combined_dataset))  # 80% for training
val_size = len(combined_dataset) - train_size  # Remaining 20% for validation

# Split the combined dataset into training and validation sets
train_dataset, val_dataset = random_split(combined_dataset, [train_size, val_size])

# Create DataLoader instances for the training and validation sets
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Example loop to illustrate how to handle data loader outputs
for batch in train_loader:
    print(batch)  # This should print the batches without warnings
    break

# asasxasxasxxasxa

{'input_ids': tensor([[    0, 49179,  1009,  ...,     1,     1,     1],
        [    0, 41552, 32761,  ...,     1,     1,     1],
        [    0, 49179,  1047,  ...,     1,     1,     1],
        ...,
        [    0, 10089,  3850,  ...,     1,     1,     1],
        [    0, 47060,  2069,  ...,     1,     1,     1],
        [    0, 34543,  2349,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1.])}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [None]:
# # Assuming sql_dataset and other_vuln_dataset are your datasets prepared from previous steps
# train_sql, val_sql = train_test_split(sql_dataset, test_size=0.2, random_state=42)
# train_other, val_other = train_test_split(other_vuln_dataset , test_size=0.2, random_state=42)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [46]:

import pandas as pd
sql_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Train/sql_injection2.0.csv')
other_vuln_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Train/Others_vuln2.0.csv')
vulnerability_counts = other_vuln_data['Attack Type'].value_counts()
print("Vulnerability Type Counts:\n", vulnerability_counts)

Vulnerability Type Counts:
 Attack Type
CSRF    273
XSS     233
RCE     232
Name: count, dtype: int64


# Step 6: Initialize CodeBERT Models for Both Datasets****

In [None]:
# # Define model here
# model = RobertaForSequenceClassification.from_pretrained('microsoft/codebert-base', num_labels=2)

# Train

In [48]:
!pip install seqeval



# IoU Calculation Function

# Compute_metrics Function

In [49]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import numpy as np

def compute_metrics(p):
    predictions, labels = p

    # Get probabilities for class 1
    predictions = predictions[:, 1] # Use probabilities for positive class

    # Convert labels to floats
    labels = labels.astype(float)

    # Calculate precision, recall, and F1-score
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions.round(), average='binary')

    # Calculate AUC-ROC
    auc_roc = roc_auc_score(labels, predictions)

    return {
        "accuracy": accuracy_score(labels, predictions.round()),
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "auc_roc": auc_roc
    }

In [51]:
from transformers import Trainer, TrainingArguments
from transformers import RobertaForSequenceClassification

# Define model here for binary classification
model = RobertaForSequenceClassification.from_pretrained('microsoft/codebert-base', num_labels=1)  # Change to 1 for binary

# Setup training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
)

# Initialize the Trainer for the combined model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Use the combined training dataset
    eval_dataset=val_dataset,      # Use the combined validation dataset
    compute_metrics=compute_metrics  # Using the revised compute_metrics function
)

# Start training
trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss


RuntimeError: Found dtype Long but expected Float

In [None]:
# # Train both models
# trainer_.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc Roc
1,0.0066,0.001594,1.0,1.0,1.0,1.0,1.0
2,0.0006,0.000222,1.0,1.0,1.0,1.0,1.0
3,0.0002,8.7e-05,1.0,1.0,1.0,1.0,1.0


TrainOutput(global_step=339, training_loss=0.14586889224640673, metrics={'train_runtime': 269.4661, 'train_samples_per_second': 10.02, 'train_steps_per_second': 1.258, 'total_flos': 710399849472000.0, 'train_loss': 0.14586889224640673, 'epoch': 3.0})

In [None]:
# trainer_other.train()

ValueError: Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 2]))

RESULT of Train

RESULT of Train

In [None]:
model_path_sql = './model_sql'
model_path_other = './model_other'
trainer_sql.model.save_pretrained(model_path_sql)
trainer_other.model.save_pretrained(model_path_other)

# Build a Scraping

Predicting the the vulnerbility

In [None]:
!pip install transformers # Ensure transformers is installed.
from transformers import RobertaTokenizer, RobertaForSequenceClassification # Import the RobertaTokenizer and RobertaForSequenceClassification classes
import requests
from bs4 import BeautifulSoup
import re
import torch # Import torch

# Load tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')
sql_model = RobertaForSequenceClassification.from_pretrained('/content/model_sql')
other_vuln_model = RobertaForSequenceClassification.from_pretrained('/content/model_sql')


def scrape_webpage(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    html_content = str(soup)
    script_content = [script.text for script in soup.find_all('script') if script.text]
    hidden_inputs = soup.find_all('input', {'type': 'hidden'})
    data_attributes = [tag for tag in soup.find_all(attrs=re.compile(r'^data-'))]
    sql_candidates = []

    for input_tag in hidden_inputs:
        if 'sql' in input_tag.get('value', '').lower():
            sql_candidates.append(input_tag['value'])

    for data_tag in data_attributes:
        for attribute, value in data_tag.attrs.items():
            if 'sql' in attribute.lower() and isinstance(value, str):
                sql_candidates.append(value)

    return html_content, script_content, sql_candidates


def suggest_fixes(sql_prediction, other_vuln_prediction, script_content):
  """Provides code suggestions based on detected vulnerabilities."""
  suggestions = []
  if sql_prediction == 1:
    suggestions.append(
        "**SQL Injection Detected:**\n"
        "**Suggestion:** Sanitize user inputs before using them in SQL queries.\n"
        "**Example:** Use parameterized queries or escape special characters."
    )

  if other_vuln_prediction == 1:
    suggestions.append(
        "**Other Vulnerability (XSS, CSRF, RCE) Detected:**\n"
        "**Suggestion:** Investigate the script_content for potential vulnerable code.\n"
        "**Possible fixes:**\n"
        " - **XSS:** Encode user input before displaying it.\n"
        " - **CSRF:** Use anti-CSRF tokens.\n"
        " - **RCE:** Validate user input to prevent code execution."
    )


  if not suggestions:
    suggestions.append("No vulnerabilities detected.")

  return suggestions


def predict_and_suggest(url):
    """Predicts vulnerabilities and suggests fixes for a given URL."""
    html_content, script_content, sql_candidates = scrape_webpage(url) # scrape_webpage is now defined
    encoded_input = tokenizer(script_content, return_tensors="pt", truncation=True, padding=True)
    sql_outputs = sql_model(**encoded_input)
    other_vuln_outputs = other_vuln_model(**encoded_input)
    sql_predictions = torch.argmax(sql_outputs.logits, dim=-1)
    other_vuln_predictions = torch.argmax(other_vuln_outputs.logits, dim=-1)

    suggestions = suggest_fixes(sql_predictions.item(), other_vuln_predictions.item(), script_content)

    return suggestions


# Example Usage
url = "https://www.facebook.com/"  # Replace with the target URL
suggestions = predict_and_suggest(url)

for suggestion in suggestions:
  print(suggestion)

In [None]:
!pip install transformers # Ensure transformers is installed.

In [None]:
!pip install transformers # Ensure transformers is installed.
from transformers import RobertaTokenizer, RobertaForSequenceClassification # Import the RobertaTokenizer and RobertaForSequenceClassification classes

# Load tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')
sql_model = RobertaForSequenceClassification.from_pretrained('/content/model_sql')
other_vuln_model = RobertaForSequenceClassification.from_pretrained('/content/model_sql')


def suggest_fixes(sql_prediction, other_vuln_prediction, script_content):
  """Provides code suggestions based on detected vulnerabilities."""
  suggestions = []
  if sql_prediction == 1:
    suggestions.append(
        "**SQL Injection Detected:**\n"
        "**Suggestion:** Sanitize user inputs before using them in SQL queries.\n"
        "**Example:** Use parameterized queries or escape special characters."
    )

  if other_vuln_prediction == 1:
    suggestions.append(
        "**Other Vulnerability (XSS, CSRF, RCE) Detected:**\n"
        "**Suggestion:** Investigate the script_content for potential vulnerable code.\n"
        "**Possible fixes:**\n"
        " - **XSS:** Encode user input before displaying it.\n"
        " - **CSRF:** Use anti-CSRF tokens.\n"
        " - **RCE:** Validate user input to prevent code execution."
    )


  if not suggestions:
    suggestions.append("No vulnerabilities detected.")

  return suggestions


def predict_and_suggest(url):
    """Predicts vulnerabilities and suggests fixes for a given URL."""
    html_content, script_content, sql_candidates = scrape_webpage(url)
    encoded_input = tokenizer(script_content, return_tensors="pt", truncation=True, padding=True)
    sql_outputs = sql_model(**encoded_input)
    other_vuln_outputs = other_vuln_model(**encoded_input)
    sql_predictions = torch.argmax(sql_outputs.logits, dim=-1)
    other_vuln_predictions = torch.argmax(other_vuln_outputs.logits, dim=-1)

    suggestions = suggest_fixes(sql_predictions.item(), other_vuln_predictions.item(), script_content)

    return suggestions


# Example Usage
url = "https://www.facebook.com/"  # Replace with the target URL
suggestions = predict_and_suggest(url)

for suggestion in suggestions:
  print(suggestion)