In [None]:
import pandas as pd
import numpy as np
import re

# Load the raw XSS dataset.
# NOTE(review): the printed head() shows an 'Unnamed: 0' column, presumably a
# previously saved index; consider `index_col=0` if it is not needed downstream.
data = pd.read_csv('/content/XSS_dataset.csv')

# Report the dataset size and class balance before any cleaning.
print(f"Number of rows before processing: {data.shape[0]}")

label_counts_before = data['Label'].value_counts()
print(f"Malicious (1) before processing: {label_counts_before.get(1, 0)}")
print(f"Benign (0) before processing: {label_counts_before.get(0, 0)}")

# Data Preprocessing
# Exact duplicates: rows repeating an earlier (Sentence, Label) pair.
duplicates = data.duplicated(subset=['Sentence', 'Label']).sum()
print(f"Number of duplicate rows: {duplicates}")

# Keep only the first occurrence of each (Sentence, Label) pair.
data.drop_duplicates(subset=['Sentence', 'Label'], inplace=True)

# After the step above, a Sentence that still appears more than once must
# carry different labels. Count distinct sentences (queries) rather than rows,
# so one conflicting query is not reported once per label.
conflict_mask = data.duplicated(subset=['Sentence'], keep=False)
conflicting_labels = data.loc[conflict_mask, 'Sentence'].nunique(dropna=False)
print(f"Number of queries with conflicting labels: {conflicting_labels}")

# Drop every row of a conflicting query, since its true label is ambiguous.
data.drop_duplicates(subset=['Sentence'], keep=False, inplace=True)

# Count missing cells across all columns, then drop any row containing one.
null_values = data.isnull().sum().sum()
print(f"Number of null values: {null_values}")
data.dropna(inplace=True)

# Report the dataset size and class balance after cleaning.
print(f"Number of rows after processing: {data.shape[0]}")

label_counts_after = data['Label'].value_counts()
print(f"Malicious (1) after processing: {label_counts_after.get(1, 0)}")
print(f"Benign (0) after processing: {label_counts_after.get(0, 0)}")

# Feature Engineering

# Length of a payload, in characters.
data['Length'] = data['Sentence'].apply(len)

# Number of HTML tags in a payload
# Compiled once; DOTALL lets '.' cross newlines so a tag whose attributes are
# split over several lines is still counted as one tag.
_TAG_PATTERN = re.compile(r"<.*?>", re.DOTALL)


def count_tags(x):
    """Return the number of non-overlapping ``<...>`` spans found in ``x``.

    Each span runs from a ``<`` to the nearest following ``>`` (non-greedy),
    including across line breaks.
    """
    return len(_TAG_PATTERN.findall(x))
# Evaluate the tag counter on every payload and keep it as a feature column.
data['Tag_Count'] = data['Sentence'].map(count_tags)

# Number of special characters in a payload
def count_special_chars(x):
    """Return how many characters of ``x`` are ``<``, ``>``, ``"`` or ``/``."""
    hits = re.finditer(r'[<>"/]', x)
    return sum(1 for _ in hits)
# Evaluate the special-character counter on every payload as a feature column.
data['Special_Char_Count'] = data['Sentence'].map(count_special_chars)

# Number of JavaScript keywords in a payload
def count_js_keywords(x):
    """Return how many of the tracked JavaScript keywords occur in ``x``.

    Matching is case-insensitive substring matching. Each keyword contributes
    at most 1, so the result is the number of distinct keywords present, not
    the number of occurrences.
    """
    js_keywords = ('script', 'alert', 'onload', 'onmouseover')
    # Lower-case once instead of once per keyword.
    lowered = x.lower()
    return sum(keyword in lowered for keyword in js_keywords)
# Evaluate the JavaScript-keyword counter on every payload as a feature column.
data['JS_Keyword_Count'] = data['Sentence'].map(count_js_keywords)

# Write the feature-enriched dataset to disk, omitting the DataFrame index.
data.to_csv("/content/XSS_enhanced_dataset.csv", index=False)

# Preview the leading rows of the result.
print(data.head())


Number of rows before processing: 13686
Malicious (1) before processing: 7373
Benign (0) before processing: 6313
Number of duplicate rows: 2769
Number of queries with conflicting labels: 0
Number of null values: 0
Number of rows after processing: 10917
Malicious (1) after processing: 7323
Benign (0) after processing: 3594
   Unnamed: 0                                           Sentence  Label  \
0           0  <li><a href="/wiki/File:Socrates.png" class="i...      0   
1           1               <tt onmouseover="alert(1)">test</tt>      1   
2           2  \t </span> <span class="reference-text">Steeri...      0   
3           3  \t </span> <span class="reference-text"><cite ...      0   
4           4  \t </span>. <a href="/wiki/Digital_object_iden...      0   

   Length  Tag_Count  Special_Char_Count  JS_Keyword_Count  
0     557          8                  78                 0  
1      36          2                   7                 2  
2     233          4                  24  