In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import seaborn as sns

In [25]:
df = pd.read_csv("XSS_dataset.csv")

In [26]:
df.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Label
0,0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,2,"\t </span> <span class=""reference-text"">Steeri...",0
3,3,"\t </span> <span class=""reference-text""><cite ...",0
4,4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0


In [27]:
df.shape

(13686, 3)

In [28]:
print(df['Label'].value_counts())


Label
1    7373
0    6313
Name: count, dtype: int64


In [29]:
print("Total rows:", len(df))
print("Unique rows:", len(df.drop_duplicates()))



Total rows: 13686
Unique rows: 13686


In [31]:
from collections import Counter

malicious_words = " ".join(df[df['Label']==1]['Sentence']).split()
safe_words = " ".join(df[df['Label']==0]['Sentence']).split()

print("Top tokens in malicious:", Counter(malicious_words).most_common(20))
print("Top tokens in safe:", Counter(safe_words).most_common(20))


Top tokens in malicious: [('id=x', 1420), ('tabindex=1', 1101), ('draggable="true"', 987), ('autofocus>', 575), ('<style>@keyframes', 570), ('<style>:target', 426), ('<div', 321), ('style="animation-name:x"', 285), ('contenteditable>drag', 282), ('contenteditable>drop', 282), ('style="transition:color', 282), ('{transform:', 144), ('style="transition:transform', 144), ('slidein', 143), ('style="animation-duration:1s;animation-name:slidein;animation-iteration-count:2"', 143), ('1', 143), ('2s"', 143), ('onfocusout=alert(1)', 142), ('x{from', 142), ('{left:0;}to', 142)]
Top tokens in safe: [('</a>', 3196), ('</li>', 2060), ('<a', 1608), ('</span>', 1322), ('<li><a', 1117), ('of', 1020), ('the', 966), ('and', 811), ('<span', 663), ('to', 644), ('<li', 604), ('</b>', 550), ('</a>,', 525), ('</sup>', 507), ('class="reference"><a', 474), ('class="external', 471), ('text"', 470), ('rel="nofollow"', 453), ('in', 448), ('a', 415)]


In [24]:
malicious = df[df['Label']==1]['Sentence'].head(10)
safe = df[df['Label']==0]['Sentence'].head(10)

print("Malicious samples:\n", malicious)
print("\nSafe samples:\n", safe)


Malicious samples:
 1                  <tt onmouseover="alert(1)">test</tt>
11    <a onblur=alert(1) tabindex=1 id=x></a><input ...
12    <col draggable="true" ondragenter="alert(1)">t...
13        <caption onpointerdown=alert(1)>XSS</caption>
16    <caption id=x tabindex=1 ondeactivate=alert(1)...
17          <style onmouseleave="alert(1)">test</style>
18                   <svg><meta onload=alert(1)></meta>
20    <figcaption onpointerleave=alert(1)>XSS</figca...
23    <datalist id=x tabindex=1 onfocusin=alert(1)><...
24            <input type=image src=1 onerror=alert(1)>
Name: Sentence, dtype: object

Safe samples:
 0     <li><a href="/wiki/File:Socrates.png" class="i...
2     \t </span> <span class="reference-text">Steeri...
3     \t </span> <span class="reference-text"><cite ...
4     \t </span>. <a href="/wiki/Digital_object_iden...
5     <li id="cite_note-118"><span class="mw-cite-ba...
6     <li><a href="/wiki/Contextualism" title="Conte...
7     <li id="cite_note-Representing_c

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13686 entries, 0 to 13685
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  13686 non-null  int64 
 1   Sentence    13686 non-null  object
 2   Label       13686 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 320.9+ KB


In [6]:
df = df.drop(columns=["Unnamed: 0"])

In [7]:
df.isna().sum()

Sentence    0
Label       0
dtype: int64

In [8]:
df.describe()

Unnamed: 0,Label
count,13686.0
mean,0.538726
std,0.498516
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [9]:
X = df["Sentence"]
y = df["Label"]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [33]:
pipeline = make_pipeline(
    TfidfVectorizer(),
    RandomForestClassifier(random_state=42)
)

In [34]:


scores = cross_val_score(pipeline, X, y, cv=5, scoring='f1')
print("Cross-validation F1 scores:", scores)
print("Mean F1:", scores.mean())


Cross-validation F1 scores: [0.99762147 0.99830105 0.99898132 0.99728076 0.99762147]
Mean F1: 0.9979612176767129


In [37]:
pipeline.fit(X_train, y_train)
new_payloads = [
    "<script>alert('XSS')</script>",
    "<img src=x onerror=alert(1)>",
    "Welcome to profile page",
    "Hello user, how are you?"
]

preds = pipeline.predict(new_payloads)
list(zip(new_payloads, preds))


[("<script>alert('XSS')</script>", np.int64(1)),
 ('<img src=x onerror=alert(1)>', np.int64(1)),
 ('Welcome to profile page', np.int64(0)),
 ('Hello user, how are you?', np.int64(0))]

In [38]:
def obfuscate_examples(payload):
    variants = []
    # basic variants
    variants.append(payload)
    variants.append(payload.replace("<", "&lt;").replace(">", "&gt;"))             # entities
    variants.append(payload.replace("<script>", "<scr<script>ipt>"))               # broken tag
    variants.append(payload.replace("alert", "a l e r t"))                        # spaces in keyword
    variants.append(payload.replace("alert(", "al&#101;rt("))                      # entity inside
    variants.append(payload.upper())                                              # upper-case
    variants.append(payload.replace("<", "< "))                                   # space after <
    return list(set(variants))

xss_examples = ["<script>alert('XSS')</script>", "<img src=x onerror=alert(1)>"]
tests = []
for ex in xss_examples:
    tests += obfuscate_examples(ex)

# add some benign variants too
benign = ["Welcome to profile page", "Hello user, how are you?"]
tests += benign

for t in tests:
    print(t, "->", pipeline.predict([t])[0])


<script>a l e r t('XSS')</script> -> 0
< script>alert('XSS')< /script> -> 1
<scr<script>ipt>alert('XSS')</script> -> 1
<script>al&#101;rt('XSS')</script> -> 1
<script>alert('XSS')</script> -> 1
<SCRIPT>ALERT('XSS')</SCRIPT> -> 1
&lt;script&gt;alert('XSS')&lt;/script&gt; -> 1
<IMG SRC=X ONERROR=ALERT(1)> -> 1
<img src=x onerror=a l e r t(1)> -> 1
< img src=x onerror=alert(1)> -> 1
<img src=x onerror=alert(1)> -> 1
&lt;img src=x onerror=alert(1)&gt; -> 1
<img src=x onerror=al&#101;rt(1)> -> 1
Welcome to profile page -> 0
Hello user, how are you? -> 0


In [None]:
# ---------- Vectorization ----------
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3,6), max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [12]:
model = LogisticRegression(max_iter=1000, class_weight="balanced")
model.fit(X_train_vec, y_train)

In [13]:
# ---------- Evaluate ----------
y_pred = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=["Safe","XSS"]))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9985390796201608

Classification Report:
               precision    recall  f1-score   support

        Safe       1.00      1.00      1.00      1263
         XSS       1.00      1.00      1.00      1475

    accuracy                           1.00      2738
   macro avg       1.00      1.00      1.00      2738
weighted avg       1.00      1.00      1.00      2738


Confusion Matrix:
 [[1263    0]
 [   4 1471]]


In [39]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(random_state=42, n_estimators=200)
model_rf.fit(X_train_vec, y_train)


In [40]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1263
           1       1.00      1.00      1.00      1475

    accuracy                           1.00      2738
   macro avg       1.00      1.00      1.00      2738
weighted avg       1.00      1.00      1.00      2738

[[1263    0]
 [   4 1471]]


In [41]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, vectorizer.fit_transform(X), y, cv=5, scoring='f1')
print("Cross-validation F1 scores:", scores)
print("Mean F1:", scores.mean())


Cross-validation F1 scores: [0.99762147 0.99796196 0.99898132 0.99693982 0.99591559]
Mean F1: 0.9974840321421595


In [14]:
def predict_xss(samples):
    X_new = vectorizer.transform(samples)
    preds = model.predict(X_new)
    return ["Safe" if p == 0 else "XSS Attack" for p in preds]

In [15]:

test_samples = [
    #  Benign
    "<h1>Welcome to my website</h1>",
    "<p>This is a safe paragraph.</p>",
    "<img src=x onerror=alert('Hacked!')>",
    "<a href='https://github.com'>GitHub</a>",
    "<form action='/submit' method='post'><input type='text'></form>",
    "<img src='logo.png' alt='site logo'>",

    #  XSS Attacks
    "<script>alert('XSS')</script>",
    "<img src=x onerror=alert('Hacked!')>",
    "<body onload=alert('XSS')>",
    "<iframe src='javascript:alert(1)'></iframe>",
    "<a href='javascript:alert(1337)'>Click me</a>",
    "<div style=\"background:url(javascript:alert('XSS'))\">",
    "<svg/onload=alert('XSS')>",
    "<object data='javascript:alert(1)'>",
    "<meta http-equiv=\"refresh\" content=\"0;url=javascript:alert(1)\">",
    "<input type='text' value='<script>alert(1)</script>'>"
]

print("\nExtended Predictions:")
print(predict_xss(test_samples))



Extended Predictions:
['Safe', 'Safe', 'XSS Attack', 'Safe', 'Safe', 'XSS Attack', 'XSS Attack', 'XSS Attack', 'XSS Attack', 'XSS Attack', 'XSS Attack', 'XSS Attack', 'XSS Attack', 'XSS Attack', 'XSS Attack', 'XSS Attack']


In [None]:

import joblib

joblib.dump(model, "xss_model.pkl")
joblib.dump(vectorizer, "xss_vectorizer.pkl")

print(" Model and Vectorizer saved successfully!")