<a href="https://colab.research.google.com/github/Hind117/DefenseMechanisms_Project/blob/main/DefenseMechanisms_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#import libraries
import pandas as pd
import re
import nltk
import csv
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
#word_tokenize needs the Punkt tokenizer data. Older NLTK releases load it from the
#'punkt' package, recent releases load it from 'punkt_tab', so fetch both.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

**Data exploration and cleaning**

In [None]:
#load the XSS dataset (absolute path under /content, as used on Colab)
df = pd.read_csv("/content/XSS_dataset.csv")

#preview the first rows
df.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Label
0,0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,2,"\t </span> <span class=""reference-text"">Steeri...",0
3,3,"\t </span> <span class=""reference-text""><cite ...",0
4,4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0


In [None]:
#how many columns and rows
df.shape

(13686, 3)

In [None]:
#share of rows in each class (normalize=True returns proportions, not counts)
df['Label'].value_counts(normalize = True)

1    0.538726
0    0.461274
Name: Label, dtype: float64

In [None]:
#number of rows whose 'Sentence' repeats an earlier row
df.duplicated(['Sentence']).sum()

2769

In [None]:
df.columns

Index(['Unnamed: 0', 'Sentence', 'Label'], dtype='object')

In [None]:
#remove the id column in place; pop() also returns the removed column, which is why it is echoed as the cell output
df.pop('Unnamed: 0')

0            0
1            1
2            2
3            3
4            4
         ...  
13681    13681
13682    13682
13683    13683
13684    13684
13685    13685
Name: Unnamed: 0, Length: 13686, dtype: int64

In [None]:
df.columns

Index(['Sentence', 'Label'], dtype='object')

In [None]:
#number of rows that are exact duplicates of an earlier row (all remaining columns equal)
df.duplicated().sum()

2769

In [None]:
#remove duplicate rows in place (the first occurrence of each is kept, original index labels are retained)
df.drop_duplicates(inplace = True)

In [None]:
df.shape

(10917, 2)

In [None]:
#share of rows in each class after removing duplicates (proportions, not counts)
df['Label'].value_counts(normalize = True)

1    0.670789
0    0.329211
Name: Label, dtype: float64

Tokenization

In [None]:
#tokenize function 
#mnemonic word substituted for each special character
_SPECIAL_CHAR_WORDS = {
    '<': 'less', '>': 'gret', '/': 'slsh', '=': 'eql', '"': 'dqout',
    ':': 'coln', '(': 'opbrk', ')': 'cbrk', '#': 'hash', '&': 'and',
    '-': 'hyph', '_': 'udsc', '{': 'opcrl', '}': 'clcrl', '@': 'at',
    '^': 'pow', '.': 'dot', '%': 'perc', '$': 'dolr', '!': 'exlm',
    '*': 'all', ';': 'semicoln', '\\': 'bslsh', '`': 'backtik', ',': 'coma',
    '‘': 'opapost', '’': 'capost', '+': 'plus', '~': 'tlde', '?': 'qusm',
    '[': 'opbrnt', ']': 'cbrnt', '\'': 'sqout', '|': 'pipe', '–': 'dhyph',
}

#one-pass translation table: every special character becomes its space-padded
#mnemonic and every ASCII digit becomes ' int '
_TOKEN_TABLE = {ord(char): ' ' + word + ' ' for char, word in _SPECIAL_CHAR_WORDS.items()}
_TOKEN_TABLE.update({ord(digit): ' int ' for digit in '0123456789'})


def tokenize(payload):
    """Turn a raw payload into a space-separated string of word tokens.

    The payload is lower-cased, each special character is replaced by a short
    mnemonic word (for example '<' becomes 'less'), each ASCII digit is replaced
    by 'int', and the result is split with NLTK's word_tokenize and re-joined
    with single spaces.

    Digits are replaced one at a time, so a multi-digit number such as '20'
    yields 'int int'.

    Args:
        payload: the raw payload string.

    Returns:
        The tokenized payload as a single string.
    """
    #a single translate() pass replaces the per-character str.replace loop, which
    #rescanned the whole string for every character of the payload
    text = payload.lower().translate(_TOKEN_TABLE)

    #tokenize each payload
    return " ".join(word_tokenize(text))


#quick check of tokenize() on a sample XSS payload
text = '<svg><meta onload=alert(1)></meta>'
tokenize(text)

'less svg gret less meta onload eql alert opbrk int cbrk gret less slsh meta gret'

In [None]:
#add a new column (tokenized_payload) by running tokenize() on every value of the 'Sentence' column
df['tokenized_payload'] = df['Sentence'].apply(tokenize)

In [None]:
df.head()

Unnamed: 0,Sentence,Label,tokenized_payload
0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0,less li gret less a href eql dqout slsh wiki s...
1,"<tt onmouseover=""alert(1)"">test</tt>",1,less tt onmouseover eql dqout alert opbrk int ...
2,"\t </span> <span class=""reference-text"">Steeri...",0,less slsh span gret less span class eql dqout ...
3,"\t </span> <span class=""reference-text""><cite ...",0,less slsh span gret less span class eql dqout ...
4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0,less slsh span gret dot less a href eql dqout ...


In [None]:
#convenience alias for the tokenized column, displayed in the next cell
tokenized_payloads = df['tokenized_payload']

In [None]:
tokenized_payloads

0        less li gret less a href eql dqout slsh wiki s...
1        less tt onmouseover eql dqout alert opbrk int ...
2        less slsh span gret less span class eql dqout ...
3        less slsh span gret less span class eql dqout ...
4        less slsh span gret dot less a href eql dqout ...
                               ...                        
13678    less li gret less a href eql dqout slsh wiki s...
13681    less img onpointerenter eql alert opbrk int cb...
13682    less source onbeforepaste eql dqout alert opbr...
13683    less div draggable eql dqout true dqout conten...
13684    less li gret less cite id eql dqout citerefdom...
Name: tokenized_payload, Length: 10917, dtype: object

**Logistic Regression Classifier**

In [None]:
#features are the tokenized payloads, targets are the 0/1 labels
X, y = df['tokenized_payload'], df['Label']

#stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

#bag-of-words counts -> TF-IDF weighting -> logistic regression
model = LogisticRegression()
pipeline_model = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', model),
])
pipeline_model.fit(X_train, y_train)

#predict once and derive every metric from the same predictions
y_pred = pipeline_model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred)*100)
print(classification_report(y_test, y_pred))

#confusion matrix (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(conf_matrix)

Accuracy: 99.77106227106228
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       719
           1       1.00      1.00      1.00      1465

    accuracy                           1.00      2184
   macro avg       1.00      1.00      1.00      2184
weighted avg       1.00      1.00      1.00      2184

[[ 717    2]
 [   3 1462]]


**Naive Bayes Classifier**

In [None]:

#bag-of-words counts -> TF-IDF weighting -> multinomial naive Bayes
naive_bayes_model = MultinomialNB()
pipeline_naive_bayes_model = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', naive_bayes_model),
])
pipeline_naive_bayes_model.fit(X_train, y_train)

#predict once and derive every metric from the same predictions
y_pred_naive_bayes_model = pipeline_naive_bayes_model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred_naive_bayes_model)*100)
print(classification_report(y_test, y_pred_naive_bayes_model))

#confusion matrix (rows: true class, columns: predicted class)
nb_conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_naive_bayes_model)
print(nb_conf_matrix)



Accuracy: 98.3974358974359
              precision    recall  f1-score   support

           0       0.99      0.96      0.98       719
           1       0.98      1.00      0.99      1465

    accuracy                           0.98      2184
   macro avg       0.99      0.98      0.98      2184
weighted avg       0.98      0.98      0.98      2184

[[ 689   30]
 [   5 1460]]


**Support Vector Machine Classifier**

In [None]:

#bag-of-words counts -> TF-IDF weighting -> support vector classifier (default settings)
SVC_model = SVC()
pipeline_SVC_model = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC_model),
])
pipeline_SVC_model.fit(X_train, y_train)

#predict once and derive every metric from the same predictions
y_SVC_pred = pipeline_SVC_model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_SVC_pred)*100)
print(classification_report(y_test, y_SVC_pred))

#confusion matrix (rows: true class, columns: predicted class)
svc_conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_SVC_pred)
print(svc_conf_matrix)

Accuracy: 99.86263736263736
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       719
           1       1.00      1.00      1.00      1465

    accuracy                           1.00      2184
   macro avg       1.00      1.00      1.00      2184
weighted avg       1.00      1.00      1.00      2184

[[ 718    1]
 [   2 1463]]


**Random Forest Classifier**

In [None]:

#random_state fixes the forest's randomness so reruns report the same scores;
#42 matches the seed used for the train/test split
RF_model = RandomForestClassifier(random_state=42)

#bag-of-words counts -> TF-IDF weighting -> random forest
pipeline_RF_model = Pipeline([('vect', CountVectorizer()),
                              ('tfidf', TfidfTransformer()),
                              ('clf', RF_model)])

pipeline_RF_model.fit(X_train, y_train)

#predict once and derive every metric from the same predictions
y_RF_pred = pipeline_RF_model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_RF_pred)*100)

print(classification_report(y_test, y_RF_pred))

#confusion matrix (rows: true class, columns: predicted class)
rf_conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_RF_pred)
print(rf_conf_matrix)

Accuracy: 99.86263736263736
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       719
           1       1.00      1.00      1.00      1465

    accuracy                           1.00      2184
   macro avg       1.00      1.00      1.00      2184
weighted avg       1.00      1.00      1.00      2184

[[ 718    1]
 [   2 1463]]


**Decision Tree Classifier**

In [None]:

#random_state fixes the tree's randomness so reruns report the same scores;
#42 matches the seed used for the train/test split
DT_model = DecisionTreeClassifier(random_state=42)


#bag-of-words counts -> TF-IDF weighting -> decision tree
pipeline_DT_model = Pipeline([('vect', CountVectorizer()),
                              ('tfidf', TfidfTransformer()),
                              ('clf', DT_model)])

pipeline_DT_model.fit(X_train, y_train) #train the model

#predict once and derive every metric from the same predictions
y_DT_pred = pipeline_DT_model.predict(X_test)

#testing and calculating the accuracy
print('Accuracy:', accuracy_score(y_test, y_DT_pred)*100)

print(classification_report(y_test, y_DT_pred))

#confusion matrix (rows: true class, columns: predicted class)
dt_conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_DT_pred)
print(dt_conf_matrix)

Accuracy: 99.54212454212454
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       719
           1       1.00      1.00      1.00      1465

    accuracy                           1.00      2184
   macro avg       0.99      1.00      0.99      2184
weighted avg       1.00      1.00      1.00      2184

[[ 716    3]
 [   7 1458]]


**FastText Model**

In [None]:
#stratified 80/20 split of the whole frame, with the same seed as the classifier split above
#stratify on the frame's own label column so this cell does not depend on `y` from an earlier cell
train, test = train_test_split(df, test_size=0.2, random_state=42, shuffle=True, stratify=df['Label'])
#save both splits as csv files (the index is not written)
train.to_csv('/content/train.csv',index=False)
test.to_csv('/content/test.csv',index=False)

In [None]:
#read the saved splits back from disk; the files were written without an index, so rows are numbered from 0 again
train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")

In [None]:
train.head()

Unnamed: 0,Sentence,Label,tokenized_payload
0,"<li class=""toctree-l1""><a class=""reference int...",0,less li class eql dqout toctree hyph l int dqo...
1,"<div draggable=""true"" contenteditable>drag me<...",1,less div draggable eql dqout true dqout conten...
2,"<button draggable=""true"" ondragleave=""alert(1)...",1,less button draggable eql dqout true dqout ond...
3,<map onpointerout=alert(1)>XSS</map>,1,less map onpointerout eql alert opbrk int cbrk...
4,"<!DOCTYPE html><html class=""no-js"" lang=""en"" >",0,less exlm doctype html gret less html class eq...


In [None]:
#remove the raw 'Sentence' column from the training frame in place; pop() returns the removed column, hence the cell output
train.pop('Sentence')

0       <li class="toctree-l1"><a class="reference int...
1       <div draggable="true" contenteditable>drag me<...
2       <button draggable="true" ondragleave="alert(1)...
3                    <map onpointerout=alert(1)>XSS</map>
4         <!DOCTYPE html><html class="no-js" lang="en" > 
                              ...                        
8728    <style>:target {transform: rotate(180deg);}</s...
8729    <menuitem id=x tabindex=1 onfocus=alert(1)></m...
8730    <li><cite id="CITEREFMaker2006" class="citatio...
8731                   <s onmousemove="alert(1)">test</s>
8732    <font id=x tabindex=1 onactivate=alert(1)></font>
Name: Sentence, Length: 8733, dtype: object

In [None]:
train.head()

Unnamed: 0,Label,tokenized_payload
0,0,less li class eql dqout toctree hyph l int dqo...
1,1,less div draggable eql dqout true dqout conten...
2,1,less button draggable eql dqout true dqout ond...
3,1,less map onpointerout eql alert opbrk int cbrk...
4,0,less exlm doctype html gret less html class eq...


In [None]:
#replace the numeric training labels with '__label__'-prefixed names (0 -> Nonmalicious, 1 -> Malicious) for the fastText input file
train['Label'] = train['Label'].map({0:'__label__Nonmalicious', 1:'__label__Malicious'})

In [None]:
train.head()

Unnamed: 0,Label,tokenized_payload
0,__label__Nonmalicious,less li class eql dqout toctree hyph l int dqo...
1,__label__Malicious,less div draggable eql dqout true dqout conten...
2,__label__Malicious,less button draggable eql dqout true dqout ond...
3,__label__Malicious,less map onpointerout eql alert opbrk int cbrk...
4,__label__Nonmalicious,less exlm doctype html gret less html class eq...


In [None]:
#replace the numeric test labels with '__label__'-prefixed names (0 -> Nonmalicious, 1 -> Malicious) for the fastText input file
test['Label'] = test['Label'].map({0:'__label__Nonmalicious', 1:'__label__Malicious'})

In [None]:
test.head()

Unnamed: 0,Sentence,Label,tokenized_payload
0,<keygen onpointerdown=alert(1)>XSS</keygen>,__label__Malicious,less keygen onpointerdown eql alert opbrk int ...
1,<link id=x tabindex=1 ondeactivate=alert(1)></...,__label__Malicious,less link id eql x tabindex eql int ondeactiva...
2,"<li id=""cite_note-sak2014-269""><span class=""mw...",__label__Nonmalicious,less li id eql dqout cite udsc note hyph sak i...
3,"<pre oncontextmenu=""alert(1)"">test</pre>",__label__Malicious,less pre oncontextmenu eql dqout alert opbrk i...
4,"<center onkeydown=""alert(1)"" contenteditable>t...",__label__Malicious,less center onkeydown eql dqout alert opbrk in...


In [None]:
#remove the raw 'Sentence' column from the test frame in place; pop() returns the removed column, hence the cell output
test.pop('Sentence')

0             <keygen onpointerdown=alert(1)>XSS</keygen>
1       <link id=x tabindex=1 ondeactivate=alert(1)></...
2       <li id="cite_note-sak2014-269"><span class="mw...
3                <pre oncontextmenu="alert(1)">test</pre>
4       <center onkeydown="alert(1)" contenteditable>t...
                              ...                        
2179    <ruby onkeypress="alert(1)" contenteditable>te...
2180                        <input id=x onfocus=alert(1)>
2181    <ul><li><a href="#CITEREFLucas1961">Lucas 1961...
2182               <small onclick="alert(1)">test</small>
2183    <section id=x tabindex=1 ondeactivate=alert(1)...
Name: Sentence, Length: 2184, dtype: object

In [None]:
test.head()

Unnamed: 0,Label,tokenized_payload
0,__label__Malicious,less keygen onpointerdown eql alert opbrk int ...
1,__label__Malicious,less link id eql x tabindex eql int ondeactiva...
2,__label__Nonmalicious,less li id eql dqout cite udsc note hyph sak i...
3,__label__Malicious,less pre oncontextmenu eql dqout alert opbrk i...
4,__label__Malicious,less center onkeydown eql dqout alert opbrk in...


In [None]:
#write the fastText input files: one example per line, "<label> <tokenized payload>"
#Plain file writes replace DataFrame.to_csv here because the earlier call relied on
#quotechar="", which the csv module rejects from Python 3.11 on. The files differ from
#the to_csv output only in the amount of whitespace between tokens.
def _write_fasttext_file(frame, path):
    """Write frame's 'Label' and 'tokenized_payload' columns to path, one example per line."""
    #missing values are written as empty text, as to_csv did
    labels = frame['Label'].fillna('').astype(str)
    payloads = frame['tokenized_payload'].fillna('').astype(str)
    with open(path, 'w', encoding='utf-8') as handle:
        for label, payload in zip(labels, payloads):
            handle.write(label + ' ' + payload + '\n')


_write_fasttext_file(train, '/content/train.txt')
_write_fasttext_file(test, '/content/test.txt')



In [None]:
!pip install fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 6.8 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.9.2-py2.py3-none-any.whl (213 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3144540 sha256=76e4425fac7bf42bf107c9437674e76bda7b2814bbf9b3caf9a189bdd8ae7fa0
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.9.2


In [None]:
import fasttext

In [None]:
#train a supervised fastText classifier on the training text file (no hyperparameters are overridden)
fasttext_model = fasttext.train_supervised('/content/train.txt')

In [None]:
#evaluate the model on the held-out test file

#the result of test() is unpacked as (number of samples, precision, recall)
Number_of_Samples, Precision, Recall = fasttext_model.test('/content/test.txt') #test the model

#F-measure: harmonic mean of precision and recall, printed as a percentage
F_Measure = ((2 * Precision * Recall) / (Precision + Recall))
print(F_Measure*100)

99.81684981684981


In [None]:
#calculate the accuracy of the fastText model over every row of df
#NOTE(review): df holds the training rows as well as the held-out rows, so this
#figure is not a held-out estimate

#add 'predicted' column after applying model.predict() function to 'tokenized_payload' column
df["predicted"] = df["tokenized_payload"].apply(lambda x: fasttext_model.predict(x)[0][0])

#map the predicted '__label__' names back to the numeric classes used in 'Label'
df['predicted'] = df['predicted'].map({'__label__Nonmalicious':0, '__label__Malicious':1})

#confusion_matrix puts true classes on rows and predicted classes on columns, so with
#labels [0, 1] the flattened cells are TN, FP, FN, TP. Passing labels keeps the matrix
#2x2 even if only one class is ever predicted.
TN, FP, FN, TP = confusion_matrix(df["Label"], df["predicted"], labels=[0, 1]).ravel()

#calculate the accuracy
Accuracy = (TP+TN)/(TP+TN+FP+FN)
print(Accuracy*100)

99.79847943574242


In [None]:
#test the model using a new payload
#first tokenize it
print(tokenize('<svg><a><animate attributeName=href values=javascript:alert(1) /><text x=20 y=20>Click me</text></a></svg>'))


less svg gret less a gret less animate attributename eql href values eql javascript coln alert opbrk int cbrk slsh gret less text x eql int int y eql int int gret click me less slsh text gret less slsh a gret less slsh svg gret


In [None]:
#now predict the label; the raw payload is tokenized here rather than pasting the tokenizer's output by hand
fasttext_model.predict(tokenize('<svg><a><animate attributeName=href values=javascript:alert(1) /><text x=20 y=20>Click me</text></a></svg>'))

(('__label__Malicious',), array([0.97377121]))

In [None]:
print(tokenize('<x ondrag=alert(1)>drag this!'))

less x ondrag eql alert opbrk int cbrk gret drag this exlm


In [None]:
#predict the label of the second sample; the raw payload is tokenized here rather than pasting the tokenizer's output by hand
fasttext_model.predict(tokenize('<x ondrag=alert(1)>drag this!'))

(('__label__Malicious',), array([1.00000811]))