In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import tensorflow as tf
import numpy as np
from sklearn.metrics import confusion_matrix
from tensorflow.keras import models, layers
import warnings

# Install a blanket "ignore" filter so no warning of any category is shown
# for the rest of the session.
warnings.filterwarnings(action='ignore')

In [2]:
# Read the dataset from a CSV in the working directory. The 'utf-8-sig' codec
# decodes UTF-8 and drops a leading byte-order mark if the file has one.
df = pd.read_csv('XSS_dataset.csv', encoding='utf-8-sig')

# 'Sentence' is the text that gets vectorized later; 'Label' is the target
# passed to train_test_split alongside it.
X, y = df['Sentence'], df['Label']

In [3]:
import nltk

# Fetch NLTK's 'stopwords' corpus; the English list is read from it later via
# stopwords.words('english') when the vectorizer is built.
# Kept as the cell's final bare expression: its return value is what the cell displays.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
from nltk.corpus import stopwords

In [4]:
# Bag-of-words features: keep tokens that occur in at least 2 documents and in
# at most 80% of them, and drop NLTK's English stop words.
vectorizer = CountVectorizer(min_df = 2, max_df = 0.8, stop_words = stopwords.words('english'))

# Read the raw text from `df` instead of from `X`. This cell rebinds `X` to the
# dense feature matrix, so the previous `X.values...` raised AttributeError
# whenever the cell was executed a second time. Reading from `df` makes the
# cell safe to re-run.
sentences = df['Sentence'].values.astype('U')  # force unicode str; NaN becomes 'nan'

# NOTE(review): the vocabulary (and the min_df/max_df filtering) is fit on all
# rows before the train/test split, so held-out documents influence the
# feature set. Consider splitting first and calling fit_transform on the
# training text only -- confirm whether later cells rely on the full `X`.
X = vectorizer.fit_transform(sentences).toarray()

# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

(10948, 5640)
(10948,)
(2738, 5640)
(2738,)


In [5]:
# Fit a logistic-regression classifier (library defaults) on the training split
# and predict on the held-out split.
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
y_pred = lr_clf.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order; the arguments were
# previously passed the other way round.
print(f"Accuracy of Logistic Regression on test set : {accuracy_score(y_test, y_pred)}")
print(f"F1 Score of Logistic Regression on test set : {f1_score(y_test, y_pred)}")

# Rows are true labels, columns are predictions. Index 1 is treated as the
# positive class and index 0 as the negative class (assumes exactly two labels
# with the positive one sorting last -- TODO confirm against the dataset).
confusion = confusion_matrix(y_test, y_pred)
TN, FP = confusion[0, 0], confusion[0, 1]
FN, TP = confusion[1, 0], confusion[1, 1]

# True-positive rate: share of actual positives that were predicted positive.
sensitivity = TP / float(FN + TP)
print("sensitivity=",sensitivity)

# True-negative rate: share of actual negatives that were predicted negative.
specificity = TN / (TN + FP)
print("specificity=",specificity)

# Share of predicted positives that are actually positive.
Precision = TP / float(TP + FP)
# Recall is the same quantity as sensitivity; reuse it rather than recompute.
Recall = sensitivity
F1 = 2*((Precision*Recall)/(Precision+Recall))
print ("Precision=",Precision)

Accuracy of Logistic Regression on test set : 0.9985390796201608
F1 Score of Logistic Regression on test set : 0.9986101459346769
sensitivity= 0.9979166666666667
specificity= 0.9992295839753467
Precision= 0.9993045897079277


In [6]:
from sklearn.ensemble import AdaBoostClassifier

In [7]:
# Fit an AdaBoost ensemble (n_estimators=100) on the training split and predict
# on the held-out split. The model gets its own name so the fitted
# LogisticRegression held in `lr_clf` is no longer overwritten.
ada_clf = AdaBoostClassifier(n_estimators=100)
ada_clf.fit(X_train, y_train)
y_pred = ada_clf.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order; the arguments were
# previously passed the other way round. The model name in the report lines
# was misspelled and is corrected here.
print(f"Accuracy of AdaBoost on test set : {accuracy_score(y_test, y_pred)}")
print(f"F1 Score of AdaBoost on test set : {f1_score(y_test, y_pred)}")

# Rows are true labels, columns are predictions. Index 1 is treated as the
# positive class and index 0 as the negative class (assumes exactly two labels
# with the positive one sorting last -- TODO confirm against the dataset).
confusion = confusion_matrix(y_test, y_pred)
TN, FP = confusion[0, 0], confusion[0, 1]
FN, TP = confusion[1, 0], confusion[1, 1]

# True-positive rate: share of actual positives that were predicted positive.
sensitivity = TP / float(FN + TP)
print("sensitivity=",sensitivity)

# True-negative rate: share of actual negatives that were predicted negative.
specificity = TN / (TN + FP)
print("specificity=",specificity)

# Share of predicted positives that are actually positive.
Precision = TP / float(TP + FP)
# Recall is the same quantity as sensitivity; reuse it rather than recompute.
Recall = sensitivity
F1 = 2*((Precision*Recall)/(Precision+Recall))
print ("Precision=",Precision)

Accuracy of AadaBoost on test set : 0.9992695398100804
F1 Score of AadaBoost on test set : 0.9993050729673384
sensitivity= 0.9986111111111111
specificity= 1.0
Precision= 1.0


In [8]:
# Fit a Gaussian naive Bayes classifier on the training split and predict on
# the held-out split.
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)
y_pred = nb_clf.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order; the arguments were
# previously passed the other way round.
print(f"Accuracy of Naive Bayes on test set : {accuracy_score(y_test, y_pred)}")
print(f"F1 Score of Naive Bayes on test set : {f1_score(y_test, y_pred)}")

# Rows are true labels, columns are predictions. Index 1 is treated as the
# positive class and index 0 as the negative class (assumes exactly two labels
# with the positive one sorting last -- TODO confirm against the dataset).
confusion = confusion_matrix(y_test, y_pred)
TN, FP = confusion[0, 0], confusion[0, 1]
FN, TP = confusion[1, 0], confusion[1, 1]

# True-positive rate: share of actual positives that were predicted positive.
sensitivity = TP / float(FN + TP)
print("sensitivity=",sensitivity)

# True-negative rate: share of actual negatives that were predicted negative.
specificity = TN / (TN + FP)
print("specificity=",specificity)

# Share of predicted positives that are actually positive.
Precision = TP / float(TP + FP)
# Recall is the same quantity as sensitivity; reuse it rather than recompute.
Recall = sensitivity
F1 = 2*((Precision*Recall)/(Precision+Recall))
print ("Precision=",Precision)

Accuracy of Naive Bayes on test set : 0.8889700511322133
F1 Score of Naive Bayes on test set : 0.9044626021370208
sensitivity= 0.9993055555555556
specificity= 0.7665639445300462
Precision= 0.8260619977037887


In [9]:
import xgboost as xgb

In [10]:
# Fit an XGBoost classifier (n_estimators=100) on the training split and
# predict on the held-out split. The model gets its own name so the fitted
# GaussianNB held in `nb_clf` is no longer overwritten.
xgb_clf = xgb.XGBClassifier(n_estimators=100)
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order; the arguments were
# previously passed the other way round.
print(f"Accuracy of XGBoost on test set : {accuracy_score(y_test, y_pred)}")
print(f"F1 Score of XGBoost on test set : {f1_score(y_test, y_pred)}")

# Rows are true labels, columns are predictions. Index 1 is treated as the
# positive class and index 0 as the negative class (assumes exactly two labels
# with the positive one sorting last -- TODO confirm against the dataset).
confusion = confusion_matrix(y_test, y_pred)
TN, FP = confusion[0, 0], confusion[0, 1]
FN, TP = confusion[1, 0], confusion[1, 1]

# True-positive rate: share of actual positives that were predicted positive.
sensitivity = TP / float(FN + TP)
print("sensitivity=",sensitivity)

# True-negative rate: share of actual negatives that were predicted negative.
specificity = TN / (TN + FP)
print("specificity=",specificity)

# Share of predicted positives that are actually positive.
Precision = TP / float(TP + FP)
# Recall is the same quantity as sensitivity; reuse it rather than recompute.
Recall = sensitivity
F1 = 2*((Precision*Recall)/(Precision+Recall))
print ("Precision=",Precision)

Accuracy of XGBoost on test set : 0.9970781592403214
F1 Score of XGBoost on test set : 0.9972183588317107
sensitivity= 0.9958333333333333
specificity= 0.9984591679506933
Precision= 0.9986072423398329


In [12]:
from sklearn.tree import DecisionTreeClassifier

In [13]:
# Fit an entropy-criterion decision tree (seeded with random_state=0) on the
# training split and predict on the held-out split. The model gets its own
# name instead of reusing `nb_clf`, which an earlier cell binds to the
# naive Bayes model.
dt_clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
dt_clf.fit(X_train, y_train)
y_pred = dt_clf.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order; the arguments were
# previously passed the other way round.
print(f"Accuracy of Decision Tree on test set : {accuracy_score(y_test, y_pred)}")
print(f"F1 Score of Decision Tree on test set : {f1_score(y_test, y_pred)}")

# Rows are true labels, columns are predictions. Index 1 is treated as the
# positive class and index 0 as the negative class (assumes exactly two labels
# with the positive one sorting last -- TODO confirm against the dataset).
confusion = confusion_matrix(y_test, y_pred)
TN, FP = confusion[0, 0], confusion[0, 1]
FN, TP = confusion[1, 0], confusion[1, 1]

# True-positive rate: share of actual positives that were predicted positive.
sensitivity = TP / float(FN + TP)
print("sensitivity=",sensitivity)

# True-negative rate: share of actual negatives that were predicted negative.
specificity = TN / (TN + FP)
print("specificity=",specificity)

# Share of predicted positives that are actually positive.
Precision = TP / float(TP + FP)
# Recall is the same quantity as sensitivity; reuse it rather than recompute.
Recall = sensitivity
F1 = 2*((Precision*Recall)/(Precision+Recall))
print ("Precision=",Precision)

Accuracy of Decision Tree on test set : 0.9989043097151206
F1 Score of Decision Tree on test set : 0.9989579715178881
sensitivity= 0.9986111111111111
specificity= 0.9992295839753467
Precision= 0.9993050729673384
