In [1]:
#pip install opencv-python


In [2]:
import numpy as np
import pandas as pd

import keras
import cv2

from sklearn.model_selection import train_test_split

import tensorflow as tf
from keras.models import Sequential
from keras.models import model_from_json
from keras.layers import Dense, Activation, Conv2D, MaxPooling2D,Flatten



In [3]:
def load_csv(path="XSS_dataset.csv", encoding=None):
    """Load the XSS dataset into the module-level ``xssdf`` DataFrame.

    Every data line is expected to look like ``index,sentence,label``. Only
    the first and the last comma on a line act as separators, so commas
    inside the sentence are preserved. Quote characters are left untouched.
    The first line of the file is treated as a header and skipped; blank
    lines (for example a trailing newline at the end of the file) are ignored.

    Args:
        path: CSV file to read.
        encoding: Text encoding handed to ``open``. ``None`` keeps the
            platform default.

    Returns:
        None. The result is stored in the global ``xssdf`` with a string
        ``Sentence`` column (tabs removed) and an integer ``Label`` column.

    Raises:
        ValueError: If a non-blank data line has fewer than two commas, or a
            label cannot be converted to ``int``.
    """
    global xssdf

    rows = []
    with open(path, 'r', encoding=encoding) as f:
        for line_no, line in enumerate(f, start=1):
            if line_no == 1:
                continue  # header row
            record = line.split('\n')[0]
            if not record.strip():
                continue  # blank line: nothing to parse
            head = record.split(',', maxsplit=1)
            tail = head[1].rsplit(',', maxsplit=1) if len(head) == 2 else []
            if len(tail) != 2:
                raise ValueError(
                    "line {0} of {1!r} is not 'index,sentence,label': {2!r}".format(
                        line_no, path, record))
            sentence, label = tail
            rows.append([sentence, label])

    xssdf = pd.DataFrame(rows, columns=['Sentence', 'Label'])
    # Strip tab characters from every column before fixing the dtypes.
    xssdf = xssdf.replace({'\t': ''}, regex=True)
    xssdf['Sentence'] = xssdf['Sentence'].astype(str)
    xssdf['Label'] = xssdf['Label'].astype(int)
    
    
# Populate the module-level xssdf DataFrame that the following cells read.
load_csv()

In [4]:
# Features are the raw sentences; targets are the integer labels.
X = xssdf['Sentence']
y = xssdf['Label'].values
# Hold out 20% for testing. The seed is fixed so the split (and therefore every
# metric reported below) is the same on each Restart & Run All.
trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Drop the pandas index: preprocessing() indexes its input positionally (0..n-1).
train_sentences, test_sentences = trainX.values, testX.values

In [6]:
def convert_to_ascii(sentence):
    """Encode a string as a 100x100 float array of character codes.

    Characters with a code point of at most 128 are stored as that code
    point. Five typographic punctuation marks are mapped to the codes
    129-133 (see the table below). Every other character is dropped. The
    kept codes are written in order (row-major) and the remaining cells are
    left at zero.

    Each character now contributes at most one cell. The right single quote
    (U+2019) used to emit two codes (134 and then 132), which shifted every
    following character by one cell; it now emits 132 only. Input with more
    than 10000 kept characters is truncated instead of raising IndexError.

    Args:
        sentence: The text to encode (any iterable of single characters).

    Returns:
        numpy.ndarray of shape (100, 100) and dtype float64.
    """
    special_codes = {
        8221: 129,  # right double quotation mark
        8220: 130,  # left double quotation mark
        8216: 131,  # left single quotation mark
        8217: 132,  # right single quotation mark
        8211: 133,  # en dash
    }
    side = 100
    capacity = side * side

    sentence_ascii = []
    for ch in sentence:
        code = ord(ch)
        if code in special_codes:
            sentence_ascii.append(special_codes[code])
        elif code <= 128:
            sentence_ascii.append(code)
        # any other character is intentionally discarded

    zer = np.zeros(capacity)
    used = min(len(sentence_ascii), capacity)
    zer[:used] = sentence_ascii[:used]

    return zer.reshape(side, side)

In [7]:
def preprocessing(sentences):
    """Turn a sequence of strings into a stack of scaled 100x100 arrays.

    Each sentence is encoded with convert_to_ascii, passed through
    cv2.resize with a (100, 100) target, and divided by 128.

    Args:
        sentences: Positionally indexable sequence of strings (``len`` and
            ``sentences[i]`` for ``i`` in ``0..len-1`` must work).

    Returns:
        numpy.ndarray of shape (len(sentences), 100, 100).
    """
    count = len(sentences)
    batch = np.zeros((count, 100, 100))

    for idx in range(count):
        grid = np.asarray(convert_to_ascii(sentences[idx]), dtype='float')
        # convert_to_ascii already yields a 100x100 grid, so the target size
        # here equals the input size.
        resized = cv2.resize(grid, dsize=(100, 100), interpolation=cv2.INTER_CUBIC)
        batch[idx] = resized / 128
    return batch

In [8]:
# Encode the training sentences as an (n_train, 100, 100) array.
train_arr = preprocessing(train_sentences)

In [9]:
# Encode the test sentences as an (n_test, 100, 100) array.
test_arr = preprocessing(test_sentences)

In [10]:
# Append a single channel axis so every sample has the (100, 100, 1) shape
# declared as the first Conv2D layer's input_shape.
sample_shape = (100, 100, 1)
data = train_arr.reshape((train_arr.shape[0],) + sample_shape)
test_data = test_arr.reshape((test_arr.shape[0],) + sample_shape)
print("Train data shape : ",data.shape)
print("Test data shape : ",test_data.shape)

Train data shape :  (10952, 100, 100, 1)
Test data shape :  (2739, 100, 100, 1)


In [11]:
# A basic CNN: three Conv2D + MaxPooling2D stages followed by a dense head
# that ends in a single sigmoid unit (binary output).
model = tf.keras.models.Sequential()

# Convolutional feature extractor.
model.add(tf.keras.layers.Conv2D(64, (3, 3), activation=tf.nn.relu, input_shape=(100, 100, 1)))
model.add(tf.keras.layers.MaxPooling2D(2, 2))

model.add(tf.keras.layers.Conv2D(128, (3, 3), activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(2, 2))

model.add(tf.keras.layers.Conv2D(256, (3, 3), activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(2, 2))

# Dense classifier head.
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [12]:
# Binary cross-entropy matches the single sigmoid output unit.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 98, 98, 64)        640       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 49, 49, 64)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 47, 47, 128)       73856     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 23, 23, 128)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 21, 21, 256)       295168    
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 10, 10, 256)       0         
_________________________________________________________________
flatten (Flatten)            (None, 25600)             0

In [13]:
num_classes = 2
# One-hot encode the labels, not the image tensors: to_categorical expects
# integer class ids, and feeding it `data` / `test_data` builds an enormous,
# meaningless array.
# NOTE: the model ends in a single sigmoid unit and is fitted on trainY / testY
# directly; these one-hot arrays are only needed for a 2-unit softmax head.
train_y = keras.utils.to_categorical(trainY, num_classes)
test_y = keras.utils.to_categorical(testY, num_classes)

In [14]:
# Training hyper-parameters.
batch_size = 128
num_epoch = 10

# Fit on the training tensors; the held-out split is evaluated after each epoch.
model_log = model.fit(
    x=data,
    y=trainY,
    validation_data=(test_data, testY),
    batch_size=batch_size,
    epochs=num_epoch,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
# Run the trained model on the test tensors; `pred` holds the raw outputs of
# the final sigmoid unit and is thresholded in the next cell.
pred=model.predict(test_data)

In [16]:
# Binarise the predictions in place: scores above 0.5 become 1, scores at or
# below 0.5 become 0. Both masks are taken before any value is overwritten.
above = pred > 0.5
below = pred <= 0.5
pred[above] = 1
pred[below] = 0

In [17]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def accuracy_function(tp,tn,fp,fn):
    """Fraction of all samples that were classified correctly.

    Returns 0.0 when there are no samples at all.
    """
    return _safe_ratio(tp + tn, tp + tn + fp + fn)



def precision_function(tp,fp):
    """Fraction of predicted positives that are truly positive.

    Returns 0.0 when nothing was predicted positive.
    """
    return _safe_ratio(tp, tp + fp)



def recall_function(tp,fn):
    """Fraction of actual positives that were predicted positive.

    Returns 0.0 when there are no actual positives.
    """
    return _safe_ratio(tp, tp + fn)



def confusion_matrix(truth,predicted):
    """Compare binary labels with binary predictions.

    Args:
        truth: Iterable of ground-truth labels. Entries other than 0 or 1
            are ignored.
        predicted: Iterable of predictions, paired with ``truth`` by
            position (``zip`` semantics, so the shorter one wins). A
            one-element array per sample is accepted.

    Returns:
        Tuple ``(accuracy, precision, recall, matrix)`` where ``matrix`` is
        ``[[TN, FN], [FP, TP]]``: rows are the predicted class and columns
        are the actual class. Ratios whose denominator is zero are reported
        as 0.0 instead of raising ZeroDivisionError.
    """
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0

    for actual, guess in zip(truth, predicted):
        if actual == 1:
            if guess == actual:
                true_positive += 1
            else:
                false_negative += 1
        elif actual == 0:
            if guess == actual:
                true_negative += 1
            else:
                false_positive += 1

    accuracy = accuracy_function(true_positive, true_negative, false_positive, false_negative)
    precision = precision_function(true_positive, false_positive)
    recall = recall_function(true_positive, false_negative)
    confusion_matrix_res = [[true_negative, false_negative], [false_positive, true_positive]]

    return (accuracy,
            precision,
            recall,
            confusion_matrix_res)

In [18]:
# Score the thresholded predictions against the held-out labels.
# The matrix is laid out as [[TN, FN], [FP, TP]].
accuracy, precision, recall, matrix = confusion_matrix(testY, pred)
report = " Accuracy : {0} \n Precision : {1} \n Recall : {2} \n Confusion matrix: {3}"
print(report.format(accuracy, precision, recall, matrix))

 Accuracy : 0.9875867104782767 
 Precision : 0.9905341446923597 
 Recall : 0.9865319865319865 
 Confusion matrix: [[1240, 20], [14, 1465]]


# Testing

In [19]:
def testing(querystring):
    """Classify a single query string with ``loaded_model`` and print the label.

    Only the query itself is preprocessed and sent through the model. The
    previous version copied the first 250 rows of the global ``testX``,
    appended the query through ``instance[-1] = ...`` on that Series slice,
    and ran the model on all of those rows just to read the last prediction.

    Args:
        querystring: Either a string, or a sequence whose first element is
            the string to classify (further elements are ignored).

    Returns:
        None. Prints 1 when the model output is above 0.5, otherwise 0.
    """
    query = querystring if isinstance(querystring, str) else querystring[0]

    instance_arr = preprocessing([query])
    instance_data = instance_arr.reshape(instance_arr.shape[0], 100, 100, 1)
    pred = loaded_model.predict(instance_data)

    res = 1 if pred[-1] > 0.5 else 0
    print(res)

In [20]:
# Persist the model in two files: the architecture as JSON (xssmodel.json)
# and the trained weights as HDF5 (xssmodel.h5).
# serialize model to JSON
model_json = model.to_json()
with open("xssmodel.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("xssmodel.h5")
print("Saved model to disk")

Saved model to disk


In [21]:
# Rebuild the architecture from the saved JSON description. The file is read
# inside a context manager so the handle is closed even if the read fails.
# (model_from_json is imported with the other dependencies at the top of the notebook.)
with open('xssmodel.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# Restore the trained weights into the rebuilt model.
loaded_model.load_weights("xssmodel.h5")
print("Loaded model from disk")
 


Loaded model from disk


In [22]:
# Other example inputs to try:
#   hello world!
#   <script>alert(document.cookie())</script>
# testing() reads only the first element of the list it is given.
testing(['architha aaa'])

0
