In [33]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
import keras
from keras.models import Sequential
from keras import layers
import tensorflow as tf
from keras.models import model_from_json
from keras.layers import Dense, Activation, Conv2D, MaxPooling2D,Flatten
import cv2
import pickle

## SQLI

In [None]:
def load_csv():
    global sqlidf
    contents=[]
    with open("sqli.csv",'r',encoding = 'utf-8') as f:
        for line in f:
            word=line.split('\n')
            list2 = [x for x in word if x]
            list1 = list2[0].rsplit(',',maxsplit=1)
            sentence=list1[0][1:]
            label=list1[1][:-1]
            listx=[sentence,label]
            contents += [listx]

    contents=contents[1:]
    sqlidf = pd.DataFrame(contents,columns=['Sentence','Label'])
    
load_csv()

: 

In [3]:
sqlidf['Sentence'] = sqlidf['Sentence'].astype(str)
sqlidf['Label']=sqlidf['Label'].astype(int)

In [4]:
X=sqlidf['Sentence']
y=sqlidf['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
vectorizer = CountVectorizer()
posts = vectorizer.fit_transform(X_train).toarray()
test_posts = vectorizer.transform(X_test).toarray()

In [6]:
input_dim = len(vectorizer.vocabulary_)

model = Sequential()
model.add(layers.Dense(128, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(128,  activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

In [7]:
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               1135616   
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,160,449
Trainable params: 1,160,449
Non-trainable params: 0
_________________________________________________________________


In [8]:
classifier_nn = model.fit(posts,y_train,
                    epochs=10,
                    verbose=True,
                    validation_data=(test_posts, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
pickle.dump(vectorizer.vocabulary_, open("dictionary.pickle", 'wb'))

In [10]:
pred=model.predict(test_posts)

In [11]:
for i in range(len(pred)):
    if pred[i]>0.5:
        pred[i]=1
    elif pred[i]<=0.5:
        pred[i]=0

In [12]:
def accuracy_function(tp,tn,fp,fn):
    
    accuracy = (tp+tn) / (tp+tn+fp+fn)
    
    return accuracy


def precision_function(tp,fp):
    
    precision = tp / (tp+fp)
    
    return precision


def recall_function(tp,fn):
    
    recall=tp / (tp+fn)
    
    return recall


def confusion_matrix(truth,predicted):
    
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    
    for true,pred in zip(truth,predicted):
        if true == 1:
            if pred == true:
                true_positive += 1
            elif pred != true:
                false_negative += 1

        elif true == 0:
            if pred == true:
                true_negative += 1
            elif pred != true:
                false_positive += 1
            
    accuracy=accuracy_function(true_positive, true_negative, false_positive, false_negative)
    precision=precision_function(true_positive, false_positive)
    recall=recall_function(true_positive, false_negative)
    confusion_matrix_res = [[true_negative, false_negative],[false_positive,true_positive]]
    
    return (accuracy,
            precision,
           recall,
           confusion_matrix_res)

In [13]:
accuracy,precision,recall, matrix =confusion_matrix(y_test,pred)
print(" Accuracy : {0} \n Precision : {1} \n Recall : {2} \n Confusion matrix: {3}".format(accuracy, precision, recall, matrix))

 Accuracy : 0.9773809523809524 
 Precision : 0.9282868525896414 
 Recall : 0.9957264957264957 
 Confusion matrix: [[588, 1], [18, 233]]


In [14]:
# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save("sqli_model.h5")
print("Saved model to disk")

Saved model to disk


## Testing

In [18]:
import sklearn
def testing1(querystring):
    instance = X_test
    instance.iloc[0] = querystring[0]
    vocabulary_to_load = pickle.load(open("dictionary.pickle", 'rb'))
    loaded_vectorizer = sklearn.feature_extraction.text.CountVectorizer(vocabulary=vocabulary_to_load)
    loaded_vectorizer._validate_vocabulary()
    instance_posts = loaded_vectorizer.transform(instance).toarray()
    
    pred = model.predict(instance_posts)
    
    if pred[0]>0.5:
        res=1
    else:
        res=0
        
    return res

In [19]:
import time
#hello world!
start = time.time()
print(testing1(["105 OR 1=1"]))
stop = time.time()
print("time = ",stop-start)

1
time =  0.1817927360534668


In [20]:
# load and evaluate a saved model
from numpy import loadtxt
from keras.models import load_model
 
# load model
model = load_model('model.h5')
# summarize model.
model.summary()
# load dataset

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               1140352   
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,165,185
Trainable params: 1,165,185
Non-trainable params: 0
_________________________________________________________________


## XSS

In [21]:
def load_csv():
    contents=[]
    with open("XSS_dataset.csv",'r') as f:
        for line in f:
            word = line.split('\n')
            sentence = word[0]
            index , string = sentence.split(',',maxsplit=1)
            sentence, label = string.rsplit(',',maxsplit=1)
            #sentence = sentence.strip('"')
            contents += [[sentence , label]]


    contents=contents[1:]
    #print(contents)
    global xssdf
    xssdf = pd.DataFrame(contents,columns=['Sentence','Label'])
    xssdf = xssdf.replace({'\t': ''}, regex=True)
    xssdf['Sentence'] = xssdf['Sentence'].astype(str)
    xssdf['Label']=xssdf['Label'].astype(int)
    
    
load_csv()

In [22]:
X = xssdf['Sentence']
y = xssdf['Label'].values
trainX, testX, trainY, testY = train_test_split(X,y, test_size=0.2)

In [23]:
train_sentences=trainX.values
test_sentences=testX.values

In [24]:
def convert_to_ascii(sentence):
    sentence_ascii=[]

    for i in sentence:
       
        if(ord(i)<8222):      # ”  :  8221
            
            if(ord(i)==8217): # ’  :  8217 
                sentence_ascii.append(134)
            
            
            if(ord(i)==8221): # ”  :  8221 ""
                sentence_ascii.append(129)
                
            if(ord(i)==8220): # “  :  8220
                sentence_ascii.append(130)
                
                
            if(ord(i)==8216): # ‘  :  8216
                sentence_ascii.append(131)
                
            if(ord(i)==8217): # ’  :  8217
                sentence_ascii.append(132)
            
            if(ord(i)==8211): # –  :  8211
                sentence_ascii.append(133)
                
                
            """
            If values less than 128 store them else discard them
            """
            if (ord(i)<=128):
                    sentence_ascii.append(ord(i))
    
            else:
                    pass
            

    zer=np.zeros((10000))

    for i in range(len(sentence_ascii)):
        zer[i]=sentence_ascii[i]

    zer.shape=(100, 100)

    return zer

In [25]:
def preprocessing(sentences):
    arr=np.zeros((len(sentences),100,100))
 
    for i in range(len(sentences)):

        image=convert_to_ascii(sentences[i])

        x=np.asarray(image,dtype='float')
        image =  cv2.resize(x, dsize=(100,100), interpolation=cv2.INTER_CUBIC)
        image/=128
        arr[i]=image
    return arr

In [26]:
train_arr = preprocessing(train_sentences)
test_arr = preprocessing(test_sentences)

In [27]:
data = train_arr.reshape(train_arr.shape[0], 100, 100, 1)
test_data = test_arr.reshape(test_arr.shape[0], 100, 100, 1)
print("Train data shape : ",data.shape)
print("Test data shape : ",test_data.shape)

Train data shape :  (10952, 100, 100, 1)
Test data shape :  (2739, 100, 100, 1)


In [30]:

model=tf.keras.models.Sequential([
    
    tf.keras.layers.Conv2D(64,(3,3), activation=tf.nn.relu, input_shape=(100,100,1)),
    tf.keras.layers.MaxPooling2D(2,2),
    
    tf.keras.layers.Conv2D(128,(3,3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    
    tf.keras.layers.Conv2D(256,(3,3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
    
])

In [31]:
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 98, 98, 64)        640       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 49, 49, 64)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 47, 47, 128)       73856     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 23, 23, 128)      0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 21, 21, 256)       295168    
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 10, 10, 256)     

In [35]:
num_classes = 2 
train_y = keras.utils.np_utils.to_categorical(data, num_classes)
test_y = keras.utils.np_utils.to_categorical(test_data, num_classes)

In [36]:
batch_size = 128
num_epoch = 10

#model training
model_log = model.fit(data, trainY,
          batch_size=batch_size,
          epochs=num_epoch,
          verbose=1,
          validation_data=( test_data,  testY)
                     )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:
pred=model.predict(test_data)

In [38]:
for i in range(len(pred)):
    if pred[i]>0.5:
        pred[i]=1
    elif pred[i]<=0.5:
        pred[i]=0

In [39]:
accuracy,precision,recall, matrix =confusion_matrix(testY,pred)
print(" Accuracy : {0} \n Precision : {1} \n Recall : {2} \n Confusion matrix: {3}".format(accuracy, precision, recall, matrix))

 Accuracy : 0.9905074844833881 
 Precision : 0.9911684782608695 
 Recall : 0.9911684782608695 
 Confusion matrix: [[1254, 13], [13, 1459]]


### Testing 

In [43]:
def testing(querystring):
    instance=[]
    instance = testX
    instance = instance[:250]
    instance[-1] = querystring[0]
    test_instance=instance.values
    instance_arr = preprocessing(test_instance)
    instance_data = instance_arr.reshape(instance_arr.shape[0], 100, 100, 1)
    pred=model.predict(instance_data)
    if pred[-1]>0.5:
            res=1
    else:
            res=0

    print(res)

In [44]:
# serialize model to JSON
model_json = model.to_json()
with open("xssmodel.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save("xssmodel.h5")
print("Saved model to disk")

Saved model to disk


In [46]:
testing(['architha aaa'])

0
