In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.models import model_from_json

import pickle

In [12]:
def load_csv(path="sqli.csv"):
    """Parse the SQL-injection dataset into a two-column DataFrame.

    Every non-blank line is split at its LAST comma into a sentence part and
    a label part, so commas inside the sentence survive. The first character
    of the sentence part and the last character of the label part are
    dropped (presumably characters wrapping each record -- TODO confirm
    against the data file). The first parsed row is treated as the header
    and discarded.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to "sqli.csv".

    Returns
    -------
    pandas.DataFrame
        Columns 'Sentence' and 'Label', both still holding strings. The same
        frame is also bound to the module-level ``sqlidf`` so cells that rely
        on the global keep working.

    Raises
    ------
    ValueError
        If a non-blank line contains no comma to separate sentence and label.
    """
    global sqlidf
    rows = []
    with open(path, 'r', encoding='utf-8') as f:
        for lineno, line in enumerate(f, start=1):
            record = line.rstrip('\n')
            # Blank / whitespace-only lines carry no record; skip them
            # instead of failing on an empty split result.
            if not record.strip():
                continue
            parts = record.rsplit(',', maxsplit=1)
            if len(parts) != 2:
                raise ValueError(
                    "{0}:{1}: expected '<sentence>,<label>', got {2!r}".format(
                        path, lineno, record))
            sentence, label = parts
            rows.append([sentence[1:], label[:-1]])

    # The first parsed row is the header line, not data.
    sqlidf = pd.DataFrame(rows[1:], columns=['Sentence', 'Label'])
    return sqlidf
    
# Populate the module-level `sqlidf` DataFrame that the following cells read.
load_csv()

In [13]:
# Cast both columns in one pass: query text stays str, labels become int.
sqlidf = sqlidf.astype({'Sentence': str, 'Label': int})

In [14]:
# Features are the raw query strings; targets are the integer labels.
X = sqlidf['Sentence']
y = sqlidf['Label']
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and therefore every metric reported further down) reproducible across
# kernel restarts; stratify=y keeps the class ratio equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [15]:
#X_test.to_csv("xtest.csv")

In [16]:
# Learn the bag-of-words vocabulary from the training split only, then encode
# both splits against it; .toarray() converts each result to a dense array.
vectorizer = CountVectorizer()
posts = vectorizer.fit_transform(X_train).toarray()
test_posts = vectorizer.transform(X_test).toarray()

In [17]:
# Sanity check: one encoded training row, then the overall matrix shape.
print(posts[5], posts.shape, sep="\n")

[0 0 0 ... 0 0 0]
(3360, 8781)


In [18]:
# One input feature per term in the fitted vocabulary.
input_dim = len(vectorizer.vocabulary_)

# Feed-forward binary classifier: three Dense layers, then batch normalisation
# and dropout, ending in a single sigmoid output unit.
model = Sequential([
    layers.Dense(20, input_dim=input_dim, activation='relu'),
    layers.Dense(10, activation='tanh'),
    layers.Dense(1024, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid'),
])

In [19]:
# Vocabulary size, which is also the width of the model's input.
print(input_dim)

8781


In [20]:
# Binary cross-entropy matches the single sigmoid output unit; accuracy is the
# metric reported during training.
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
# Print the layer-by-layer architecture with parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 20)                175640    
_________________________________________________________________
dense_1 (Dense)              (None, 10)                210       
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              11264     
_________________________________________________________________
batch_normalization (BatchNo (None, 1024)              4096      
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 1025      
Total params: 192,235
Trainable params: 190,187
Non-trainable params: 2,048
______________________________________________

In [21]:
# Train for 10 epochs; `classifier_nn` holds whatever model.fit() returns.
# NOTE(review): the held-out test split is also passed as validation data, so
# the test metrics computed later are only unbiased as long as nothing is
# tuned against these per-epoch validation numbers.
classifier_nn = model.fit(posts,y_train,
                    epochs=10,
                    verbose=True,
                    validation_data=(test_posts, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
# Persist the fitted vocabulary so the same encoding can be rebuilt at
# inference time. The context manager guarantees the file is flushed and
# closed even if pickling raises.
with open("dictionary.pickle", 'wb') as vocab_file:
    pickle.dump(vectorizer.vocabulary_, vocab_file)

In [23]:
# Raw model outputs for the held-out test rows (sigmoid activations, not yet
# converted to 0/1 labels).
pred=model.predict(test_posts)

In [24]:
# Convert the sigmoid outputs into hard 0/1 labels with a 0.5 cut-off. The
# comparison is vectorised (no per-row Python loop) and cast back to the
# array's own dtype, so `pred` keeps its shape and numeric type.
pred = (pred > 0.5).astype(pred.dtype)

In [25]:
def accuracy_function(tp,tn,fp,fn):
    """Return the fraction of samples that were classified correctly.

    Parameters
    ----------
    tp, tn, fp, fn : int
        Counts of true positives, true negatives, false positives and
        false negatives.

    Returns
    -------
    float
        (tp + tn) / (tp + tn + fp + fn), or 0.0 when all counts are zero
        (no samples), instead of raising ZeroDivisionError.
    """
    total = tp + tn + fp + fn
    if total == 0:
        return 0.0
    return (tp + tn) / total


def precision_function(tp,fp):
    """Return the share of predicted positives that are truly positive.

    Parameters
    ----------
    tp, fp : int
        Counts of true positives and false positives.

    Returns
    -------
    float
        tp / (tp + fp), or 0.0 when nothing was predicted positive
        (tp + fp == 0), instead of raising ZeroDivisionError.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive


def recall_function(tp,fn):
    """Return the share of actual positives that were detected.

    Parameters
    ----------
    tp, fn : int
        Counts of true positives and false negatives.

    Returns
    -------
    float
        tp / (tp + fn), or 0.0 when there are no actual positives
        (tp + fn == 0), instead of raising ZeroDivisionError.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive


def confusion_matrix(truth,predicted):
    """Tally binary classification outcomes and derive summary metrics.

    Parameters
    ----------
    truth : iterable
        Ground-truth labels; only values equal to 0 or 1 are counted.
    predicted : iterable
        Predicted labels, paired element-wise with ``truth``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, matrix)`` where ``matrix`` is
        ``[[true_negative, false_negative], [false_positive, true_positive]]``,
        i.e. the outer index is the predicted label and the inner index is
        the true label.
    """
    tp = tn = fp = fn = 0

    for actual, guess in zip(truth, predicted):
        if actual == 1:
            if guess == actual:
                tp += 1
            else:
                fn += 1
        elif actual == 0:
            if guess == actual:
                tn += 1
            else:
                fp += 1
        # Any other truth value falls through and is not counted.

    matrix = [[tn, fn], [fp, tp]]

    return (
        accuracy_function(tp, tn, fp, fn),
        precision_function(tp, fp),
        recall_function(tp, fn),
        matrix,
    )

In [26]:
# Score the thresholded predictions against the held-out labels and report.
eval_results = confusion_matrix(y_test, pred)
accuracy, precision, recall, matrix = eval_results
report_template = " Accuracy : {0} \n Precision : {1} \n Recall : {2} \n Confusion matrix: {3}"
print(report_template.format(*eval_results))

 Accuracy : 0.9773809523809524 
 Precision : 0.9953271028037384 
 Recall : 0.922077922077922 
 Confusion matrix: [[608, 18], [1, 213]]


# Testing

In [27]:
# Persist the architecture as JSON; the weights are written separately below.
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# Persist the trained weights to model.h5 alongside the JSON architecture file.
model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk


In [28]:
# Rebuild the network from its JSON architecture. The context manager
# guarantees the file handle is closed even if read() raises.
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# Restore the trained weights into the freshly built model.
loaded_model.load_weights("model.h5")
print("Loaded model from disk")
 
# # evaluate loaded model on test data
# loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
# score = loaded_model.evaluate(posts, y_train, verbose=0)
# print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

Loaded model from disk


In [29]:
import sklearn
def testing1(querystring):
    """Classify a single query string with the reloaded model.

    Parameters
    ----------
    querystring : str or sequence of str
        The query to score. When a sequence is given only its first element
        is used (the existing calling convention, e.g.
        ``testing1(["105 OR 1=1"])``); a bare string is scored as-is.

    Returns
    -------
    int
        1 if the model output for the query is above 0.5, otherwise 0.
    """
    query = querystring if isinstance(querystring, str) else querystring[0]

    # NOTE(security): pickle.load can execute arbitrary code from the file;
    # only load a dictionary.pickle that this notebook produced itself.
    with open("dictionary.pickle", 'rb') as vocab_file:
        vocabulary_to_load = pickle.load(vocab_file)

    # A CountVectorizer constructed with a fixed vocabulary can transform
    # without being fitted, so no private validation call is needed.
    loaded_vectorizer = CountVectorizer(vocabulary=vocabulary_to_load)

    # Encode just this one query in a fresh list: the shared X_test split is
    # never modified, and only a single row is vectorised and scored.
    instance_posts = loaded_vectorizer.transform([query]).toarray()

    pred = loaded_model.predict(instance_posts)

    return 1 if pred[0] > 0.5 else 0

In [30]:
import time
# Time one end-to-end classification of a tautology-style injection string.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; wall-clock time.time() can jump if the system clock changes.
start = time.perf_counter()
print(testing1(["105 OR 1=1"]))
stop = time.perf_counter()
print("time = ",stop-start)



1
time =  0.1441035270690918


In [33]:
# # load and evaluate a saved model
# from numpy import loadtxt
# from tensorflow.keras.models import load_model
 
# # load model
# model = load_model('model.h5')
# # summarize model.
# model.summary()
# # load dataset