In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn import tree
import tensorflow as tf
from tensorflow.keras.utils import plot_model # type: ignore
from tensorflow.keras import models, layers # type: ignore
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')


df = pd.read_csv("./SQL_Dataset.csv")
print("Data Shape:", df.shape)
df = df.sample(frac = 1)
print(df.head(10))
X = df['Query']
y = df['Label']

Data Shape: (30919, 2)
                                                   Query  Label
15622                                          chapurrar      0
4165   1' where 8837  =  8837 and 5556  =    (  selec...      1
19278                                               dido      0
363     select * from users where id  =  1 or ";[" or...      1
25702   SELECT * FROM broad ORDER BY took ASC, happen...      0
13463                                           claudine      0
10262  1" where 7232  =  7232   (  select   (  case w...      1
28942  SELECT * FROM tribe WHERE ought = 'possible'  ...      0
22016             SELECT COUNT ( activityID ) FROM ready      0
29252                             SELECT * FROM creature      0


In [48]:
import nltk
nltk.download('stopwords')
vectorizer = CountVectorizer(min_df = 2, max_df = 0.8, stop_words = stopwords.words('english'))
X = vectorizer.fit_transform(X.values.astype('U')).toarray()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/otterilyu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(27827, 6594) (27827,)
(3092, 6594) (3092,)


In [53]:
model = models.Sequential()
model.add(layers.Conv1D(32, 1, activation = 'relu', input_shape = (1,6594)))
model.add(layers.Conv1D(32, 1, activation = 'relu'))
model.add(layers.Flatten())
model.add(layers.Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam', loss = tf.keras.losses.BinaryCrossentropy(), metrics = ['accuracy'])
model.summary()

# model = models.Sequential()
# # First Convolutional Layer
# model.add(layers.Conv1D(2048, 1, activation='relu', input_shape=(1, 6594)))
# model.add(layers.BatchNormalization())
# model.add(layers.MaxPooling1D(1, strides=4))

# # Second Convolutional Layer
# model.add(layers.Conv1D(2048, 1, padding='same', activation='relu'))
# model.add(layers.BatchNormalization())
# model.add(layers.MaxPooling1D(1, strides=4))

# # Other Convolutional Layers
# model.add(layers.Conv1D(1024, 1, padding='same', activation='relu'))
# model.add(layers.Conv1D(512, 1, padding='same', activation='relu'))
# model.add(layers.Conv1D(256, 1, padding='same', activation='relu'))
# model.add(layers.Conv1D(128, 1, padding='same', activation='relu'))
# model.add(layers.Conv1D(64, 1, padding='same', activation='relu'))
# model.add(layers.Conv1D(32, 1, padding='same', activation='relu'))

# # Flattening the outputs from the convolutional layers to feed into the dense layers
# model.add(layers.Flatten())

# # First Dense Layer
# model.add(layers.Dense(16, activation='relu'))
# model.add(layers.Dropout(0.5))

# # Output Layer
# model.add(layers.Dense(1, activation='sigmoid'))

# # Compile the model
# model.compile(optimizer='adam',
#               loss=tf.keras.losses.BinaryCrossentropy(),
#               metrics=['accuracy'])

# # Model Summary
# model.summary()

X_train1 = X_train.reshape(-1, 1, 6594)
X_test1 = X_test.reshape(-1, 1, 6594)

history = model.fit(X_train1, y_train, epochs = 10, validation_data = (X_test1, y_test))

Epoch 1/10
[1m870/870[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8764 - loss: 0.3073 - val_accuracy: 0.9622 - val_loss: 0.1058
Epoch 2/10
[1m870/870[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9789 - loss: 0.0674 - val_accuracy: 0.9628 - val_loss: 0.1210
Epoch 3/10
[1m870/870[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9838 - loss: 0.0545 - val_accuracy: 0.9631 - val_loss: 0.1237
Epoch 4/10
[1m870/870[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9843 - loss: 0.0536 - val_accuracy: 0.9631 - val_loss: 0.1294
Epoch 5/10
[1m870/870[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9839 - loss: 0.0525 - val_accuracy: 0.9622 - val_loss: 0.1356
Epoch 6/10
[1m870/870[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9839 - loss: 0.0525 - val_accuracy: 0.9612 - val_loss: 0.1434
Epoch 7/10
[1m870/870[0m 

In [54]:
y_pred = model.predict(X_test1).flatten()
# y_pred1 = [1 if x>-0.5 else 0 for x in y_pred]
y_pred = np.round(y_pred)
print(f"Accuracy of CNN on test set : {accuracy_score(y_pred, y_test)}")
print(f"F1 Score of CNN on test set : {f1_score(y_pred, y_test)}")

[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Accuracy of CNN on test set : 0.9605433376455369
F1 Score of CNN on test set : 0.9446460980036298


In [55]:
new_query = ["SELECT * FROM messages WHERE name='Mason' AND password='123456'"]
new_query = vectorizer.transform(new_query).toarray()
new_query = new_query.reshape(-1, 1, 6594)
y_pred = model.predict(new_query).flatten()
print(y_pred)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[0.9998115]


In [56]:
new_query = ["SELECT * FROM messages WHERE name='' OR 'a'='a';--' AND password='123456qwer'"]
new_query = vectorizer.transform(new_query).toarray()
new_query = new_query.reshape(-1, 1, 6594)
y_pred = model.predict(new_query).flatten()
print(y_pred)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[0.9998115]
