**SQL Injection Detection using Random Forest**

Import the required packages

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer


Load the dataset ("sql.xlsx")

In [3]:
# Load the labelled queries: 'query' holds the SQL text and 'label' the integer class.
df = pd.read_excel("sql.xlsx")

# info() prints its summary and returns None, so it is called on its own line
# instead of inside a tuple (which would display a stray `None`).
df.info()

# Leave the preview as the cell's last expression so it is rendered as a table.
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   query   73 non-null     object
 1   label   73 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.3+ KB


(                                               query  label
 0                  " or pg_sleep  (  __TIME__  )  --      1
 1  create user name identified by pass123 tempora...      1
 2   AND 1  =  utl_inaddr.get_host_address   (    ...      1
 3   select * from users where id  =  '1' or @ @1 ...      1
 4   select * from users where id  =  1 or 1#"  ( ...      1
 5   select name from syscolumns where id   =     ...      1
 6  select * from users where id  =  1 +$+ or 1  =...      1
 7  1;  (  load_file  (  char  (  47,101,116,99,47...      1
 8   select * from users where id  =  '1' or ||/1 ...      1
 9   select * from users where id  =  '1' or \.<\ ...      1,
 None)

**Train a Random Forest Classifier on the Dataset**

In [4]:
# Convert every entry in 'query' to str so the vectorizer only ever sees text
df['query'] = df['query'].astype(str)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['query'], df['label'], test_size=0.2, random_state=42
)

# Fit TF-IDF on the training text only, then reuse that vocabulary for the test
# text so nothing from the held-out rows leaks into the features
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the model on the held-out rows before it gets saved and reused
test_accuracy = accuracy_score(y_test, model.predict(X_test))
print(f"Held-out accuracy: {test_accuracy:.3f} ({X_test.shape[0]} test queries)")

SQL Injection Detection Using Random Forest Output

In [5]:
# Persist the fitted classifier and its TF-IDF vectorizer, one pickle file each
for artifact_path, artifact in (('sql.pkl', model), ('tfidf_vector.pkl', vectorizer)):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

In [6]:
def sql_detect(query, model_path='sql.pkl', vectorizer_path='tfidf_vector.pkl'):
    """Classify a single query as SQL injection or safe.

    The pickled classifier and vectorizer are read from disk on every call,
    so the result always reflects the files currently saved at the given paths.

    Parameters
    ----------
    query : str
        Query text to classify. Non-string values are converted with
        ``str()``, mirroring the ``astype(str)`` coercion applied to the
        'query' column before training.
    model_path : str, optional
        Path of the pickled classifier. Defaults to 'sql.pkl'.
    vectorizer_path : str, optional
        Path of the pickled vectorizer. Defaults to 'tfidf_vector.pkl'.

    Returns
    -------
    str
        "SQL Injection Detected" when the first prediction equals 1,
        otherwise "Safe".

    Raises
    ------
    FileNotFoundError
        If either pickle file does not exist.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only point these paths at artifacts written by this notebook.
    with open(model_path, 'rb') as model_file:
        load_model = pickle.load(model_file)

    with open(vectorizer_path, 'rb') as vectorizer_file:
        load_vectorizer = pickle.load(vectorizer_file)

    # Training saw str-coerced queries, so apply the same coercion here;
    # otherwise a non-string input fails inside the vectorizer.
    query_text = str(query)

    # transform() expects an iterable of documents, hence the one-element list
    query_tfidf = load_vectorizer.transform([query_text])

    prediction = load_model.predict(query_tfidf)
    return "SQL Injection Detected" if prediction[0] == 1 else "Safe"

# Smoke-test the saved artifacts with a classic tautology-style injection string
sample_query = "? or 1  =  1 --"
print(sql_detect(sample_query))

SQL Injection Detected
