In [13]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack
import joblib

import os

# File paths
new_logs_csv = './Dataset/converted_sqli.csv'
model_path = './Models/CNN_model.pkl'
output_csv = './predictions/CNN_prediction.csv'
vectorizer_path = './Models/CNN_vectorizer.pkl'
scaler_path = './Models/CNN_scaler.pkl'

# Create the output directory up front: failing on a missing './predictions'
# folder only after the (slow) prediction step would throw the results away.
output_dir = os.path.dirname(output_csv)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Step 1: Load saved components
# NOTE(review): joblib.load unpickles arbitrary Python objects, so these
# artifacts must come from a trusted source.
model = joblib.load(model_path)
vectorizer = joblib.load(vectorizer_path)
scaler = joblib.load(scaler_path)

# Step 2: Load new logs
# Malformed rows are skipped, and header names are stripped of surrounding
# whitespace and of any ';' characters before columns are looked up.
new_logs = pd.read_csv(new_logs_csv, encoding='latin1', on_bad_lines='skip')
new_logs.columns = new_logs.columns.str.strip().str.replace(';', '')

# Columns fed to the scaler, in this order.
numeric_features = [
    'query_len', 'num_words_query', 'no_single_qts', 'no_double_qts', 'no_punct',
    'no_single_cmnt', 'no_mult_cmnt', 'no_space', 'no_perc', 'no_log_opt',
    'no_arith', 'no_null', 'no_hexa', 'no_alpha', 'no_digit', 'len_of_chr_char_null', 'genuine_keywords'
]

# Ensure the input actually has every column the pipeline needs, and name the
# missing ones instead of failing with a bare KeyError further down.
required_columns = ['Sentence'] + numeric_features
missing_columns = [col for col in required_columns if col not in new_logs.columns]
if missing_columns:
    raise KeyError(f"{new_logs_csv} is missing required columns: {missing_columns}")

X_text_new = new_logs['Sentence'].astype(str)  # Ensure it's a string
X_numeric_new = new_logs[numeric_features]

# Step 3: Preprocess new logs with the already-fitted vectorizer and scaler
# (transform only; nothing is re-fitted on the new data).
X_text_new_tfidf = vectorizer.transform(X_text_new)
X_numeric_new_scaled = scaler.transform(X_numeric_new)

# Densify the text features, append the scaled numeric features column-wise,
# then add a trailing axis of size 1 so the input is (rows, features, 1).
# NOTE(review): .toarray() materialises the full dense matrix; for very large
# inputs this may need to be done in batches.
X_new_combined = np.hstack([X_text_new_tfidf.toarray(), X_numeric_new_scaled])
X_new_combined = X_new_combined.reshape(X_new_combined.shape[0], X_new_combined.shape[1], 1)

# Step 4: Make predictions
predictions = model.predict(X_new_combined)
# Pick the highest-scoring column per row as the predicted label.
# NOTE(review): assumes the model emits one score per class; with a single
# sigmoid output column argmax would always return 0 - TODO confirm.
predicted_labels = np.argmax(predictions, axis=1)

# Step 5: Save predictions to CSV (one row per input row, in input order)
output = pd.DataFrame({'Prediction': predicted_labels})
output.to_csv(output_csv, index=False)

print(f"Predictions saved to {output_csv}")




[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step
Predictions saved to ./predictions/CNN_prediction.csv


In [3]:
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score , confusion_matrix

# Load the original dataset with the same parser options and header clean-up
# that the prediction cell applies to this file (encoding='latin1',
# on_bad_lines='skip', stripped/';'-free column names). Predictions are matched
# to labels purely by row position, so both reads must yield the same rows.
original_data = pd.read_csv('./Dataset/converted_sqli.csv', encoding='latin1', on_bad_lines='skip')
original_data.columns = original_data.columns.str.strip().str.replace(';', '')

# Load the predictions dataset
predictions_data = pd.read_csv('./predictions/CNN_prediction.csv')

# Extract actual labels (y_true) and predicted labels (y_pred)
y_true = original_data['Label']
y_pred = predictions_data['Prediction']

# The two files share no key column, so a row-count mismatch means the
# positional alignment is broken and every metric below would be meaningless.
if len(y_true) != len(y_pred):
    raise ValueError(
        f"Row count mismatch: {len(y_true)} labels vs {len(y_pred)} predictions; "
        "regenerate the predictions from the same input file."
    )

accuracy = accuracy_score(y_true, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 so the four-way unpacking below still
# works when one of the classes is absent from the data.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
TN, FP, FN, TP = cm.ravel()
print("Confusion Matrix:\n")
print(f"True Negative (TN): {TN}")
print(f"False Positive (FP): {FP}")
print(f"False Negative (FN): {FN}")
print(f"True Positive (TP): {TP}")

# Classification Report
print("Classification Report:\n", classification_report(y_true, y_pred))


Test Accuracy: 0.9881
Confusion Matrix:

True Negative (TN): 2969
False Positive (FP): 20
False Negative (FN): 25
True Positive (TP): 773
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      2989
           1       0.97      0.97      0.97       798

    accuracy                           0.99      3787
   macro avg       0.98      0.98      0.98      3787
weighted avg       0.99      0.99      0.99      3787

