In [4]:
from pathlib import Path

import joblib
import pandas as pd
from scipy.sparse import hstack

# Purpose of this cell: load the persisted model, vectorizer and scaler, rebuild
# the feature matrix for the logs CSV, predict a label per row, and write the
# predictions to disk.

# File paths
new_logs_csv = './Dataset/converted_sqli.csv'  # Path to the new logs CSV file
model_path = './Models/logistic_regression_model.pkl'      # Path to the trained model (.pkl file)
output_csv = './predictions/logistic_regression_prediction.csv'  # Path to save predictions
vectorizer_path = './Models/logistic_regression_vectorizer.pkl'  # Path to the saved text vectorizer
scaler_path = './Models/logistic_regression_scaler.pkl'          # Path to the saved numeric scaler

# Step 1: Load the saved components
# NOTE(security): joblib.load unpickles its input and can therefore execute
# arbitrary code. Only load artifacts that come from a trusted source.
model = joblib.load(model_path)
vectorizer = joblib.load(vectorizer_path)
scaler = joblib.load(scaler_path)

# Step 2: Load new logs
# NOTE: on_bad_lines='skip' silently drops malformed rows, so the prediction
# file written below has one row per *parsed* row. Anything that later compares
# these predictions with the source CSV must read it with the same options.
new_logs = pd.read_csv(new_logs_csv, encoding='latin1', on_bad_lines='skip')
# Normalize header names: trim surrounding whitespace and drop stray ';'.
new_logs.columns = new_logs.columns.str.strip().str.replace(';', '')

# Ensure columns match the expected ones
numeric_features = [
    'query_len', 'num_words_query', 'no_single_qts', 'no_double_qts', 'no_punct',
    'no_single_cmnt', 'no_mult_cmnt', 'no_space', 'no_perc', 'no_log_opt',
    'no_arith', 'no_null', 'no_hexa', 'no_alpha', 'no_digit', 'len_of_chr_char_null', 'genuine_keywords'
]

# Fail early with an actionable message instead of a bare pandas KeyError.
required_columns = ['Sentence', *numeric_features]
missing_columns = [col for col in required_columns if col not in new_logs.columns]
if missing_columns:
    raise KeyError(
        f"{new_logs_csv} is missing required column(s): {missing_columns}. "
        f"Found columns: {list(new_logs.columns)}"
    )

X_text_new = new_logs['Sentence']
X_numeric_new = new_logs[numeric_features]

# Step 3: Preprocess new logs
# Transform text data
X_text_new_tfidf = vectorizer.transform(X_text_new)

# Scale numeric features
X_numeric_new_scaled = scaler.transform(X_numeric_new)

# Combine text and numeric features (text columns first, then numeric columns)
X_new_combined = hstack([X_text_new_tfidf, X_numeric_new_scaled])

# Step 4: Make predictions
predictions = model.predict(X_new_combined)

# Step 5: Save predictions to a CSV file
# DataFrame.to_csv does not create missing directories, so create the target
# directory first; exist_ok=True keeps re-runs of this cell idempotent.
Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
output = pd.DataFrame({'Prediction': predictions})
output.to_csv(output_csv, index=False)

print(f"Predictions saved to {output_csv}")

Predictions saved to ./predictions/logistic_regression_prediction.csv


In [1]:
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score , confusion_matrix

# Purpose of this cell: compare the saved predictions with the ground-truth
# 'Label' column of the source CSV and print accuracy, the confusion-matrix
# cells, and the per-class classification report.

# Load the original dataset.
# The prediction cell reads this same file with encoding='latin1' and
# on_bad_lines='skip' and then normalizes the header names. Use identical
# options here so both frames contain exactly the same rows in the same order;
# otherwise a skipped malformed row would shift every later label by one.
original_data = pd.read_csv('./Dataset/converted_sqli.csv', encoding='latin1', on_bad_lines='skip')
original_data.columns = original_data.columns.str.strip().str.replace(';', '')

# Load the predictions dataset
predictions_data = pd.read_csv('./predictions/logistic_regression_prediction.csv')

# The two files are matched purely by row position, so they must have the same
# number of rows. Fail loudly rather than score misaligned labels.
if len(original_data) != len(predictions_data):
    raise ValueError(
        f"Row count mismatch: {len(original_data)} labelled rows vs "
        f"{len(predictions_data)} predictions. Regenerate the predictions from "
        "the current dataset before evaluating."
    )

# Extract actual labels (y_true) and predicted labels (y_pred)
y_true = original_data['Label']
y_pred = predictions_data['Prediction']

accuracy = accuracy_score(y_true, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
# Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 even if one class is absent from both
# y_true and y_pred; without it ravel() could yield a single value and the
# four-way unpacking below would raise.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
TN, FP, FN, TP = cm.ravel()
print("Confusion Matrix:\n")
print(f"True Negative (TN): {TN}")
print(f"False Positive (FP): {FP}")
print(f"False Negative (FN): {FN}")
print(f"True Positive (TP): {TP}")
# Classification Report
print("Classification Report:\n", classification_report(y_true, y_pred))

Test Accuracy: 0.9723
Confusion Matrix:

True Negative (TN): 2956
False Positive (FP): 33
False Negative (FN): 72
True Positive (TP): 726
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98      2989
           1       0.96      0.91      0.93       798

    accuracy                           0.97      3787
   macro avg       0.97      0.95      0.96      3787
weighted avg       0.97      0.97      0.97      3787

