<a href="https://colab.research.google.com/github/MorganChidley/Final-Year-Project/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.feature_selection import mutual_info_classif
from IPython.display import display

# ========================
# 1. Load the Dataset
# ========================
data = pd.read_csv("modified_dataset1.csv")

# ========================
# 2. Define Features and Target
# ========================
X = data.drop(columns=['Label', 'URL'])
y = data['Label']

# One-hot encode 'tld' when the column is present. The later cells of this
# notebook use the same guard; without it a dataset lacking 'tld' raises a
# KeyError here.
if 'tld' in X.columns:
    X = pd.get_dummies(X, columns=['tld'], drop_first=True)

# ========================
# 3. Train-Test Split
# ========================
# Split the rows BEFORE ranking features. Mutual information is a supervised
# score, so computing it on every row would let the held-out labels influence
# which columns are kept (data leakage) and make the test metrics optimistic.
# The row partition depends only on the number of samples and random_state,
# so it is the same partition a split of the column-filtered frame would give.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ========================
# 4. Mutual Information - Feature Selection (training rows only)
# ========================
# random_state pins the estimator's internal randomness so the ranking, and
# therefore the selected columns, are reproducible between runs.
mutual_info = mutual_info_classif(X_train_full, y_train, random_state=42)

# Create a dataframe to inspect feature importance
feature_scores = pd.DataFrame({
    'Feature': X.columns,
    'Mutual_Information': mutual_info
}).sort_values(by='Mutual_Information', ascending=False)

# Select Top N Features (example: Top 20)
top_features = feature_scores['Feature'].head(20).tolist()

# Filter every view of the data down to the selected columns.
# X_selected (all rows) is what the K-fold section below consumes.
X_selected = X[top_features]
X_train = X_train_full[top_features]
X_test = X_test_full[top_features]

display(feature_scores)

# ========================
# 5. Model Pipelines
# ========================
# Each classifier gets its own StandardScaler inside a Pipeline, so the
# scaler is always fitted together with the model on the rows it is given.
models = {
    "Logistic Regression": LogisticRegression(max_iter=500, class_weight='balanced'),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced'),
    "Random Forest": RandomForestClassifier(class_weight='balanced', n_estimators=100),
    # probability=True is required because predict_proba is called on this model later
    "SVM": SVC(class_weight='balanced', probability=True),
    "MLP": MLPClassifier(
        hidden_layer_sizes=(100, 50),
        activation='relu',
        solver='adam',
        alpha=0.0001,
        learning_rate_init=0.001,
        early_stopping=True,
        max_iter=1000,
    ),
}

pipelines = {}
for model_name, model in models.items():
    pipelines[model_name] = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('classifier', model),
    ])

# ========================
# 6. Cross-Validation Results (K-Fold)
# ========================
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold accuracy for every pipeline, keyed by model name
cv_scores_by_model = {
    name: cross_val_score(pipe, X_selected, y, cv=kf, scoring='accuracy')
    for name, pipe in pipelines.items()
}

# Collapse the fold scores into one summary row per model
kfold_results = [
    {
        'Model': name,
        'Mean Accuracy': fold_scores.mean(),
        'Std Dev': fold_scores.std(),
    }
    for name, fold_scores in cv_scores_by_model.items()
]

kfold_results_df = pd.DataFrame(kfold_results)
display(kfold_results_df)

# ========================
# 7. Train, Predict, Evaluate
# ========================
# Fit every pipeline on the training split, score it on the held-out split,
# print the metrics and keep one summary record per model.
results = []
for model_name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    # Column 1 of predict_proba is the probability of the second class
    y_proba = pipeline.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Ordered (label, value) pairs drive both the printout and the record
    scores = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc,
    }

    print(f"\n==== {model_name} ====")
    for label, value in scores.items():
        print(f"{label}:", value)
    print("Confusion Matrix:\n", conf_matrix)

    results.append({'Model': model_name, **scores, 'Confusion Matrix': conf_matrix})

# ========================
# 8. Results DataFrame
# ========================
# The confusion matrix is a 2-D array, so it is left out of the flat table
results_df = pd.DataFrame(results)
results_df = results_df.drop(columns=['Confusion Matrix'])
display(results_df)

# ========================
# 9. OPTIONAL: Save Results
# ========================
for frame, csv_path in (
    (results_df, "Model_Performance_Results.csv"),
    (kfold_results_df, "KFold_CV_Results.csv"),
):
    frame.to_csv(csv_path, index=False)

print("\n✅ Finished! Results saved to CSV.")


Unnamed: 0,Feature,Mutual_Information
9,digit_count,0.067831
16,has_suspicious_words,0.055393
21,char_.,0.051824
11,digit_letter_ratio,0.04933
12,url_entropy,0.048265
0,url_length,0.036787
10,longest_digit_sequence,0.034375
20,char_/,0.028529
17,char_-,0.021707
6,has_ip_address,0.01953


Unnamed: 0,Model,Mean Accuracy,Std Dev
0,Logistic Regression,0.707555,0.009839
1,Decision Tree,0.752863,0.003495
2,Random Forest,0.794956,0.005598
3,SVM,0.76954,0.007018
4,MLP,0.796867,0.00985



==== Logistic Regression ====
Accuracy: 0.7106981416373681
Precision: 0.6781701444622793
Recall: 0.8284313725490197
F1-Score: 0.7458075904677847
ROC-AUC: 0.8111912118091316
Confusion Matrix:
 [[570 401]
 [175 845]]

==== Decision Tree ====
Accuracy: 0.7513812154696132
Precision: 0.7541142303969022
Recall: 0.7637254901960784
F1-Score: 0.7588894301022894
ROC-AUC: 0.7544991013913289
Confusion Matrix:
 [[717 254]
 [241 779]]

==== Random Forest ====
Accuracy: 0.7945755901557007
Precision: 0.8095238095238095
Recall: 0.7833333333333333
F1-Score: 0.7962132536123567
ROC-AUC: 0.8787716322368289
Confusion Matrix:
 [[783 188]
 [221 799]]

==== SVM ====
Accuracy: 0.7810145655449523
Precision: 0.7362459546925566
Recall: 0.8921568627450981
F1-Score: 0.8067375886524822
ROC-AUC: 0.8664329274449223
Confusion Matrix:
 [[645 326]
 [110 910]]

==== MLP ====
Accuracy: 0.779507785032647
Precision: 0.7591436217662801
Recall: 0.8343137254901961
F1-Score: 0.7949556282111163
ROC-AUC: 0.8663985985743422
Confusion Matrix:
 [[701 270]
 [169 851]]

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,ROC-AUC
0,Logistic Regression,0.710698,0.67817,0.828431,0.745808,0.811191
1,Decision Tree,0.751381,0.754114,0.763725,0.758889,0.754499
2,Random Forest,0.794576,0.809524,0.783333,0.796213,0.878772
3,SVM,0.781015,0.736246,0.892157,0.806738,0.866433
4,MLP,0.779508,0.759144,0.834314,0.794956,0.866399



✅ Finished! Results saved to CSV.


In [None]:
# Dataset is loaded successfully! Let's proceed with the full machine learning pipeline.

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, roc_curve, precision_recall_curve
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.feature_selection import mutual_info_classif
import matplotlib.pyplot as plt

# =============== STEP 1: Features and Target ===============
X = data.drop(columns=['Label', 'URL'])
y = data['Label']

# One-hot encode 'tld' if it exists (checking if column is present)
if 'tld' in X.columns:
    X = pd.get_dummies(X, columns=['tld'], drop_first=True)

# =============== STEP 2: Train-Test Split ===============
# Rows are split BEFORE the feature ranking. Mutual information uses the
# labels, so ranking on all rows would leak held-out labels into the choice
# of columns and inflate the test metrics. The row partition depends only on
# the sample count and random_state, so it matches a split of the
# column-filtered frame.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# =============== STEP 3: Mutual Information Feature Selection ===============
# Ranked on the training rows only; random_state makes the ranking repeatable.
mutual_info = mutual_info_classif(X_train_full, y_train, random_state=42)

# Create a dataframe to inspect feature importance
feature_scores = pd.DataFrame({
    'Feature': X.columns,
    'Mutual_Information': mutual_info
}).sort_values(by='Mutual_Information', ascending=False)

# Select Top 20 Features (adjustable)
top_features = feature_scores['Feature'].head(20).tolist()

# Apply the same column selection to every view of the data
X_selected = X[top_features]
X_train = X_train_full[top_features]
X_test = X_test_full[top_features]

# =============== STEP 4: Define Models ===============
# Keyword arguments shared by every estimator that supports class weighting
balanced = {'class_weight': 'balanced'}

models = {
    "Logistic Regression": LogisticRegression(max_iter=500, **balanced),
    "Decision Tree": DecisionTreeClassifier(**balanced),
    "Random Forest": RandomForestClassifier(n_estimators=100, **balanced),
    "SVM": SVC(probability=True, **balanced),
    "MLP": MLPClassifier(
        hidden_layer_sizes=(100, 50),
        activation='relu',
        solver='adam',
        alpha=0.0001,
        learning_rate_init=0.001,
        early_stopping=True,
        max_iter=1000,
    ),
}

# =============== STEP 5: Parameter Grids ===============
# Grids are written with bare hyperparameter names; the 'classifier__' prefix
# that routes each one to the pipeline's final step is added below.
_raw_grids = {
    "Logistic Regression": {
        'C': [0.01, 0.1, 1, 10],
        'solver': ['liblinear', 'lbfgs'],
    },
    "Decision Tree": {
        'max_depth': [5, 10, 20, None],
        'min_samples_split': [2, 5, 10],
    },
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
    },
    "SVM": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
    },
    "MLP": {
        'hidden_layer_sizes': [(50,), (100,), (100, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001, 0.01],
    },
}

param_grids = {
    name: {f"classifier__{param}": values for param, values in grid.items()}
    for name, grid in _raw_grids.items()
}

# =============== STEP 6: Hyperparameter Tuning + Evaluation ===============
# For every model: grid-search a scaler+classifier pipeline with 5-fold CV on
# the training split, keep the refitted best pipeline, then score it once on
# the held-out split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_models = {}
evaluation_results = []

for model_name in models:
    print(f"\n🔧 Tuning {model_name}...")

    grid_search = GridSearchCV(
        Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', models[model_name]),
        ]),
        param_grids[model_name],
        scoring='accuracy',
        cv=kf,
        n_jobs=-1,
        verbose=0,
    )
    grid_search.fit(X_train, y_train)

    best_models[model_name] = grid_search.best_estimator_

    # Predict & Evaluate (the search object delegates to its best estimator)
    y_pred = grid_search.predict(X_test)
    y_proba = grid_search.predict_proba(X_test)[:, 1]
    conf_matrix = confusion_matrix(y_test, y_pred)

    evaluation_results.append({
        'Model': model_name,
        'Best Parameters': grid_search.best_params_,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_proba),
    })

# =============== STEP 7: Results DataFrame ===============
evaluation_df = pd.DataFrame(evaluation_results)
display(evaluation_df)

# =============== STEP 8: Visualization ===============
# Score the held-out rows once per tuned model. Both figures reuse these
# probabilities instead of calling predict_proba a second time.
test_probabilities = {
    model_name: model.predict_proba(X_test)[:, 1]
    for model_name, model in best_models.items()
}

# ROC curve for each best model
plt.figure(figsize=(10, 6))

for model_name, y_proba in test_probabilities.items():
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    plt.plot(fpr, tpr, label=f'{model_name} (ROC-AUC: {roc_auc_score(y_test, y_proba):.2f})')

# Diagonal reference line (TPR == FPR)
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.title('ROC Curves for Best Models')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid(True)
plt.show()

# Precision-Recall Curves
plt.figure(figsize=(10, 6))

for model_name, y_proba in test_probabilities.items():
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    plt.plot(recall, precision, label=f'{model_name}')

plt.title('Precision-Recall Curves for Best Models')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)

# =============== STEP 1: Train-Test Split (if not already split) ===============
X = data.drop(columns=['Label', 'URL'])
y = data['Label']

# One-hot encode 'tld' if exists
if 'tld' in X.columns:
    X = pd.get_dummies(X, columns=['tld'], drop_first=True)

# Split the rows first. The mutual-information ranking below uses the labels,
# so it must only see training rows; otherwise held-out labels influence the
# selected columns and the test metrics become optimistic. The partition
# depends only on the sample count and random_state.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Use Top 20 Features from Mutual Information (training rows only;
# random_state keeps the ranking reproducible between runs)
mutual_info = mutual_info_classif(X_train_full, y_train, random_state=42)
feature_scores = pd.DataFrame({
    'Feature': X.columns,
    'Mutual_Information': mutual_info
}).sort_values(by='Mutual_Information', ascending=False)

top_features = feature_scores['Feature'].head(20).tolist()

# Apply the same column selection to every view of the data
X_selected = X[top_features]
X_train = X_train_full[top_features]
X_test = X_test_full[top_features]

# =============== STEP 2: Initialize Base Models ===============
# Simplified models from earlier best parameters (you can adjust!)
def _scaled(classifier):
    """Return a StandardScaler -> *classifier* pipeline."""
    return Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', classifier),
    ])


log_clf = _scaled(
    LogisticRegression(C=10, solver='liblinear', class_weight='balanced', max_iter=500)
)
dt_clf = _scaled(
    DecisionTreeClassifier(max_depth=10, min_samples_split=5, class_weight='balanced')
)
rf_clf = _scaled(
    RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=5,
                           class_weight='balanced')
)
svm_clf = _scaled(
    SVC(C=10, kernel='rbf', probability=True, class_weight='balanced')
)
mlp_clf = _scaled(
    MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', solver='adam',
                  alpha=0.0001, early_stopping=True, max_iter=1000)
)

# =============== STEP 3: Create the Voting Classifier ===============
# Soft voting combines the members' predicted probabilities, which is why
# the SVC above is built with probability=True.
voting_members = [
    ('Logistic Regression', log_clf),
    ('Decision Tree', dt_clf),
    ('Random Forest', rf_clf),
    ('SVM', svm_clf),
    ('MLP', mlp_clf),
]
ensemble = VotingClassifier(estimators=voting_members, voting='soft', n_jobs=-1)

# =============== STEP 4: Fit the Ensemble Model ===============
ensemble.fit(X_train, y_train)

# =============== STEP 5: Evaluate Ensemble ===============
y_pred = ensemble.predict(X_test)
test_proba = ensemble.predict_proba(X_test)
y_proba = test_proba[:, 1]  # probability column of the second class

# Label-based metrics all share the (y_true, y_pred) signature
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
# ROC-AUC is computed from the probabilities, not from the hard labels
roc_auc = roc_auc_score(y_test, y_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

report_lines = [
    "🔮 Ensemble Voting Classifier Performance 🔮",
    f"Accuracy     : {accuracy:.4f}",
    f"Precision    : {precision:.4f}",
    f"Recall       : {recall:.4f}",
    f"F1-Score     : {f1:.4f}",
    f"ROC-AUC      : {roc_auc:.4f}",
]
print("\n".join(report_lines))
print("\nConfusion Matrix:\n", conf_matrix)

# =============== STEP 6: Plot ROC Curve for the Ensemble ===============
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, precision_recall_curve

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_proba)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'Ensemble (ROC-AUC: {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')  # diagonal reference line
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Ensemble Voting Classifier')
ax.legend()
ax.grid(True)
plt.show()

# Precision-Recall Curve
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_proba)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall_vals, precision_vals, label='Ensemble')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve - Ensemble Voting Classifier')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.pipeline import Pipeline
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import json

# STEP 1: Prepare your data
X = data.drop(columns=['Label', 'URL'])
y = data['Label']

# One-hot encode 'tld' if it exists
if 'tld' in X.columns:
    X = pd.get_dummies(X, columns=['tld'], drop_first=True)

# Split data first. The feature ranking below is supervised, so it must only
# see training rows; ranking on all rows would leak held-out labels into the
# column choice (and into the saved feature list). The row partition depends
# only on the sample count and random_state.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature selection (Top 20 from mutual information, training rows only)
from sklearn.feature_selection import mutual_info_classif

# random_state makes the ranking, and so the persisted feature list, repeatable
mutual_info = mutual_info_classif(X_train_full, y_train, random_state=42)
feature_scores = pd.DataFrame({
    'Feature': X.columns,
    'Mutual_Information': mutual_info
}).sort_values(by='Mutual_Information', ascending=False)

top_features = feature_scores['Feature'].head(20).tolist()
print("✅ Features used for training the model:")
for i, feature in enumerate(top_features, start=1):
    print(f"{i}. {feature}")

# Apply the same column selection to every view of the data
X_selected = X[top_features]
X_train = X_train_full[top_features]
X_test = X_test_full[top_features]

# STEP 2: Define base models (pipelines with scalers)
# Bare classifiers keyed by the short name used inside the stack
base_classifiers = {
    'lr': LogisticRegression(C=10, solver='liblinear', class_weight='balanced', max_iter=500),
    'dt': DecisionTreeClassifier(max_depth=10, min_samples_split=5, class_weight='balanced'),
    'rf': RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=5,
                                 class_weight='balanced'),
    'svm': SVC(C=10, kernel='rbf', probability=True, class_weight='balanced'),
    'mlp': MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', solver='adam',
                         alpha=0.0001, early_stopping=True, max_iter=1000),
}

# Each classifier is wrapped with its own scaler
estimators = [
    (short_name, Pipeline([('scaler', StandardScaler()), ('classifier', clf)]))
    for short_name, clf in base_classifiers.items()
]

# STEP 3: Define the meta-model
meta_model = LogisticRegression()

# STEP 4: Create the StackingClassifier
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_model,
    stack_method='predict_proba',  # Use probabilities for soft classification
    passthrough=False,  # True if you want to add original features to meta-learner
    cv=5,
    n_jobs=-1,
)

# STEP 5: Train the stacked ensemble
stacking_clf.fit(X_train, y_train)

# Persist the fitted stack so it can be reloaded without retraining
model_filename = 'stacked_phishing_model.pkl'
with open(model_filename, 'wb') as model_file:
    model_file.write(pickle.dumps(stacking_clf))

print(f"✅ Model saved as {model_filename}")

# Persist the ordered feature list alongside the pickled model
with open('feature_columns.json', 'w') as columns_file:
    columns_file.write(json.dumps(top_features))

print("✅ feature_columns.json saved in Colab directory!")

# STEP 6: Evaluate the stacked model
y_pred = stacking_clf.predict(X_test)
y_proba = stacking_clf.predict_proba(X_test)[:, 1]

# Hard-label metrics first, then the probability-based ROC-AUC
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy, precision, recall, f1 = (
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    f1_score(y_test, y_pred),
)
roc_auc = roc_auc_score(y_test, y_proba)

# STEP 7: Print performance
print(
    "🔮 Stacked Ensemble Performance 🔮",
    f"Accuracy     : {accuracy:.4f}",
    f"Precision    : {precision:.4f}",
    f"Recall       : {recall:.4f}",
    f"F1-Score     : {f1:.4f}",
    f"ROC-AUC      : {roc_auc:.4f}",
    sep="\n",
)
print("\nConfusion Matrix:\n", conf_matrix)

# STEP 8: ROC Curve
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, precision_recall_curve

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.figure(figsize=(8, 6))
roc_axes = plt.gca()
roc_axes.plot(fpr, tpr, label=f'Stacked Ensemble (ROC-AUC: {roc_auc:.2f})')
roc_axes.plot([0, 1], [0, 1], linestyle='--', color='gray')  # diagonal reference line
roc_axes.set_title('ROC Curve - Stacked Ensemble')
roc_axes.set_xlabel('False Positive Rate')
roc_axes.set_ylabel('True Positive Rate')
roc_axes.legend()
roc_axes.grid(True)
plt.show()

# Precision-Recall Curve
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_proba)
plt.figure(figsize=(8, 6))
pr_axes = plt.gca()
pr_axes.plot(recall_vals, precision_vals, label='Stacked Ensemble')
pr_axes.set_title('Precision-Recall Curve - Stacked Ensemble')
pr_axes.set_xlabel('Recall')
pr_axes.set_ylabel('Precision')
pr_axes.legend()
pr_axes.grid(True)
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, f1_score, confusion_matrix

# Get prediction probabilities from the Stacked Ensemble
y_proba = stacking_clf.predict_proba(X_test)[:, 1]

# Calculate precision, recall, thresholds
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)

# F1 for each threshold; the small epsilon avoids dividing by zero when
# precision and recall are both 0
f1_numerator = 2 * (precision * recall)
f1_denominator = precision + recall + 1e-8
f1_scores = f1_numerator / f1_denominator

# Plot Precision-Recall vs Threshold.
# The curve arrays are one element longer than `thresholds`, so the last
# point of each is dropped to align them.
plt.figure(figsize=(10, 6))
for curve_label, curve in (
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1_scores),
):
    plt.plot(thresholds, curve[:-1], label=curve_label)
plt.title('Precision, Recall, F1 Score vs Threshold')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.legend()
plt.grid(True)
plt.show()

# Example: Set a custom threshold from the graph
custom_threshold = 0.6  # (you can adjust this based on the chart)

# Predict with custom threshold: 1 when the probability reaches the cut-off
above_threshold = y_proba >= custom_threshold
y_pred_custom = above_threshold.astype(int)

# Evaluate the new predictions
accuracy = accuracy_score(y_test, y_pred_custom)
precision_custom = precision_score(y_test, y_pred_custom)
recall_custom = recall_score(y_test, y_pred_custom)
f1_custom = f1_score(y_test, y_pred_custom)
conf_matrix_custom = confusion_matrix(y_test, y_pred_custom)

print(
    f"🔧 Custom Threshold = {custom_threshold}",
    f"Accuracy  : {accuracy:.4f}",
    f"Precision : {precision_custom:.4f}",
    f"Recall    : {recall_custom:.4f}",
    f"F1-Score  : {f1_custom:.4f}",
    sep="\n",
)
print("\nConfusion Matrix:\n", conf_matrix_custom)


In [None]:
# ✅ Updated Notebook Code to Include 'domain_length', 'https', and 'subdomain_count' Features

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import pickle
import json

# ✅ Step 1: Load and Prepare the Dataset
# Replace 'your_dataset.csv' with your dataset path
data = pd.read_csv('your_dataset.csv')  # Load your dataset here

# Drop unnecessary columns if needed
# data = data.drop(columns=['URL'])  # Optional

# ✅ Step 2: Feature Engineering (Making sure these features exist)
# Assume your dataset already includes these features:
# 'domain_length', 'https', 'subdomain_count'

# If not, here's an example of how you could create them:
# data['domain_length'] = data['Domain'].apply(lambda x: len(x))
# data['https'] = data['URL'].apply(lambda x: 1 if x.lower().startswith('https') else 0)
# data['subdomain_count'] = data['Domain'].apply(lambda x: x.count('.') - 1)

# ✅ Step 3: Define X and y
# errors='ignore' makes the drop tolerate an already-removed 'URL' column
# (see the optional drop above). A missing 'Label' still fails on the next line.
X = data.drop(columns=['Label', 'URL'], errors='ignore')
y = data['Label']

# ✅ Step 4: One-hot encode TLD if it exists
if 'tld' in X.columns:
    X = pd.get_dummies(X, columns=['tld'], drop_first=True)

# Features that must always be part of the training set. Fail early with a
# clear message if the dataset does not provide them, instead of a bare
# KeyError at column-selection time.
mandatory_features = ['domain_length', 'https', 'subdomain_count']
missing_mandatory = [name for name in mandatory_features if name not in X.columns]
if missing_mandatory:
    raise KeyError(
        f"Mandatory feature column(s) missing from the dataset: {missing_mandatory}"
    )

# ✅ Step 5: Train-test split (rows only)
# Split BEFORE ranking: mutual information uses the labels, so ranking on all
# rows would leak held-out labels into the feature choice.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ✅ Step 6: Rank features and add Mandatory Features to Top Features
# Compute mutual information scores on the training rows only;
# random_state keeps the ranking (and the saved feature list) reproducible.
mutual_info = mutual_info_classif(X_train_full, y_train, random_state=42)
feature_scores = pd.DataFrame({
    'Feature': X.columns,
    'Mutual_Information': mutual_info
}).sort_values(by='Mutual_Information', ascending=False)

# The ranked list contains EVERY column, so truncating it to 20 can drop the
# mandatory features when they rank low. Put them first, then fill the
# remaining slots with the best-ranked other columns.
ranked_features = feature_scores['Feature'].tolist()
optional_ranked = [name for name in ranked_features if name not in mandatory_features]

# Limit to top 20 features (or more if mandatory features increase the list)
feature_limit = max(20, len(mandatory_features))
top_features = (mandatory_features + optional_ranked)[:feature_limit]

print("✅ Features selected for training:")
for idx, feature in enumerate(top_features, start=1):
    print(f"{idx}. {feature}")

# ✅ Step 7: Prepare data for training
X_selected = X[top_features]
X_train = X_train_full[top_features]
X_test = X_test_full[top_features]

# ✅ Step 8: Define base models with pipelines
# Pipeline is not among this cell's imports; import it here so the cell also
# runs in a fresh kernel.
from sklearn.pipeline import Pipeline

# Every base learner gets its own StandardScaler so scaling is fitted
# together with the model on the rows it is given.
estimators = [
    ('lr', Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(C=10, solver='liblinear', class_weight='balanced', max_iter=500))
    ])),
    ('dt', Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', DecisionTreeClassifier(max_depth=10, min_samples_split=5, class_weight='balanced'))
    ])),
    ('rf', Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=5, class_weight='balanced'))
    ])),
    # probability=True is required because the stack uses predict_proba
    ('svm', Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', SVC(C=10, kernel='rbf', probability=True, class_weight='balanced'))
    ])),
    ('mlp', Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', alpha=0.0001, early_stopping=True, max_iter=1000))
    ]))
]

# ✅ Step 9: Define meta-model
meta_model = LogisticRegression()

# ✅ Step 10: Create and train the StackingClassifier
# The meta-model is trained on the base learners' predicted probabilities
# (stack_method='predict_proba'); passthrough=False keeps the original
# features out of the meta-model's input.
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_model,
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1,
    passthrough=False
)

stacking_clf.fit(X_train, y_train)

# ✅ Step 11: Evaluate the model
y_pred = stacking_clf.predict(X_test)
y_proba = stacking_clf.predict_proba(X_test)[:, 1]

# Hard-label metrics share the (y_true, y_pred) signature
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
# ROC-AUC is computed from the probabilities
roc_auc = roc_auc_score(y_test, y_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

summary_lines = (
    "\n🔮 Model Performance 🔮",
    f"Accuracy     : {accuracy:.4f}",
    f"Precision    : {precision:.4f}",
    f"Recall       : {recall:.4f}",
    f"F1-Score     : {f1:.4f}",
    f"ROC-AUC      : {roc_auc:.4f}",
)
for line in summary_lines:
    print(line)
print("\nConfusion Matrix:\n", conf_matrix)

# ✅ Step 12: Save the trained model and feature list
with open('stacked_phishing_model.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(stacking_clf))

with open('feature_columns.json', 'w') as columns_file:
    columns_file.write(json.dumps(top_features))

print("✅ Model and feature_columns.json saved!")

# ✅ Step 13: Plot ROC Curve
# roc_curve is not among this cell's imports; import both curve helpers here
# so the cell also runs in a fresh kernel.
from sklearn.metrics import roc_curve, precision_recall_curve

fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'Stacked Ensemble (ROC-AUC: {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Stacked Ensemble')
plt.legend()
plt.grid(True)
plt.show()

# ✅ Step 14: Plot Precision-Recall Curve
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_proba)
plt.figure(figsize=(8, 6))
plt.plot(recall_vals, precision_vals, label='Stacked Ensemble')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve - Stacked Ensemble')
plt.legend()
plt.grid(True)
plt.show()
