In [1]:
# AI-Powered Phishing URL Detector
# Notebook 3: Model Development

# ## 3.1 Import Libraries
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# ## 3.2 Load Processed Data
# Read back the train/test splits produced by the previous notebook:
# feature matrices are read with pandas (CSV), label arrays with NumPy (.npy).

processed_data_path = '../data/processed/'
models_path = '../models/'
# Create the models directory up front; exist_ok makes re-running harmless.
os.makedirs(models_path, exist_ok=True)

# (file name, reader) pairs, listed in the order the four variables are
# unpacked below.
_split_readers = [
    ('X_train.csv', pd.read_csv),
    ('X_test.csv', pd.read_csv),
    ('y_train.npy', np.load),
    ('y_test.npy', np.load),
]

try:
    print("Loading processed data...")
    X_train, X_test, y_train, y_test = [
        reader(os.path.join(processed_data_path, file_name))
        for file_name, reader in _split_readers
    ]
except FileNotFoundError:
    print("Error: Processed data files not found. Please run Notebook 2 first.")
    # Leave None sentinels behind so the training section can detect the
    # failed load and skip itself instead of raising a NameError.
    X_train = X_test = y_train = y_test = None
else:
    print("Data loaded successfully.")
    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")


# ## 3.3 Model Training and Evaluation
# Fit each candidate classifier on the training split, score it on the test
# split, and collect the metrics (plus the fitted estimator) in `results`.

# Always start from an empty dict. In a live notebook kernel a `results`
# left over from an earlier run would otherwise survive a later run in which
# the data failed to load, and the comparison step would then report and
# pickle a stale model. An empty dict is falsy, so the existing
# "'results' in locals() and results" check still skips cleanly.
results = {}

if X_train is not None:
    # Candidate models; fixed random_state keeps runs reproducible.
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "Support Vector Machine": SVC(kernel='linear', random_state=42)
    }

    # Train and evaluate each model in turn.
    for name, model in models.items():
        print(f"--- Training {name} ---")

        # Fit on the training split only.
        model.fit(X_train, y_train)

        # Predict on the held-out test split.
        y_pred = model.predict(X_test)

        # Score the predictions against the true test labels.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # Keep the metrics together with the fitted estimator so the best
        # one can be selected and saved later without retraining.
        results[name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1,
            "Model": model
        }

        print(f"--- Evaluation for {name} ---")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1:.4f}\n")

        print(f"Classification Report for {name}:\n")
        # NOTE(review): target_names assumes the sorted label values map to
        # 'legitimate' first and 'phishing' second — confirm against the
        # label encoding used in Notebook 2.
        print(classification_report(y_test, y_pred, target_names=['legitimate', 'phishing']))
        print("-" * 50 + "\n")


# ## 3.4 Compare Models and Save the Best One
# Pick the model with the highest F1-Score, print a summary of all models,
# and pickle the winner into the models directory.

# `results` may be missing or empty when training was skipped, so look the
# name up defensively rather than referencing it directly.
if not locals().get('results'):
    print("Model training was not completed. Skipping model comparison and saving.")
else:
    # Rank by F1-Score; max() returns the first entry when scores tie.
    best_model_name, best_model_performance = max(
        results.items(), key=lambda entry: entry[1]['F1-Score']
    )
    best_model_object = best_model_performance['Model']

    print("--- Model Comparison ---")
    for model_name, scores in results.items():
        print(f"{model_name}:")
        print(f"  Accuracy: {scores['Accuracy']:.4f}")
        print(f"  F1-Score: {scores['F1-Score']:.4f}")

    print("\n--- Best Model ---")
    print(f"The best performing model is: {best_model_name}")
    print(f"  - Accuracy: {best_model_performance['Accuracy']:.4f}")
    print(f"  - Precision: {best_model_performance['Precision']:.4f}")
    print(f"  - Recall: {best_model_performance['Recall']:.4f}")
    print(f"  - F1-Score: {best_model_performance['F1-Score']:.4f}")

    # Serialise the fitted estimator with pickle.
    model_filename = 'best_model.pkl'
    model_filepath = os.path.join(models_path, model_filename)

    with open(model_filepath, 'wb') as model_file:
        pickle.dump(best_model_object, model_file)

    print(f"\nBest model ({best_model_name}) has been saved to: {model_filepath}")


Loading processed data...
Data loaded successfully.
X_train shape: (9144, 13)
X_test shape: (2286, 13)
--- Training Logistic Regression ---
--- Evaluation for Logistic Regression ---
Accuracy: 0.7410
Precision: 0.7683
Recall: 0.6903
F1-Score: 0.7272

Classification Report for Logistic Regression:

              precision    recall  f1-score   support

  legitimate       0.72      0.79      0.75      1143
    phishing       0.77      0.69      0.73      1143

    accuracy                           0.74      2286
   macro avg       0.74      0.74      0.74      2286
weighted avg       0.74      0.74      0.74      2286

--------------------------------------------------

--- Training Random Forest ---
--- Evaluation for Random Forest ---
Accuracy: 0.8469
Precision: 0.8451
Recall: 0.8495
F1-Score: 0.8473

Classification Report for Random Forest:

              precision    recall  f1-score   support

  legitimate       0.85      0.84      0.85      1143
    phishing       0.85      0.85  