In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Conv1D, MaxPooling1D, GRU, Flatten
from tensorflow.keras.callbacks import EarlyStopping
import seaborn as sns
import os

# 1. Load and Prepare Data
def load_and_prepare_data(file_path):
    """Load the feature CSV, validate its labels, and return scaled splits.

    Every column other than 'url' and 'label' is used as a feature. The data
    is split 80/20 (stratified on the label, ``random_state=42``) *before*
    scaling, and the ``StandardScaler`` is fitted on the training rows only,
    so that test-set statistics never leak into training.

    Args:
        file_path: Path to a CSV file with a binary 'label' column
            (1 = phishing, 0 = legitimate) plus the feature columns.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, scaler)``. The feature
        arrays are standardized with ``scaler``, which was fitted on
        ``X_train`` only.

    Raises:
        ValueError: If the file has no rows, or if 'label' contains values
            other than 0 or 1.
    """
    df = pd.read_csv(file_path)

    # Fail early with a clear message instead of dividing by zero below.
    if df.empty:
        raise ValueError(f"No rows found in {file_path}")

    # Check label distribution
    print("Label Distribution:")
    label_counts = df['label'].value_counts()
    print(label_counts)
    print(f"Percentage of Phishing (1): {label_counts.get(1, 0) / len(df) * 100:.2f}%")
    print(f"Percentage of Legitimate (0): {label_counts.get(0, 0) / len(df) * 100:.2f}%")

    if not df['label'].isin([0, 1]).all():
        raise ValueError("Labels contain values other than 0 or 1")

    feature_columns = [col for col in df.columns if col not in ['url', 'label']]
    X = df[feature_columns].values
    y = df['label'].values

    # Split first so the held-out rows stay unseen by every fitted step.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Normalize features: learn mean/std from the training rows only, then
    # apply the same transformation to the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test, scaler

# 2. Model Definitions
def create_lstm_model(input_dim):
    """Build and compile a two-layer stacked LSTM binary classifier.

    Args:
        input_dim: Number of features; the network consumes inputs of shape
            ``(input_dim, 1)``.

    Returns:
        Tuple ``(model, name)`` with the compiled Keras model and the
        display name "LSTM Model".
    """
    layer_stack = [
        # First recurrent layer emits the full sequence for the second one.
        LSTM(64, input_shape=(input_dim, 1), return_sequences=True),
        LSTM(32),
        Dense(32, activation='relu'),
        # Single sigmoid unit for the binary output.
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    display_name = "LSTM Model"
    return network, display_name

def create_cnn_model(input_dim):
    """Build and compile a 1-D convolutional binary classifier.

    Args:
        input_dim: Number of features; the network consumes inputs of shape
            ``(input_dim, 1)``.

    Returns:
        Tuple ``(model, name)`` with the compiled Keras model and the
        display name "CNN Model".
    """
    layer_stack = [
        # Two conv + pool stages; 'same' padding keeps the length until pooling.
        Conv1D(64, 3, activation='relu', input_shape=(input_dim, 1), padding='same'),
        MaxPooling1D(2),
        Conv1D(32, 3, activation='relu', padding='same'),
        MaxPooling1D(2),
        # Collapse the feature maps into a vector for the dense head.
        Flatten(),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    display_name = "CNN Model"
    return network, display_name

def create_gru_model(input_dim):
    """Build and compile a two-layer stacked GRU binary classifier.

    Args:
        input_dim: Number of features; the network consumes inputs of shape
            ``(input_dim, 1)``.

    Returns:
        Tuple ``(model, name)`` with the compiled Keras model and the
        display name "GRU Model".
    """
    layer_stack = [
        # First recurrent layer emits the full sequence for the second one.
        GRU(64, input_shape=(input_dim, 1), return_sequences=True),
        GRU(32),
        Dense(32, activation='relu'),
        # Single sigmoid unit for the binary output.
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    display_name = "GRU Model"
    return network, display_name

# 3. Calculate False Positive Rate
def calculate_fpr(y_true, y_pred):
    """Calculate the False Positive Rate, FP / (FP + TN), for 0/1 labels.

    Label 0 is the negative class and label 1 the positive class. The counts
    are taken directly from the arrays, so the result is well defined even
    when only one class is present (a case where unpacking a confusion
    matrix into four values would fail).

    Args:
        y_true: Ground-truth labels (0 or 1); any array-like, flattened
            before use.
        y_pred: Predicted labels (0 or 1); any array-like. Column vectors
            such as an ``(n, 1)`` model output are flattened before use.

    Returns:
        The false positive rate as a float, or 0.0 when ``y_true`` contains
        no negative samples.

    Raises:
        ValueError: If the two inputs do not hold the same number of labels.
    """
    truth = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()
    if truth.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {truth.size} and {predicted.size}"
        )

    is_negative = truth == 0
    fp = int(np.count_nonzero(is_negative & (predicted == 1)))
    tn = int(np.count_nonzero(is_negative & (predicted == 0)))

    negatives = fp + tn
    return fp / negatives if negatives > 0 else 0.0

# 4. Visualization Function
def _plot_validation_curves(histories, model_names, metric_key, metric_label, out_path):
    """Draw one validation-metric curve per model on a shared figure and save it."""
    plt.figure(figsize=(12, 8))
    for history, name in zip(histories, model_names):
        plt.plot(history.history[metric_key], label=f'{name} Val {metric_label}')
    plt.title(f'Validation {metric_label} Comparison')
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend()
    plt.grid(True)
    plt.savefig(out_path)
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close()


def plot_comparison_metrics(histories, model_names, fprs, X_test, y_test, models, output_dir='plots'):
    """Plot comparison metrics for all models and save them as PNG files.

    Writes ``val_accuracy_comparison.png``, ``val_loss_comparison.png`` and
    one ``confusion_matrix_<model name>.png`` per model into ``output_dir``
    (created if missing).

    Args:
        histories: Keras ``History`` objects, one per model, holding
            'val_accuracy' and 'val_loss' series.
        model_names: Display names, aligned with ``histories`` and ``models``.
        fprs: False positive rates per model. Currently not plotted; the
            parameter is kept so existing callers keep working.
        X_test: Test inputs, already shaped for ``model.predict``.
        y_test: True 0/1 labels for ``X_test``.
        models: Trained models, aligned with ``model_names``.
        output_dir: Directory that receives the PNG files.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Validation accuracy and loss share the same layout; only the metric differs.
    _plot_validation_curves(
        histories, model_names, 'val_accuracy', 'Accuracy',
        os.path.join(output_dir, 'val_accuracy_comparison.png'),
    )
    _plot_validation_curves(
        histories, model_names, 'val_loss', 'Loss',
        os.path.join(output_dir, 'val_loss_comparison.png'),
    )

    # Plot confusion matrices for all models
    for model, name in zip(models, model_names):
        # Threshold the sigmoid output at 0.5 to obtain hard 0/1 predictions.
        y_pred = (model.predict(X_test) > 0.5).astype(int)
        # Pin the label set so the matrix is always 2x2, even if one class
        # never appears in y_test or in the predictions.
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix - {name}')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        file_name = f'confusion_matrix_{name.lower().replace(" ", "_")}.png'
        plt.savefig(os.path.join(output_dir, file_name))
        plt.close()

# 5. Main Training Function
def train_and_compare_models(features_file, epochs=15):
    """Train the LSTM, CNN and GRU models, compare them, and save the best one.

    Each model is trained on the same split with early stopping on
    validation loss and a class weight that up-weights the positive class.
    The model with the highest validation accuracy is saved under
    ``./models/`` and the fitted scaler is saved to ``scaler.npy``.
    Comparison plots are written by ``plot_comparison_metrics``.

    Args:
        features_file: Path to the CSV passed to ``load_and_prepare_data``.
        epochs: Maximum number of training epochs per model.

    Returns:
        Tuple ``(models, histories, results)``: the trained models, their
        Keras ``History`` objects, and one dict per model with the keys
        'name', 'test_accuracy', 'test_loss', 'val_accuracy' and 'fpr'.

    Raises:
        ValueError: If the training split contains no positive (label 1)
            samples, which would make the class weight undefined.
    """
    print("Loading and preparing data...")
    X_train, X_test, y_train, y_test, scaler = load_and_prepare_data(features_file)

    # Create the output directory before the (long) training runs so a
    # missing or unwritable directory fails immediately, not after training.
    os.makedirs("./models", exist_ok=True)

    # Reshape for deep learning models: every model is declared with
    # input_shape=(input_dim, 1), so add a trailing axis of size 1.
    X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

    # Initialize models
    model_creators = [create_lstm_model, create_cnn_model, create_gru_model]
    models = []
    model_names = []
    histories = []
    results = []

    n_positive = int(np.sum(y_train))
    if n_positive == 0:
        raise ValueError("Training data contains no positive (label 1) samples")
    # Adjusted weight for imbalance
    class_weights = {0: 1.0, 1: len(y_train) / (3 * n_positive)}

    # Train each model
    for creator in model_creators:
        model, name = creator(X_train.shape[1])
        print(f"\nTraining {name}")
        model.summary()

        # Use a fresh callback per model so no stopping state is shared
        # between independent training runs.
        early_stopping = EarlyStopping(
            monitor='val_loss', patience=3, restore_best_weights=True
        )

        history = model.fit(
            X_train_reshaped, y_train,
            epochs=epochs,
            batch_size=32,
            validation_split=0.2,
            callbacks=[early_stopping],
            class_weight=class_weights,
            verbose=1
        )

        # Evaluate model
        test_loss, test_accuracy = model.evaluate(X_test_reshaped, y_test, verbose=0)
        y_pred = (model.predict(X_test_reshaped) > 0.5).astype(int)
        fpr = calculate_fpr(y_test, y_pred)

        # Report the validation accuracy of the epoch with the lowest
        # val_loss: that is the epoch whose weights restore_best_weights
        # brings back when early stopping triggers, so the number describes
        # the model that is evaluated and saved rather than the last epoch.
        # NOTE(review): whether the best weights are also restored when
        # training runs through all epochs depends on the Keras version.
        best_epoch = int(np.argmin(history.history['val_loss']))

        models.append(model)
        model_names.append(name)
        histories.append(history)
        results.append({
            'name': name,
            'test_accuracy': test_accuracy,
            'test_loss': test_loss,
            'val_accuracy': history.history['val_accuracy'][best_epoch],
            'fpr': fpr
        })

    # Print comparison results
    print("\nModel Comparison Results:")
    for result in results:
        print(f"\n{result['name']}:")
        print(f"Test Accuracy: {result['test_accuracy']:.4f}")
        print(f"Test Loss: {result['test_loss']:.4f}")
        print(f"Validation Accuracy: {result['val_accuracy']:.4f}")
        print(f"False Positive Rate: {result['fpr']:.4f}")

    # Find the best model based on validation accuracy (the test set is
    # deliberately not used for selection).
    best_model_idx = int(np.argmax([r['val_accuracy'] for r in results]))
    best_model = models[best_model_idx]
    best_model_name = model_names[best_model_idx]

    # Deploy the best model
    save_path = f"./models/best_model_{best_model_name.lower().replace(' ', '_')}.h5"
    best_model.save(save_path)
    # np.save pickles the scaler inside an object array, so reading it back
    # requires np.load('scaler.npy', allow_pickle=True).item().
    np.save('scaler.npy', scaler)
    print(f"\nBest model ({best_model_name}) saved to {save_path}")

    # Visualize results
    plot_comparison_metrics(histories, model_names, [r['fpr'] for r in results], X_test_reshaped, y_test, models)

    return models, histories, results

if __name__ == "__main__":
    # Script entry point: train all models on the extracted feature file,
    # capped at 10 epochs each, and keep the results at module level.
    FEATURES_FILE = "./data/extracted_features.csv"
    models, histories, results = train_and_compare_models(
        features_file=FEATURES_FILE,
        epochs=10,
    )

Loading and preparing data...
Label Distribution:
label
0    1002880
1     285380
Name: count, dtype: int64
Percentage of Phishing (1): 22.15%
Percentage of Legitimate (0): 77.85%

Training LSTM Model


  super().__init__(**kwargs)


Epoch 1/10
[1m25766/25766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m648s[0m 25ms/step - accuracy: 0.9849 - loss: 0.0571 - val_accuracy: 0.9983 - val_loss: 0.0045
Epoch 2/10
[1m25766/25766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m638s[0m 25ms/step - accuracy: 0.9980 - loss: 0.0056 - val_accuracy: 0.9981 - val_loss: 0.0090
Epoch 3/10
[1m25766/25766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m637s[0m 25ms/step - accuracy: 0.9983 - loss: 0.0047 - val_accuracy: 0.9986 - val_loss: 0.0033
Epoch 4/10
[1m25766/25766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m639s[0m 25ms/step - accuracy: 0.9988 - loss: 0.0037 - val_accuracy: 0.9989 - val_loss: 0.0029
Epoch 5/10
[1m25766/25766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m641s[0m 25ms/step - accuracy: 0.9991 - loss: 0.0028 - val_accuracy: 0.9993 - val_loss: 0.0017
Epoch 6/10
[1m25766/25766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m645s[0m 25ms/step - accuracy: 0.9994 - loss: 0.0022 - val_accuracy: 0.9995 - val

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m25766/25766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 6ms/step - accuracy: 0.9972 - loss: 0.0114 - val_accuracy: 0.9993 - val_loss: 0.0023
Epoch 2/10
[1m25766/25766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 6ms/step - accuracy: 0.9992 - loss: 0.0028 - val_accuracy: 0.9995 - val_loss: 0.0015
Epoch 3/10
[1m25766/25766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 7ms/step - accuracy: 0.9994 - loss: 0.0021 - val_accuracy: 0.9994 - val_loss: 0.0017
Epoch 4/10
[1m25766/25766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 7ms/step - accuracy: 0.9995 - loss: 0.0022 - val_accuracy: 0.9996 - val_loss: 0.0014
Epoch 5/10
[1m25766/25766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 7ms/step - accuracy: 0.9995 - loss: 0.0021 - val_accuracy: 0.9996 - val_loss: 0.0022
Epoch 6/10
[1m25766/25766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 7ms/step - accuracy: 0.9997 - loss: 0.0019 - val_accuracy: 0.9995 - val_loss:

  super().__init__(**kwargs)


Epoch 1/10
[1m25766/25766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m757s[0m 29ms/step - accuracy: 0.9901 - loss: 0.0388 - val_accuracy: 0.9986 - val_loss: 0.0048
Epoch 2/10
[1m25766/25766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m756s[0m 29ms/step - accuracy: 0.9984 - loss: 0.0051 - val_accuracy: 0.9994 - val_loss: 0.0022
Epoch 3/10
[1m25766/25766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m760s[0m 30ms/step - accuracy: 0.9991 - loss: 0.0030 - val_accuracy: 0.9995 - val_loss: 0.0016
Epoch 4/10
[1m25766/25766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m762s[0m 30ms/step - accuracy: 0.9994 - loss: 0.0024 - val_accuracy: 0.9992 - val_loss: 0.0026
Epoch 5/10
[1m25766/25766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m766s[0m 30ms/step - accuracy: 0.9994 - loss: 0.0020 - val_accuracy: 0.9995 - val_loss: 0.0019
Epoch 6/10
[1m25766/25766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m766s[0m 30ms/step - accuracy: 0.9994 - loss: 0.0021 - val_accuracy: 0.9995 - val




Model Comparison Results:

LSTM Model:
Test Accuracy: 0.9995
Test Loss: 0.0015
Validation Accuracy: 0.9995
False Positive Rate: 0.0005

CNN Model:
Test Accuracy: 0.9996
Test Loss: 0.0014
Validation Accuracy: 0.9997
False Positive Rate: 0.0003

GRU Model:
Test Accuracy: 0.9996
Test Loss: 0.0012
Validation Accuracy: 0.9996
False Positive Rate: 0.0004

Best model (CNN Model) saved to ./models/best_model_cnn_model.h5
[1m8052/8052[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 8ms/step
[1m8052/8052[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step
[1m8052/8052[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 9ms/step
