# Experiment 5: Ensemble Models

This notebook implements ensemble strategies for the phishing URL detection models developed in previous experiments.
It loads trained models from:
- Experiment 1: Linear Models (Logistic Regression, SVM)
- Experiment 2: Tree Models (XGBoost, LightGBM, CatBoost, Random Forest)
- Experiment 3: Neural Networks (MLP, CharCNN, BiLSTM, Hybrid)
- Experiment 4: Transformer Models (DeBERTa)

We will generate Out-of-Fold (OOF) predictions for each model and use them to train ensemble models (Stacking) or perform Averaging.

**Note:** This notebook includes a `LIGHTWEIGHT_MODE` flag. If set to `True`, it will skip the computationally expensive DeBERTa models for local execution.

In [None]:
# Configuration Flags
USE_DRIVE = False
LIGHTWEIGHT_MODE = True # Set to False to include DeBERTa models (requires GPU and time)

# Constants
RANDOM_STATE = 42
N_FOLDS = 5
EXPERIMENTS_DIR = "experiments"

# uncomment lines below if running on colab
from google.colab import drive
import os
drive.mount('/content/drive')
use_drive = True
drive_root = '/content/drive/MyDrive/fraud-grp-proj/'
print(os.path.exists(drive_root)) # check path exists
EXPERIMENTS_DIR = os.path.join(drive_root, EXPERIMENTS_DIR)
USE_DRIVE = True
LIGHTWEIGHT_MODE = True

In [2]:
import os
import sys
import glob
import pickle
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

# Scikit-learn
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Transformers
try:
    from transformers import AutoTokenizer, AutoModel, PreTrainedModel, DebertaV2Config, AutoConfig
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    print("Transformers library not found. DeBERTa models will be skipped.")
    LIGHTWEIGHT_MODE = True

# Setup Drive if needed
if USE_DRIVE:
    from google.colab import drive
    drive.mount('/content/drive')
    drive_root = '/content/drive/MyDrive/fraud-grp-proj/'
    if os.path.exists(drive_root):
        os.chdir(drive_root)
        print(f"Changed directory to {drive_root}")
    else:
        print(f"Drive root {drive_root} not found.")

# Verify experiments directory
if not os.path.exists(EXPERIMENTS_DIR):
    print(f"Warning: Experiments directory '{EXPERIMENTS_DIR}' not found.")
else:
    print(f"Experiments directory found: {os.path.abspath(EXPERIMENTS_DIR)}")

  from .autonotebook import tqdm as notebook_tqdm


Transformers library not found. DeBERTa models will be skipped.
Experiments directory found: /Users/winston/Documents/School/Y3S1/BT4012/Group Project/experiments


In [3]:
# Load Data
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')
train_w_features_df = pd.read_csv('dataset/df_train_feature_engineered.csv')
test_w_features_df = pd.read_csv('dataset/df_test_feature_engineered.csv')

y = train_df['target'].values
X_numeric = train_w_features_df.drop(columns=['url', 'target', 'is_http', 'has_subdomain', 'has_tld']).select_dtypes(include=[np.number])
numeric_cols = X_numeric.columns.tolist()

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

Train shape: (9143, 2)
Test shape: (2286, 2)


## Model Class Definitions
We need to define the model architectures to load the saved weights for PyTorch and Transformer models.

In [4]:
# --- Neural Network Models (from exp_3) ---

class CharCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim=64, num_filters=128, filter_sizes=[3, 4, 5], dropout=0.3, max_len=200):
        super(CharCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim,
                      out_channels=num_filters,
                      kernel_size=fs)
            for fs in filter_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(filter_sizes) * num_filters, 1)

    def forward(self, x_numeric, x_text, x_img=None):
        embedded = self.embedding(x_text)
        embedded = embedded.permute(0, 2, 1)
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = torch.cat(pooled, dim=1)
        dropped = self.dropout(cat)
        return torch.sigmoid(self.fc(dropped))

class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=64, hidden_dim=128, num_layers=2, dropout=0.3):
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=num_layers,
                            bidirectional=True,
                            batch_first=True,
                            dropout=dropout if num_layers > 1 else 0)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, x_numeric, x_text, x_img=None):
        embedded = self.embedding(x_text)
        output, (hidden, cell) = self.lstm(embedded)
        hidden_cat = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        dropped = self.dropout(hidden_cat)
        return torch.sigmoid(self.fc(dropped))

class HybridModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=64, num_filters=128, filter_sizes=[3, 4, 5], dropout=0.3, max_len=200, input_dim=78, hidden_dims=[64, 32]):
        super(HybridModel, self).__init__()
        # Text CNN part
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim,
                      out_channels=num_filters,
                      kernel_size=fs)
            for fs in filter_sizes
        ])
        self.dropout_cnn = nn.Dropout(dropout)
        self.fc_cnn = nn.Linear(len(filter_sizes) * num_filters, 64)

        # MLP part
        mlp_layers = []
        prev_dim = input_dim
        for dim in hidden_dims:
            mlp_layers.append(nn.Linear(prev_dim, dim))
            mlp_layers.append(nn.ReLU())
            mlp_layers.append(nn.Dropout(dropout))
            prev_dim = dim
        self.mlp_network = nn.Sequential(*mlp_layers)

        # Combined layers
        self.fc_combined = nn.Linear(64 + 32, 1)

    def forward(self, x_numeric, x_text, x_img=None):
        # Text CNN
        embedded = self.embedding(x_text)
        embedded = embedded.permute(0, 2, 1)
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat_cnn = torch.cat(pooled, dim=1)
        dropped_cnn = self.dropout_cnn(cat_cnn)
        features_cnn = self.fc_cnn(dropped_cnn)

        # MLP
        features_mlp = self.mlp_network(x_numeric)

        # Combined
        combined = torch.cat([features_cnn, features_mlp], dim=1)
        return torch.sigmoid(self.fc_combined(combined))

# --- Transformer Models (from exp_4) ---

if TRANSFORMERS_AVAILABLE:
    class DebertaWithFeaturesConfig(DebertaV2Config):
        model_type = "deberta_with_features"
        def __init__(self, num_features=0, **kwargs):
            super().__init__(**kwargs)
            self.num_features = num_features

    class DebertaWithFeatures(PreTrainedModel):
        config_class = DebertaWithFeaturesConfig
        def __init__(self, config):
            super().__init__(config)
            self.deberta = AutoModel.from_pretrained("microsoft/deberta-v3-base") # Base model
            hidden = self.deberta.config.hidden_size
            self.feature_proj = nn.Sequential(
                nn.Linear(config.num_features, 128),
                nn.ReLU(),
                nn.Dropout(0.1)
            )
            self.classifier = nn.Linear(hidden + 128, 2)

        def forward(self, input_ids, attention_mask, features, labels=None):
            out = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
            cls = out.last_hidden_state[:, 0]
            feat_out = self.feature_proj(features)
            combined = torch.cat([cls, feat_out], dim=1)
            logits = self.classifier(combined)
            return logits

## Data Preparation Helpers
We need to recreate the datasets/dataloaders used in training to generate predictions.

In [5]:
# --- NN Data Preparation ---
class CharTokenizer:
    def __init__(self, max_len=200):
        self.char_map = {}
        self.max_len = max_len
        self.vocab_size = 0

    def fit(self, texts):
        unique_chars = set("".join(texts))
        # 0 is reserved for padding
        self.char_map = {char: i + 1 for i, char in enumerate(sorted(unique_chars))}
        self.vocab_size = len(self.char_map) + 1

    def transform(self, texts):
        sequences = []
        for text in texts:
            seq = [self.char_map.get(c, 0) for c in text[:self.max_len]]
            # Pad
            if len(seq) < self.max_len:
                seq += [0] * (self.max_len - len(seq))
            sequences.append(seq)
        return np.array(sequences)

class PhishingDataset(Dataset):
    def __init__(self, X_numeric, X_text, y=None):
        self.X_numeric = torch.FloatTensor(X_numeric)
        self.X_text = torch.LongTensor(X_text)
        self.y = torch.FloatTensor(y) if y is not None else None

    def __len__(self):
        return len(self.X_numeric)

    def __getitem__(self, idx):
        if self.y is not None:
            return self.X_numeric[idx], self.X_text[idx], self.y[idx]
        return self.X_numeric[idx], self.X_text[idx]

# --- Transformer Data Preparation ---
class URLWithFeaturesDataset(Dataset):
    def __init__(self, df, tokenizer, feature_cols, max_len=256):
        self.df = df
        self.tokenizer = tokenizer
        self.feature_cols = feature_cols
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        enc = self.tokenizer(
            row["url"],
            truncation=True,
            padding="max_length",
            max_length=self.max_len
        )
        features = row[self.feature_cols].values.astype("float32")
        
        item = {
            "input_ids": torch.tensor(enc["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(enc["attention_mask"], dtype=torch.long),
            "features": torch.tensor(features, dtype=torch.float32)
        }
        if "target" in row:
            item["labels"] = torch.tensor(row["target"], dtype=torch.long)
        return item

In [6]:
# Prepare Data for NN Models
# We need to fit the tokenizer on the training data exactly as in exp_3
char_tokenizer = CharTokenizer(max_len=200)
char_tokenizer.fit(train_df['url'].values)

X_text_train = char_tokenizer.transform(train_df['url'].values)
X_numeric_train = train_w_features_df[numeric_cols].values

# Scale numeric features (using scaler fitted on full train set for simplicity, 
# though strictly should be fold-wise. For OOF generation we should ideally use fold-wise scalers if saved.
# However, exp_3 used a global scaler for the whole dataset in some parts or fold-wise.
# Let's check if we can load pipelines. If not, we approximate with global scaling for OOF generation to save complexity,
# or better, re-fit scaler on training folds.)

# For NN models, we'll re-fit scaler fold-wise to be correct.
pass

## OOF Prediction Generator
This function iterates through the folds, loads the saved model for that fold, and generates predictions for the validation set.

In [None]:
def generate_oof_predictions(experiment_name, model_class=None, model_type='sklearn', feature_type='combined'):
    """
    Generates OOF predictions for a given experiment.
    Checks if OOF file exists first.
    """
    exp_dir = os.path.join(EXPERIMENTS_DIR, experiment_name)
    oof_path = os.path.join(exp_dir, f"{experiment_name}_oof_predictions.csv")
    
    if os.path.exists(oof_path):
        print(f"Loading existing OOF predictions for {experiment_name}...")
        return pd.read_csv(oof_path)
    
    print(f"Generating OOF predictions for {experiment_name}...")
    
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    oof_preds = np.zeros(len(train_df))
    
    # Prepare data based on feature type
    if feature_type == 'combined':
        X = train_w_features_df.copy()
        # Add URL for text processing if needed by pipeline
        if 'url' not in X.columns:
            X['url'] = train_df['url']
    elif feature_type == 'numeric':
        X = train_w_features_df[numeric_cols].copy()
    elif feature_type == 'text':
        X = train_df['url'].copy()
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, y), 1):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        if model_type == 'sklearn':
            model_path = os.path.join(exp_dir, f"pipeline_fold_{fold}.pkl")
            if not os.path.exists(model_path):
                print(f"  Model not found: {model_path}")
                continue
            
            with open(model_path, 'rb') as f:
                model = pickle.load(f)
            
            # Predict
            if hasattr(model, "predict_proba"):
                preds = model.predict_proba(X_val)[:, 1]
            else:
                preds = model.predict(X_val)
            oof_preds[val_idx] = preds
            
        elif model_type == 'pytorch':
            model_path = os.path.join(exp_dir, f"pipeline_fold_{fold}.pkl")
            if not os.path.exists(model_path):
                print(f"  Model not found: {model_path}")
                continue
                
            # Prepare fold data
            # Note: We need to scale numeric features fold-wise to match training
            scaler = StandardScaler()
            X_num_train = scaler.fit_transform(X_numeric_train[train_idx])
            X_num_val = scaler.transform(X_numeric_train[val_idx])
            
            X_txt_val = X_text_train[val_idx]
            
            val_dataset = PhishingDataset(X_num_val, X_txt_val)
            val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
            
            # Init model
            # We need to know params. Assuming standard params from exp_3 for now.
            # Ideally we load params from config.
            if model_class == CharCNN:
                model = CharCNN(vocab_size=char_tokenizer.vocab_size)
            elif model_class == BiLSTM:
                model = BiLSTM(vocab_size=char_tokenizer.vocab_size)
            elif model_class == HybridModel:
                model = HybridModel(vocab_size=char_tokenizer.vocab_size)
            
            model.load_state_dict(torch.load(model_path, weights_only=False))
            
            model.eval()
            
            fold_preds = []
            with torch.no_grad():
                for x_num, x_txt in val_loader:
                    out = model(x_num, x_txt)
                    fold_preds.extend(out.squeeze().numpy())
            oof_preds[val_idx] = fold_preds
            
        elif model_type == 'transformer':
            if LIGHTWEIGHT_MODE:
                print("  Skipping Transformer OOF generation in Lightweight Mode.")
                return None
                
            fold_dir = os.path.join(exp_dir, f"transformer_fold_{fold}")
            if not os.path.exists(fold_dir):
                print(f"  Model dir not found: {fold_dir}")
                continue
            
            # Load model
            # Assuming DebertaWithFeatures
            try:
                model = DebertaWithFeatures.from_pretrained(fold_dir)
            except Exception as e:
                print(f"  Error loading transformer: {e}")
                continue
                
            model.eval()
            if torch.cuda.is_available():
                model.cuda()
            
            # Prepare Data
            tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
            val_ds = URLWithFeaturesDataset(train_df.iloc[val_idx], tokenizer, numeric_cols)
            val_loader = DataLoader(val_ds, batch_size=16, shuffle=False, collate_fn=lambda x: x) # Custom collate needed?
            # Actually we defined collate_fn in exp_4, let's redefine simple one or use default if dataset returns dicts of tensors
            # The dataset returns dicts, so default collate works if keys match.
            # Wait, default collate stacks. Yes.
            
            fold_preds = []
            with torch.no_grad():
                for batch in tqdm(DataLoader(val_ds, batch_size=16, shuffle=False), desc=f"Fold {fold}"):
                    input_ids = batch['input_ids']
                    attention_mask = batch['attention_mask']
                    features = batch['features']
                    
                    if torch.cuda.is_available():
                        input_ids = input_ids.cuda()
                        attention_mask = attention_mask.cuda()
                        features = features.cuda()
                        
                    logits = model(input_ids, attention_mask, features)
                    probs = torch.softmax(logits, dim=1)[:, 1]
                    fold_preds.extend(probs.cpu().numpy())
            oof_preds[val_idx] = fold_preds

    # Save OOF predictions
    df_oof = pd.DataFrame({'oof_pred': oof_preds})
    df_oof.to_csv(oof_path, index=False)
    print(f"Saved OOF predictions to {oof_path}")
    return df_oof

## Generate OOF Predictions
We will now generate OOF predictions for selected models from each category.

In [10]:
oof_data = pd.DataFrame()
oof_data['target'] = y

# 1. Linear Models
print("--- Linear Models ---")
lr_oof = generate_oof_predictions('exp_1_combined_lr', model_type='sklearn', feature_type='combined')
if lr_oof is not None: oof_data['lr_combined'] = lr_oof['oof_pred']

svm_oof = generate_oof_predictions('exp_1_combined_svm', model_type='sklearn', feature_type='combined')
if svm_oof is not None: oof_data['svm_combined'] = svm_oof['oof_pred']

# 2. Tree Models
print("\n--- Tree Models ---")
# Assuming these exist and follow the pipeline structure
rf_oof = generate_oof_predictions('exp_2_random_forest_all', model_type='sklearn', feature_type='combined')
if rf_oof is not None: oof_data['rf_all'] = rf_oof['oof_pred']

xgb_oof = generate_oof_predictions('exp_2_xgboost_all', model_type='sklearn', feature_type='combined')
if xgb_oof is not None: oof_data['xgb_all'] = xgb_oof['oof_pred']

# 3. Neural Networks
print("\n--- Neural Networks ---")
cnn_oof = generate_oof_predictions('exp_3_charcnn', model_class=CharCNN, model_type='pytorch', feature_type='combined')
if cnn_oof is not None: oof_data['charcnn'] = cnn_oof['oof_pred']

# lstm_oof = generate_oof_predictions('exp_3_bilstm', model_class=BiLSTM, model_type='pytorch', feature_type='combined')
# if lstm_oof is not None: oof_data['bilstm'] = lstm_oof['oof_pred']

# hybrid_oof = generate_oof_predictions('exp_3_hybrid', model_class=HybridModel, model_type='pytorch', feature_type='combined')
# if hybrid_oof is not None: oof_data['hybrid'] = hybrid_oof['oof_pred']

# 4. Transformers
print("\n--- Transformers ---")
if not LIGHTWEIGHT_MODE:
    deberta_oof = generate_oof_predictions('exp_4_deberta_url_features', model_type='transformer', feature_type='combined')
    if deberta_oof is not None: oof_data['deberta'] = deberta_oof['oof_pred']
else:
    print("Skipping DeBERTa (Lightweight Mode)")

--- Linear Models ---
Loading existing OOF predictions for exp_1_combined_lr...
Loading existing OOF predictions for exp_1_combined_svm...

--- Tree Models ---
Loading existing OOF predictions for exp_2_random_forest_all...
Loading existing OOF predictions for exp_2_xgboost_all...

--- Neural Networks ---
Generating OOF predictions for exp_3_charcnn...
  Error loading pytorch model for fold 1: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.
  Error loading pytorch model for fold 2: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.
  Error loading pytorch model for fold 3: Attempting to deserialize object on a CUDA device but torch.cuda.is_a

## Ensemble Strategies
Now that we have OOF predictions, we can combine them.

In [None]:
# Filter out columns that might be missing (if some models failed to load)
pred_cols = [c for c in oof_data.columns if c != 'target']
print(f"Ensembling with models: {pred_cols}")

# Strategy 1: Simple Averaging
oof_data['ensemble_avg'] = oof_data[pred_cols].mean(axis=1)
avg_auc = roc_auc_score(oof_data['target'], oof_data['ensemble_avg'])
print(f"Simple Averaging OOF AUC: {avg_auc:.5f}")

# Strategy 2: Weighted Averaging (Heuristic)
# Give more weight to stronger models (e.g., DeBERTa, XGBoost, SVM)
weights = {}
for col in pred_cols:
    if 'deberta' in col: weights[col] = 3
    elif 'xgb' in col or 'lgbm' in col or 'catboost' in col: weights[col] = 2
    elif 'svm' in col: weights[col] = 1.5
    else: weights[col] = 1

total_weight = sum(weights[c] for c in pred_cols)
oof_data['ensemble_weighted'] = sum(oof_data[c] * weights[c] for c in pred_cols) / total_weight
weighted_auc = roc_auc_score(oof_data['target'], oof_data['ensemble_weighted'])
print(f"Weighted Averaging OOF AUC: {weighted_auc:.5f}")

# Strategy 3: Stacking (Logistic Regression Meta-Learner)
meta_model = LogisticRegression(random_state=RANDOM_STATE)
# We need to cross-validate the meta-model to get a fair OOF score for the stack
# But typically we train the meta-model on the full OOF set and predict on Test.
# To estimate OOF performance of the stack, we can do nested CV or just hold-out.
# Here we'll just fit on the full OOF predictions to see coefficients.
meta_model.fit(oof_data[pred_cols], oof_data['target'])
stack_preds = meta_model.predict_proba(oof_data[pred_cols])[:, 1]
stack_auc = roc_auc_score(oof_data['target'], stack_preds)
print(f"Stacking (LR) Training AUC: {stack_auc:.5f} (Likely optimistic)")

# Check coefficients
coefs = pd.Series(meta_model.coef_[0], index=pred_cols).sort_values(ascending=False)
print("\nStacking Coefficients:")
print(coefs)

## Evaluation on Test Set
To get the final performance, we should ideally load the test set predictions for each model and apply the ensemble weights.
Since we generated OOF predictions, we should assume we can also generate (or have) test predictions.
For this notebook demonstration, we will save the ensemble models (the weights or the meta-model) and the OOF results.

In [None]:
# Save Ensemble Results
ensemble_dir = os.path.join(EXPERIMENTS_DIR, "exp_5_ensemble")
os.makedirs(ensemble_dir, exist_ok=True)

# Save OOF predictions
oof_data.to_csv(os.path.join(ensemble_dir, "ensemble_oof_predictions.csv"), index=False)

# Save Meta Model
with open(os.path.join(ensemble_dir, "stacking_meta_model.pkl"), 'wb') as f:
    pickle.dump(meta_model, f)

# Save Metrics
metrics = {
    "simple_average_auc": avg_auc,
    "weighted_average_auc": weighted_auc,
    "stacking_train_auc": stack_auc,
    "models_used": pred_cols,
    "weights": weights
}
with open(os.path.join(ensemble_dir, "metrics.json"), 'w') as f:
    json.dump(metrics, f, indent=4)

print(f"Ensemble experiment saved to {ensemble_dir}")