# Imports #

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import copy
from imblearn.over_sampling import SMOTE
from collections import Counter
from torchvision.ops import sigmoid_focal_loss
from Numeric-Features-model.MLPClassifier import MLPClassifier


SyntaxError: invalid syntax (781908333.py, line 13)

# Functions #

In [3]:
# === Convert to Torch Tensors ===
def to_tensor(x, y):
    return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)


def find_best_threshold(model, val_loader, device='cuda',beta=0.6):
    model.eval()
    all_probs = []
    all_labels = []

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            logits = model(X_batch)
            probs = torch.sigmoid(logits)
            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())

    all_probs = np.array(all_probs)
    all_labels = np.array(all_labels)

    thresholds = np.linspace(0.35, 0.99, 99)
    best_score = -np.inf
    best_threshold = 0.5

    for t in thresholds:
        preds = (all_probs > t).astype(int)
        tp = np.sum((preds == 1) & (all_labels == 1))
        fp = np.sum((preds == 1) & (all_labels == 0))
        fn = np.sum((preds == 0) & (all_labels == 1))

        precision = tp / (tp + fp + 1e-9)
        recall = tp / (tp + fn + 1e-9)
        score = beta * precision + (1 - beta) * recall  # Custom blend

        if score > best_score:
            best_score = score
            best_threshold = t

    print(f"✅ Best Threshold: {best_threshold:.3f} with Precision-Recall Weighted Score: {best_score:.4f}")
    return best_threshold




# === Training Function ===
def train_model(model, train_loader, val_loader,
                device='cuda',
                max_epochs=500,
                patience=10,
                learning_rate=0.001,
                weight_decay=1e-5,
                focal_alpha=0.5,
                focal_gamma=2.0):
    # Loss and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=weight_decay
    )

    model.to(device)
    best_model = None
    best_val_loss = float('inf')
    no_improve_epochs = 0

    for epoch in range(max_epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            logits = model(X_batch)
            loss = sigmoid_focal_loss(
                inputs=logits,
                targets=y_batch,
                alpha=focal_alpha,
                gamma=focal_gamma,
                reduction='mean'
            )
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        with torch.no_grad():
            for X_val, y_val in val_loader:
                X_val, y_val = X_val.to(device), y_val.to(device)
                val_logits = model(X_val)
                val_loss = sigmoid_focal_loss(
                    inputs=val_logits,
                    targets=y_val,
                    alpha=focal_alpha,
                    gamma=focal_gamma,
                    reduction='mean'
                )

        print(f"Epoch {epoch+1}/{max_epochs} - Val Loss: {val_loss.item():.4f}")

        # Early stopping
        if val_loss.item() < best_val_loss - 1e-4:
            best_val_loss = val_loss.item()
            best_model = copy.deepcopy(model.state_dict())
            no_improve_epochs = 0
        else:
            no_improve_epochs += 1
            if no_improve_epochs >= patience:
                print(f"Early stopping triggered at epoch {epoch+1}")
                break

    # Load best model
    model.load_state_dict(best_model)
    return model


def evaluate_model(model, data_loader, device='cuda', threshold=0.5, plot_roc=True, label="Test"):
    model.eval()
    all_preds = []
    all_labels = []
    all_logits = []

    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            logits = model(X_batch)
            probs = torch.sigmoid(logits)

            all_logits.append(logits.cpu())
            all_preds.append(probs.cpu())
            all_labels.append(y_batch.cpu())

    # Concatenate batches
    y_true = torch.cat(all_labels).numpy()
    y_probs = torch.cat(all_preds).numpy()
    y_logits = torch.cat(all_logits).numpy()
    y_pred = (y_probs > threshold).astype(int)

    # === Metrics ===
    from sklearn.metrics import (
        confusion_matrix, classification_report,
        roc_auc_score, roc_curve, log_loss
    )

    print(f"\n📊 Evaluation on: {label}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, digits=4))

    roc_auc = roc_auc_score(y_true, y_probs)
    print(f"ROC AUC Score: {roc_auc:.4f}")

    # === Loss & Generalization Error ===
    bce_loss = nn.BCEWithLogitsLoss()
    logits_tensor = torch.tensor(y_logits, dtype=torch.float32)
    targets_tensor = torch.tensor(y_true, dtype=torch.float32)
    loss = bce_loss(logits_tensor, targets_tensor).item()
    print(f"{label} BCE Loss: {loss:.6f}")

    return loss

# Data games #

In [4]:
# Separate features and labels
df = pd.read_csv("../data/labeled_intersection.csv")

X = df.drop(columns=['userid', 'label']).values
y = df['label'].values

# === Scale Features ===
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# === Split the Data ===
SEED = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X_scaled, y, test_size=0.3, stratify=y, random_state=SEED # stratify ensures same proportion class balance in splits
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=SEED
)

original_counts = Counter(y_train)  # This is before SMOTE
num_neg = original_counts[0]
num_pos = original_counts[1]
pos_weight = torch.tensor([num_neg / num_pos], dtype=torch.float32)
print(f"Using pos_weight: {pos_weight.item():.4f}")

# === Apply SMOTE *only* on training set ===
smote = SMOTE(random_state=SEED)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


Using pos_weight: 3.1111


## Tensors ##

In [5]:
# === Convert df's to Tensors ===
X_train_tensor, y_train_tensor = to_tensor(X_train_resampled, y_train_resampled)
X_val_tensor, y_val_tensor = to_tensor(X_val, y_val)
X_test_tensor, y_test_tensor = to_tensor(X_test, y_test)

# === TensorDatasets ===
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# === DataLoaders ===
USE_FULL_BATCH = False  # Change to True if you want 1-batch training

BATCH_SIZE = (
    len(train_dataset) if USE_FULL_BATCH else 128
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=len(val_dataset), shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)

# Model training #

In [6]:
model = MLPClassifier(input_dim=X_train_tensor.shape[1])
trained_model = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    focal_alpha=0.5
)
best_threshold = find_best_threshold(trained_model, val_loader, beta=0.5)

train_loss = evaluate_model(trained_model, train_loader,threshold=best_threshold, device='cuda', label="Train")
test_loss = evaluate_model(trained_model, test_loader, threshold=best_threshold, device='cuda', label="Test")

print(f"\n🧠 Generalization Gap (|Train - Test|): {abs(train_loss - test_loss):.6f}")


Epoch 1/500 - Val Loss: 0.0595
Epoch 2/500 - Val Loss: 0.0597
Epoch 3/500 - Val Loss: 0.0572
Epoch 4/500 - Val Loss: 0.0587
Epoch 5/500 - Val Loss: 0.0563
Epoch 6/500 - Val Loss: 0.0553
Epoch 7/500 - Val Loss: 0.0584
Epoch 8/500 - Val Loss: 0.0528
Epoch 9/500 - Val Loss: 0.0552
Epoch 10/500 - Val Loss: 0.0556
Epoch 11/500 - Val Loss: 0.0562
Epoch 12/500 - Val Loss: 0.0540
Epoch 13/500 - Val Loss: 0.0551
Epoch 14/500 - Val Loss: 0.0542
Epoch 15/500 - Val Loss: 0.0557
Epoch 16/500 - Val Loss: 0.0552
Epoch 17/500 - Val Loss: 0.0547
Epoch 18/500 - Val Loss: 0.0572
Early stopping triggered at epoch 18
✅ Best Threshold: 0.487 with Precision-Recall Weighted Score: 0.6972

📊 Evaluation on: Train
Confusion Matrix:
[[60055  8440]
 [18047 50448]]

Classification Report:
              precision    recall  f1-score   support

         0.0     0.7689    0.8768    0.8193     68495
         1.0     0.8567    0.7365    0.7921     68495

    accuracy                         0.8067    136990
   macro avg

# Save model #

In [8]:
torch.save(model.state_dict(), "trained-model/mlp_model.pt")
