# Imports #

In [81]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    confusion_matrix, classification_report,
    roc_auc_score, roc_curve
)
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, roc_auc_score, precision_recall_curve

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import copy
from imblearn.over_sampling import SMOTE
from collections import Counter
from torchvision.ops import sigmoid_focal_loss


# Classes #

In [82]:
# === Dataset and Model Definitions ===
class MaskedDataset(torch.utils.data.Dataset):
    """Dataset that serves ``(features, mask, label)`` triples by position.

    ``X``, ``mask`` and ``y`` are stored exactly as passed in and are all
    indexed with the same ``idx``. The dataset length is taken from ``X``
    alone; ``mask`` and ``y`` are not length-checked.
    """

    def __init__(self, X, mask, y):
        self.X, self.mask, self.y = X, mask, y

    def __len__(self):
        # Only X defines how many samples exist.
        return len(self.X)

    def __getitem__(self, idx):
        # Same index applied to each parallel container, in X/mask/y order.
        return tuple(field[idx] for field in (self.X, self.mask, self.y))

class MaskedMLP(nn.Module):
    """Feed-forward net (input_dim -> 128 -> 64 -> 32 -> 1) with optional masking.

    The final layer emits one raw logit per sample; no sigmoid is applied
    inside the network.
    """

    def __init__(self, input_dim):
        super().__init__()
        widths = (input_dim, 128, 64, 32)
        layers = []
        # Hidden stack: each Linear is followed by a ReLU.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Output head: a single logit, left un-activated.
        layers.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x, mask=None):
        """Return logits for ``x``; when ``mask`` is given, ``x * mask`` is fed instead."""
        masked = x if mask is None else x * mask
        return self.net(masked)

# Functions #

In [83]:
# === Convert to Torch Tensors ===
def to_tensor(x, y=None, dtype=torch.float32):
    """Build torch tensors from array-like inputs.

    Args:
        x: Data converted with ``torch.tensor(x, dtype=dtype)``.
        y: Optional second input, converted with the same ``dtype``.
        dtype: Target dtype for every returned tensor.

    Returns:
        The tensor for ``x`` alone when ``y`` is ``None``; otherwise the
        tuple ``(x_tensor, y_tensor)``.
    """
    features = torch.tensor(x, dtype=dtype)
    if y is None:
        return features
    return features, torch.tensor(y, dtype=dtype)



def find_best_threshold(model, val_loader, device='cuda', beta=0.65):
    """Pick the decision threshold that maximises a precision/recall blend.

    The model is run over ``val_loader`` without gradients and its logits are
    turned into probabilities with a sigmoid. 99 evenly spaced thresholds in
    [0.35, 0.99] are then scored as
    ``beta * precision + (1 - beta) * recall``; the first threshold that
    reaches the highest score is printed and returned.

    Args:
        model: Module called as ``model(X, mask)``; its output is squeezed
            on dim 1.
        val_loader: Iterable of ``(X, mask, y)`` batches.
        device: Device every batch is moved to before the forward pass.
        beta: Weight on precision; recall gets ``1 - beta``.

    Returns:
        The selected threshold (0.5 only if no candidate were ever scored).
    """
    model.eval()
    prob_chunks, label_chunks = [], []

    with torch.no_grad():
        for features, feature_mask, targets in val_loader:
            features = features.to(device)
            feature_mask = feature_mask.to(device)
            targets = targets.to(device)

            batch_probs = torch.sigmoid(model(features, feature_mask).squeeze(1))
            prob_chunks.append(batch_probs.cpu().numpy())
            label_chunks.append(targets.cpu().numpy())

    probs = np.concatenate(prob_chunks) if prob_chunks else np.array([])
    labels = np.concatenate(label_chunks) if label_chunks else np.array([])

    # Label masks do not depend on the threshold, so build them once.
    is_positive = labels == 1
    is_negative = labels == 0

    best_threshold, best_score = 0.5, -np.inf
    for cutoff in np.linspace(0.35, 0.99, 99):
        flagged = probs > cutoff
        tp = np.sum(flagged & is_positive)
        fp = np.sum(flagged & is_negative)
        fn = np.sum(~flagged & is_positive)

        # The 1e-9 term keeps the ratios defined when a denominator is zero.
        precision = tp / (tp + fp + 1e-9)
        recall = tp / (tp + fn + 1e-9)
        score = beta * precision + (1 - beta) * recall  # Custom blend

        # Strict '>' keeps the earliest threshold on ties.
        if score > best_score:
            best_threshold, best_score = cutoff, score

    print(f"✅ Best Threshold: {best_threshold:.3f} with Precision-Recall Weighted Score: {best_score:.4f}")
    return best_threshold



# === Training Function ===
def train_model(model, train_loader, val_loader, criterion, optimizer, device='cuda',
                num_epochs=500, patience=10):
    """Train ``model`` with early stopping on the validation loss.

    Each epoch runs one optimisation pass over ``train_loader`` and then
    computes the sample-weighted mean of ``criterion`` over ``val_loader``.
    The weights from the epoch with the lowest validation loss are
    snapshotted and restored before returning.

    Args:
        model: Module called as ``model(X, mask)``; its output is squeezed
            on dim 1 before being passed to ``criterion``.
        train_loader: Loader yielding ``(X, mask, y)`` training batches.
        val_loader: Loader yielding ``(X, mask, y)`` batches; must expose
            ``.dataset`` so the loss can be averaged per sample.
        criterion: Callable ``criterion(outputs, targets)`` returning a loss.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Device the model and every batch are moved to.
        num_epochs: Maximum number of epochs.
        patience: Consecutive non-improving epochs tolerated before stopping.

    Returns:
        ``model`` carrying the best-validation weights. If no epoch ever
        improved (e.g. ``num_epochs == 0`` or a NaN validation loss), the
        model is returned with its current weights.
    """
    model = model.to(device)
    best_val_loss = float('inf')
    best_model_state = None
    trigger_times = 0

    for epoch in range(num_epochs):
        model.train()
        for X_batch, mask_batch, y_batch in train_loader:
            X_batch = X_batch.to(device)
            mask_batch = mask_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(X_batch, mask_batch).squeeze(1)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_val, mask_val, y_val in val_loader:
                X_val = X_val.to(device)
                mask_val = mask_val.to(device)
                y_val = y_val.to(device)

                outputs = model(X_val, mask_val).squeeze(1)
                loss = criterion(outputs, y_val)
                # Weight by batch size so a short last batch is not over-counted.
                val_loss += loss.item() * X_val.size(0)

        epoch_val_loss = val_loss / len(val_loader.dataset)

        print(f"Epoch {epoch + 1}/{num_epochs} - Val Loss: {epoch_val_loss:.4f}")

        # Early stopping
        if epoch_val_loss < best_val_loss:
            best_val_loss = epoch_val_loss
            # state_dict() returns references to the live parameter tensors,
            # so it must be deep-copied; otherwise later optimizer steps
            # silently overwrite the "best" snapshot.
            best_model_state = copy.deepcopy(model.state_dict())
            trigger_times = 0
        else:
            trigger_times += 1
            if trigger_times >= patience:
                print(f"Early stopping triggered at epoch {epoch + 1}")
                break

    # Nothing to restore when no epoch produced an improving validation loss.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model


def evaluate_model(model, data_loader, device='cuda', threshold=0.5, plot_roc=True, label="Test"):
    """Print classification metrics for ``model`` on ``data_loader``; return BCE loss.

    Runs the model without gradients, applies a sigmoid to the logits and
    binarises the probabilities with ``threshold``. Prints the confusion
    matrix, the classification report, the ROC AUC and the mean
    BCE-with-logits loss over all samples.

    Args:
        model: Module called as ``model(X, mask)``; its output is squeezed
            on dim 1.
        data_loader: Iterable of ``(X, mask, y)`` batches.
        device: Device every batch is moved to before the forward pass.
        threshold: Probabilities strictly above this value count as class 1.
        plot_roc: Accepted for interface compatibility; not used in the body.
        label: Name of the split, used only in the printed output.

    Returns:
        The BCE-with-logits loss over the whole loader, as a Python float.
    """
    model.eval()
    logit_chunks, prob_chunks, target_chunks = [], [], []

    with torch.no_grad():
        for features, feature_mask, targets in data_loader:
            features = features.to(device)
            feature_mask = feature_mask.to(device)
            targets = targets.to(device)

            batch_logits = model(features, feature_mask).squeeze(1)

            logit_chunks.append(batch_logits.cpu())
            prob_chunks.append(torch.sigmoid(batch_logits).cpu())
            target_chunks.append(targets.cpu())

    # Stitch the per-batch results into flat NumPy arrays.
    y_true = torch.cat(target_chunks).numpy()
    y_probs = torch.cat(prob_chunks).numpy()
    y_logits = torch.cat(logit_chunks).numpy()
    y_pred = (y_probs > threshold).astype(int)

    # === Metrics ===
    print(f"\n📊 Evaluation on: {label}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, digits=4))

    roc_auc = roc_auc_score(y_true, y_probs)
    print(f"ROC AUC Score: {roc_auc:.4f}")

    # === BCE Loss & Generalization Reference ===
    # Computed on the raw logits, not on the thresholded predictions.
    loss = nn.functional.binary_cross_entropy_with_logits(
        torch.tensor(y_logits, dtype=torch.float32),
        torch.tensor(y_true, dtype=torch.float32),
    ).item()
    print(f"{label} BCE Loss: {loss:.6f}")

    return loss

# Data preparation #

In [84]:
# === 1. Load Datasets ===
df_full = pd.read_csv("../data/full_features.csv")
df_partial = pd.read_csv("../data/partial_features_bots.csv")

# === 2. Identify Feature Columns ===
feature_cols = [col for col in df_full.columns if col not in ['userid', 'label']]
# Features hidden (mask = 0) for the partial dataset: everything except
# 'followers' and 'avg_retweetcount'.
mask_cols = [col for col in feature_cols if col not in ['followers', 'avg_retweetcount']]

# === 3. Create Masks ===
# Full dataset: all features present
mask_full = np.ones_like(df_full[feature_cols].values, dtype=np.float32)

# Partial dataset: mask everything except 'followers' and 'avg_retweetcount'.
# The mask array's columns follow feature_cols, so positions are taken from
# feature_cols itself rather than from df_partial's column layout (which
# would silently break if 'userid' were not first or 'label' not last).
mask_partial = np.ones_like(df_partial[feature_cols].values, dtype=np.float32)
mask_col_indices = [idx for idx, col in enumerate(feature_cols) if col in mask_cols]
mask_partial[:, mask_col_indices] = 0.0

# Labels
y_full = df_full['label'].values
y_partial = df_partial['label'].values

# Unscaled feature matrices; scaling happens after the split (step 5).
X_full_raw = df_full[feature_cols].values
X_partial_raw = df_partial[feature_cols].values

# === 4. Split (on unscaled features) ===
# Partial data: 50% train / 50% val
X_partial_train, X_partial_val, y_partial_train, y_partial_val, mask_partial_train, mask_partial_val = train_test_split(
    X_partial_raw, y_partial, mask_partial, test_size=0.5, stratify=y_partial, random_state=42
)
# Full data: 25% test, 75% train+val
X_full_trainval, X_full_test, y_full_trainval, y_full_test, mask_full_trainval, mask_full_test = train_test_split(
    X_full_raw, y_full, mask_full, test_size=0.25, stratify=y_full, random_state=42
)
# Train/Val from the 75% remaining full-feature set
X_full_train, X_full_val, y_full_train, y_full_val, mask_full_train, mask_full_val = train_test_split(
    X_full_trainval, y_full_trainval, mask_full_trainval, test_size=0.5, stratify=y_full_trainval, random_state=42
)

# === 5. Scale Features ===
# Fit the scaler on the full-feature TRAIN split only, so that validation
# and test statistics do not leak into preprocessing; every other split is
# transformed with those training statistics.
scaler = StandardScaler()
scaler.fit(X_full_train)
X_full_train = scaler.transform(X_full_train)
X_full_val = scaler.transform(X_full_val)
X_full_trainval = scaler.transform(X_full_trainval)
X_full_test = scaler.transform(X_full_test)
X_partial_train = scaler.transform(X_partial_train)
X_partial_val = scaler.transform(X_partial_val)
# Scaled views of the complete datasets, kept under their original names.
X_full = scaler.transform(X_full_raw)
X_partial = scaler.transform(X_partial_raw)

# === 6. Combine Full with Partial Splits ===
X_train = np.vstack([X_full_train, X_partial_train])
y_train = np.concatenate([y_full_train, y_partial_train])
mask_train = np.vstack([mask_full_train, mask_partial_train])

X_val = np.vstack([X_full_val, X_partial_val])
y_val = np.concatenate([y_full_val, y_partial_val])
mask_val = np.vstack([mask_full_val, mask_partial_val])

# The test set uses full-feature rows only.
X_test = X_full_test
y_test = y_full_test
mask_test = mask_full_test

# # === Apply SMOTE *only* on training set ===
# smote = SMOTE(random_state=SEED)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


## Tensors ##

In [85]:

# === Convert NumPy arrays to tensors ===
# Masks first (single-input form of to_tensor), then feature/label pairs.
mask_train_tensor = to_tensor(mask_train)
mask_val_tensor = to_tensor(mask_val)
mask_test_tensor = to_tensor(mask_test)

X_train_tensor, y_train_tensor = to_tensor(X_train, y_train)
X_val_tensor, y_val_tensor = to_tensor(X_val, y_val)
X_test_tensor, y_test_tensor = to_tensor(X_test, y_test)

# === Wrap in datasets ===
train_dataset = MaskedDataset(X_train_tensor, mask_train_tensor, y_train_tensor)
val_dataset = MaskedDataset(X_val_tensor, mask_val_tensor, y_val_tensor)
test_dataset = MaskedDataset(X_test_tensor, mask_test_tensor, y_test_tensor)

# === Define loaders ===
USE_FULL_BATCH = False  # Set to True if you want full-batch training

# Mini-batches of 128 by default; one batch spanning the whole training set
# when full-batch mode is switched on.
BATCH_SIZE = 128
if USE_FULL_BATCH:
    BATCH_SIZE = len(train_dataset)

# Only the training loader shuffles; validation and test are each served as
# one batch covering the entire split.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=len(val_dataset), shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)

# Model training #

In [86]:
# Pick the device once and pass it to every call below. Falling back to CPU
# keeps the cell runnable on machines without CUDA; previously 'cuda' was
# hard-coded and find_best_threshold relied on its own 'cuda' default.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

model = MaskedMLP(input_dim=X_train_tensor.shape[1])
trained_model = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    device=DEVICE,
    criterion=torch.nn.BCEWithLogitsLoss(),
    optimizer=torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.0),
)

# Threshold is tuned on the validation split, then reused for train and test.
best_threshold = find_best_threshold(trained_model, val_loader, device=DEVICE, beta=0.5)

train_loss = evaluate_model(trained_model, train_loader, threshold=best_threshold, device=DEVICE, label="Train")
test_loss = evaluate_model(trained_model, test_loader, threshold=best_threshold, device=DEVICE, label="Test")

print(f"\n🧠 Generalization Gap (|Train - Test|): {abs(train_loss - test_loss):.6f}")


Epoch 1/500 - Val Loss: 0.1679
Epoch 2/500 - Val Loss: 0.1591
Epoch 3/500 - Val Loss: 0.1577
Epoch 4/500 - Val Loss: 0.1547
Epoch 5/500 - Val Loss: 0.1551
Epoch 6/500 - Val Loss: 0.1530
Epoch 7/500 - Val Loss: 0.1538
Epoch 8/500 - Val Loss: 0.1583
Epoch 9/500 - Val Loss: 0.1523
Epoch 10/500 - Val Loss: 0.1524
Epoch 11/500 - Val Loss: 0.1516
Epoch 12/500 - Val Loss: 0.1503
Epoch 13/500 - Val Loss: 0.1507
Epoch 14/500 - Val Loss: 0.1513
Epoch 15/500 - Val Loss: 0.1510
Epoch 16/500 - Val Loss: 0.1510
Epoch 17/500 - Val Loss: 0.1486
Epoch 18/500 - Val Loss: 0.1482
Epoch 19/500 - Val Loss: 0.1504
Epoch 20/500 - Val Loss: 0.1476
Epoch 21/500 - Val Loss: 0.1474
Epoch 22/500 - Val Loss: 0.1490
Epoch 23/500 - Val Loss: 0.1476
Epoch 24/500 - Val Loss: 0.1476
Epoch 25/500 - Val Loss: 0.1484
Epoch 26/500 - Val Loss: 0.1470
Epoch 27/500 - Val Loss: 0.1514
Epoch 28/500 - Val Loss: 0.1467
Epoch 29/500 - Val Loss: 0.1487
Epoch 30/500 - Val Loss: 0.1474
Epoch 31/500 - Val Loss: 0.1472
Epoch 32/500 - Va