In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_fscore_support
import numpy as np
import pandas as pd
from data_utils import load_data, select_and_clean, encode_and_split

# --- Data preparation ---------------------------------------------------------
# load_data, select_and_clean and encode_and_split are imported from
# data_utils; their internals are not visible in this notebook.
DATA_PATH = './data/accepted_2007_to_2018.csv'
df = load_data(DATA_PATH, nrows=200000)
proc = select_and_clean(df)
X_train, X_test, y_train, y_test, scaler = encode_and_split(proc, test_size=0.2)


def _as_float_tensor(pandas_obj):
    """Copy ``pandas_obj.values`` into a new float32 torch tensor."""
    return torch.tensor(pandas_obj.values, dtype=torch.float32)


def _as_label_column(labels):
    """Return the labels as a float32 tensor with an extra axis at position 1."""
    return _as_float_tensor(labels).unsqueeze(1)


# Convert to torch tensors. Labels get the extra axis so they have the same
# layout as the (batch, 1) output produced by the final Linear(64, 1) of MLP.
X_train_t = _as_float_tensor(X_train)
y_train_t = _as_label_column(y_train)
X_test_t = _as_float_tensor(X_test)
y_test_t = _as_label_column(y_test)

# Mini-batches of 256 samples, reshuffled every epoch.
train_ds = TensorDataset(X_train_t, y_train_t)
train_loader = DataLoader(dataset=train_ds, batch_size=256, shuffle=True)

class MLP(nn.Module):
    """Small fully connected binary classifier.

    Architecture: ``input_dim -> 128 -> 64 -> 1`` with ReLU activations,
    dropout (p=0.2) after the first hidden layer, and a sigmoid on the output
    so the network emits values in (0, 1).

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # The position of each module inside the Sequential determines the
        # state-dict keys (net.0, net.3, net.5), so the order must not change.
        stack = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the sigmoid output."""
        probabilities = self.net(x)
        return probabilities

# Seed PyTorch's global RNG before any randomness is consumed. Weight
# initialisation, the DataLoader's shuffle order and the dropout masks all
# draw from it, so without a seed every run reports different metrics.
SEED = 42
torch.manual_seed(SEED)

model = MLP(X_train.shape[1])
opt = optim.Adam(model.parameters(), lr=1e-3)
# BCELoss expects probabilities, which is what MLP emits (it ends in Sigmoid).
loss_fn = nn.BCELoss()

# Training loop (minimal)
N_EPOCHS = 5
for epoch in range(N_EPOCHS):
    model.train()  # enable dropout
    total_loss = 0.0
    for xb, yb in train_loader:
        opt.zero_grad()
        batch_probs = model(xb)
        loss = loss_fn(batch_probs, yb)
        loss.backward()
        opt.step()
        # loss is a per-batch mean; weight it by the batch size so the epoch
        # figure is a per-sample average even if the last batch is smaller.
        total_loss += loss.item() * xb.size(0)
    print(f'Epoch {epoch+1} loss = {total_loss/len(train_ds):.4f}')

# Evaluation
model.eval()  # disable dropout
with torch.no_grad():
    # Only the forward pass needs no_grad; the metrics below are pure NumPy.
    probs = model(X_test_t).numpy().ravel()

# Hard predictions at the conventional 0.5 cut-off.
preds = (probs >= 0.5).astype(int)
auc = roc_auc_score(y_test.values, probs)
f1 = f1_score(y_test.values, preds)
print('AUC:', auc)
print('F1:', f1)



In [None]:
# Load the grid-search best model and evaluate with threshold sweep
import json
from pathlib import Path
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, f1_score

PROJECT_ROOT = Path('.') / ''
MODEL_DIR = Path('models')
DATA_DIR = Path('data')
metrics_path = MODEL_DIR / 'grid_best_metrics.json'
state_path = MODEL_DIR / 'mlp_grid_best.pth'

# Raise explicitly rather than `assert`: assertions are stripped when Python
# runs with -O, and the checkpoint check further down in this cell also
# raises FileNotFoundError, so both missing-artifact cases now fail alike.
if not metrics_path.exists():
    raise FileNotFoundError('grid_best_metrics.json not found. Run grid search first.')
best = json.loads(metrics_path.read_text())

# The configuration is read from 'best_config' when that key is present,
# otherwise from 'config'; if neither exists the defaults below apply.
cfg = best['best_config'] if 'best_config' in best else best.get('config', {})
hidden = tuple(cfg.get('hidden_dims', [256,128]))
dropout = cfg.get('dropout', 0.2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# load test set
test_path = DATA_DIR / 'test.parquet'
if not test_path.exists():
    # NOTE(review): fallback file. Whether processed_sample.parquet is a
    # held-out split is not visible here -- confirm before treating metrics
    # computed on it as test metrics.
    test_path = DATA_DIR / 'processed_sample.parquet'
df_test = pd.read_parquet(test_path)

# Every column except 'target' is treated as a model input feature.
feat_cols = [c for c in df_test.columns if c != 'target']
X_test = df_test[feat_cols].to_numpy(dtype=np.float32)
y_test = df_test['target'].to_numpy(dtype=np.int32)

class MLPGrid(nn.Module):
    """Configurable MLP that outputs one raw logit per sample.

    Each hidden width ``h`` contributes the sequence
    ``Linear -> BatchNorm1d -> ReLU -> Dropout``; a final ``Linear(prev, 1)``
    produces the logit. No sigmoid is applied inside the module.

    Args:
        input_dim: number of input features per sample.
        hidden_dims: iterable of hidden-layer widths, applied in order.
        dropout: dropout probability used after every hidden layer.
    """

    def __init__(self, input_dim, hidden_dims=(256,128), dropout=0.2):
        super().__init__()
        # widths[i] -> widths[i + 1] describes the i-th hidden Linear layer.
        widths = [input_dim, *hidden_dims]
        modules = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        # Output head: a single logit fed by the last hidden width (or by the
        # raw input when no hidden layers were requested).
        modules.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*modules)

    def forward(self, x):
        """Return the logits for ``x`` with the trailing size-1 axis removed."""
        logits = self.net(x)
        return logits.squeeze(-1)

model = MLPGrid(input_dim=len(feat_cols), hidden_dims=hidden, dropout=dropout).to(device)
if not state_path.exists():
    raise FileNotFoundError('Saved model not found at ' + str(state_path))
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code. Only load checkpoints from a trusted source, or pass
# weights_only=True on PyTorch versions that support it.
model.load_state_dict(torch.load(state_path, map_location=device))

model.eval()  # BatchNorm uses running statistics, dropout is disabled
with torch.no_grad():
    Xb = torch.tensor(X_test, dtype=torch.float32).to(device)
    logits_t = model(Xb)
    logits = logits_t.cpu().numpy()
    # torch.sigmoid is numerically stable. The previous hand-rolled
    # 1 / (1 + exp(-logits)) overflows in float32 for strongly negative
    # logits and emits a RuntimeWarning.
    probs = torch.sigmoid(logits_t).cpu().numpy()

# compute AUC and sweep threshold for best F1 on test
# NOTE(review): the threshold is selected on the same labels the F1 is
# reported on, so the reported "best test F1" is an optimistic estimate.
# Tune the threshold on a validation split if an unbiased figure is needed.
auc = roc_auc_score(y_test, probs)
thresholds = np.linspace(0.01, 0.99, 99)
best_f1 = -1.0
best_thr = 0.5
for thr in thresholds:
    preds = (probs >= thr).astype(int)
    f1 = f1_score(y_test, preds)
    # Strict '>' keeps the lowest threshold when several tie on F1.
    if f1 > best_f1:
        best_f1 = f1
        best_thr = float(thr)

print('Loaded best config:', cfg)
print(f'Test AUC: {auc:.4f}, Best test F1: {best_f1:.4f} at threshold={best_thr:.3f}')

# save final metrics
out = {'test_auc': float(auc), 'test_f1': float(best_f1), 'best_threshold': best_thr, 'config': cfg}
(MODEL_DIR / 'final_best_metrics.json').write_text(json.dumps(out, indent=2))
print('Saved final metrics to', MODEL_DIR / 'final_best_metrics.json')