In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from scipy.stats import ttest_ind
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
import xgboost as xgb


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def load_data(npz_paths: str, data_type: str):
    if data_type not in ["X_mean", "X_max", "X_concat"]:
        raise Exception("data type in valid")

    X_list = []
    y_list = []

    for npz_path in npz_paths:
    
        base = os.path.splitext(os.path.basename(npz_path))[0]      
        csv_path = os.path.join(
            os.path.dirname(npz_path),
            base + "_meta.csv"                                       
        )


        data = np.load(npz_path, allow_pickle=True)
        X = data[data_type]      # (N_docs, 2*D)
        # X_concat = data["X_mean"]
        tids = data["transcriptids"]    


        meta = pd.read_csv(csv_path)

        meta_unique = (
            meta[["transcriptid", "SUESCORE", "label"]]
            .drop_duplicates(subset="transcriptid", keep="first")
            .set_index("transcriptid")
        )

        mask_ids = np.isin(tids, meta_unique.index)
        X_filt = X[mask_ids]
        tids_filt = np.array(tids)[mask_ids]


        lab_df = meta.assign(
            label=lambda df: df.SUESCORE.map(
                lambda s: 1 if s >= 0.5 else (0 if s <= -0.5 else np.nan)
            )
        )
        mask_label = lab_df.label.notna().values
        # apply the same mask in the same order as the CSV, so we use .loc on lab_df
        # but first filter lab_df to only those transcriptids in tids_filt
        Xc, y = X_filt[mask_label], meta.loc[mask_label, "label"].astype(int).values
        
        # now align X and y
        # X_final = X_filt[lab_sub.label.notna()]
        # y_final = lab_sub.label.astype(int).values

        # collect
        X_list.append(Xc)
        y_list.append(y)

    # 2. concatenate all files together
    Xc = np.vstack(X_list)   # shape: (sum_i N_i, 2*D)
    y  = np.concatenate(y_list)  # shape: (sum_i N_i,)

    print("Combined Xc shape:", Xc.shape)
    print("Combined y shape: ", y.shape)

    return Xc, y

In [4]:
def downsample_balance(Xc_unbalanced, y_unbalanced):
    # forced resampling
    idx0 = np.where(y_unbalanced == 0)[0]
    idx1 = np.where(y_unbalanced == 1)[0]

    n = min(len(idx0), len(idx1))

    sel0 = np.random.choice(idx0, size=n, replace=False)
    sel1 = np.random.choice(idx1, size=n, replace=False)

    sel = np.concatenate([sel0, sel1])
    np.random.shuffle(sel)

    # slice out your balanced subset
    Xc_out = Xc_unbalanced[sel]
    y_out = y_unbalanced[sel]

    print("Balanced X shape:", Xc_out.shape)
    print("Balanced y counts:", np.bincount(y_out))
    return Xc_out, y_out

In [5]:
train_npz_paths = [
    "./data/doc_features/transcript_componenttext_2008_1_features.npz",
    "./data/doc_features/transcript_componenttext_2008_2_features.npz",
    "./data/doc_features/transcript_componenttext_2009_1_features.npz",
    "./data/doc_features/transcript_componenttext_2009_2_features.npz",
    "./data/doc_features/transcript_componenttext_2010_1_features.npz",
    "./data/doc_features/transcript_componenttext_2010_2_features.npz",
    "./data/doc_features/transcript_componenttext_2011_1_features.npz",
    "./data/doc_features/transcript_componenttext_2011_2_features.npz",
    "./data/doc_features/transcript_componenttext_2012_1_features.npz",
    "./data/doc_features/transcript_componenttext_2012_2_features.npz",
    "./data/doc_features/transcript_componenttext_2013_1_features.npz",
]

val_npz_paths = [
    "./data/doc_features/transcript_componenttext_2013_2_features.npz",
    "./data/doc_features/transcript_componenttext_2014_1_features.npz",
]

test_npz_paths = [
    "./data/doc_features/transcript_componenttext_2014_2_features.npz",
    "./data/doc_features/transcript_componenttext_2015_1_features.npz",
]


In [6]:
Xc, y = load_data(train_npz_paths, "X_mean")
X_val_all_feat, y_val = load_data(val_npz_paths, "X_mean")
X_test_all_feat, y_test = load_data(test_npz_paths, "X_mean")

Combined Xc shape: (10209, 16384)
Combined y shape:  (10209,)
Combined Xc shape: (2473, 16384)
Combined y shape:  (2473,)
Combined Xc shape: (2508, 16384)
Combined y shape:  (2508,)


In [7]:
# optional downsampling for balancing data
Xc, y = downsample_balance(Xc, y)
X_val_all_feat, y_val = downsample_balance(X_val_all_feat, y_val)
X_test_all_feat, y_test = downsample_balance(X_test_all_feat, y_test)

Balanced X shape: (4526, 16384)
Balanced y counts: [2263 2263]
Balanced X shape: (1108, 16384)
Balanced y counts: [554 554]
Balanced X shape: (1008, 16384)
Balanced y counts: [504 504]


In [17]:


# meta = meta.assign(label=lambda df: df.SUESCORE.map(lambda s: 1 if s>=0.5 else (0 if s<=-0.5 else np.nan)))
# mask = meta.label.notna().values
# Xc, y = Xc_aligned[mask], meta.loc[mask, "label"].astype(int).values

D2 = Xc.shape[1]
D = D2 // 2
X_pos, X_neg = Xc[y==1], Xc[y==0]
t_stats = np.abs((X_pos.mean(0) - X_neg.mean(0)) /
                 np.sqrt(X_pos.var(0)/len(X_pos) + X_neg.var(0)/len(X_neg)))

ranked_idx = np.argsort(-t_stats)


for rank, idx in enumerate(ranked_idx[:512], start=1):
    # print(idx)
    part = "mean" if idx < D else "max"
    # print(idx)
    # print(D)
    feat_id = idx if idx < D else idx-D
    t_val   = t_stats[idx]
    print(f"Rank {rank:2d}: {part!r} feature #{feat_id} (t = {t_val:.2f})")



Rank  1: 'max' feature #1053 (t = 10.56)
Rank  2: 'mean' feature #3147 (t = 10.34)
Rank  3: 'max' feature #7429 (t = 10.16)
Rank  4: 'max' feature #2232 (t = 9.96)
Rank  5: 'mean' feature #6470 (t = 9.55)
Rank  6: 'max' feature #4673 (t = 9.49)
Rank  7: 'mean' feature #2775 (t = 9.40)
Rank  8: 'mean' feature #482 (t = 9.35)
Rank  9: 'max' feature #4926 (t = 9.26)
Rank 10: 'max' feature #6178 (t = 9.24)
Rank 11: 'mean' feature #2488 (t = 9.13)
Rank 12: 'mean' feature #5612 (t = 9.00)
Rank 13: 'mean' feature #2366 (t = 8.93)
Rank 14: 'max' feature #4107 (t = 8.88)
Rank 15: 'mean' feature #1520 (t = 8.82)
Rank 16: 'mean' feature #1712 (t = 8.79)
Rank 17: 'max' feature #5685 (t = 8.79)
Rank 18: 'mean' feature #1390 (t = 8.63)
Rank 19: 'mean' feature #2281 (t = 8.57)
Rank 20: 'mean' feature #5553 (t = 8.48)
Rank 21: 'max' feature #1848 (t = 8.46)
Rank 22: 'max' feature #1543 (t = 8.33)
Rank 23: 'max' feature #631 (t = 8.31)
Rank 24: 'max' feature #4471 (t = 8.30)
Rank 25: 'mean' feature #77

  t_stats = np.abs((X_pos.mean(0) - X_neg.mean(0)) /


In [20]:
top_idx = ranked_idx[:2000]
X_test = X_test_all_feat[:, top_idx]
X_val = X_val_all_feat[:, top_idx]
X_top = Xc[:, top_idx]      

X_train = X_top
y_train = y


# X_test = X_test_all_feat[:, :]
# X_val = X_val_all_feat[:, :]
# X_top = Xc[:, :]      

# X_train = X_top
# y_train = y

In [10]:
X_train.shape

(4526, 16384)

In [22]:
# 5) Train with L1 logistic regression & balanced class weights
clf = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty="l1",
        solver="saga",
        # class_weight="balanced",
        C=1.0,
        max_iter=2000,
        random_state=42
    )
)
clf.fit(X_train, y_train)
clf.fit(X_train, y_train)

# 6) Evaluate
y_pred   = clf.predict(X_test)
y_probs  = clf.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_probs))

# 7) Inspect which of your top-1000 actually got nonzero weights
lr = clf.named_steps["logisticregression"]
coefs = lr.coef_.ravel()
nz    = np.where(coefs != 0)[0]





              precision    recall  f1-score   support

           0       0.56      0.56      0.56       504
           1       0.56      0.57      0.56       504

    accuracy                           0.56      1008
   macro avg       0.56      0.56      0.56      1008
weighted avg       0.56      0.56      0.56      1008

ROC AUC: 0.5706294091710759




In [19]:

param_grid = {"logisticregression__C": [0.01, 0.1, 1, 10, 100]}

pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty="l2",
        # solver="saga",    
        solver="liblinear",    
        # class_weight="balanced",
        max_iter=7000,
        random_state=42
    )
)

search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)
search.fit(X_train, y_train)

print("Best C (inverse reg. strength):", search.best_params_["logisticregression__C"])
print("CV ROC AUC:", search.best_score_)


best_clf = search.best_estimator_
y_pred_probs = best_clf.predict_proba(X_test)[:, 1]
y_pred       = best_clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_pred_probs))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best C (inverse reg. strength): 0.01
CV ROC AUC: 0.669798985855862
              precision    recall  f1-score   support

           0       0.57      0.58      0.58       504
           1       0.57      0.56      0.57       504

    accuracy                           0.57      1008
   macro avg       0.57      0.57      0.57      1008
weighted avg       0.57      0.57      0.57      1008

Test ROC AUC: 0.5867150100781053


In [21]:

param_grid = {"logisticregression__C": [0.01, 0.1, 1, 10, 100]}

pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty="l2",
        solver="saga",    
        # solver="liblinear",    
        # class_weight="balanced",
        max_iter=7000,
        random_state=42
    )
)

search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)
search.fit(X_train, y_train)

print("Best C (inverse reg. strength):", search.best_params_["logisticregression__C"])
print("CV ROC AUC:", search.best_score_)


best_clf = search.best_estimator_
y_pred_probs = best_clf.predict_proba(X_test)[:, 1]
y_pred       = best_clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_pred_probs))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best C (inverse reg. strength): 0.01
CV ROC AUC: 0.6844055404425369
              precision    recall  f1-score   support

           0       0.59      0.61      0.60       504
           1       0.59      0.58      0.59       504

    accuracy                           0.59      1008
   macro avg       0.59      0.59      0.59      1008
weighted avg       0.59      0.59      0.59      1008

Test ROC AUC: 0.5942263479465861


In [21]:

param_grid = {"logisticregression__C": [0.01, 0.1, 1, 10, 100]}

pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty="l2",
        # solver="saga",    
        solver="liblinear",    
        # class_weight="balanced",
        max_iter=7000,
        random_state=42
    )
)

search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)
search.fit(X_train, y_train)

print("Best C (inverse reg. strength):", search.best_params_["logisticregression__C"])
print("CV ROC AUC:", search.best_score_)


best_clf = search.best_estimator_
y_pred_probs = best_clf.predict_proba(X_test)[:, 1]
y_pred       = best_clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_pred_probs))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


: 

In [None]:

X_train_t = torch.from_numpy(X_train).float().to(device)
y_train_t = torch.from_numpy(y_train).float().unsqueeze(1).to(device)
X_test_t  = torch.from_numpy(X_test).float().to(device)
y_test_t  = torch.from_numpy(y_test).float().unsqueeze(1).to(device)

# DataLoader
train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, drop_last=True)


class ShallowMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim=256, dropout=0.5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

model = ShallowMLP(input_dim=X_top.shape[1]).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
criterion = nn.BCELoss()

# 4) Training loop
n_epochs = 30
for epoch in range(1, n_epochs+1):
    model.train()
    total_loss = 0.0
    for xb, yb in train_dl:
        pred = model(xb)
        loss = criterion(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    avg_loss = total_loss / len(train_dl.dataset)
    print(f"Epoch {epoch:2d}: train loss = {avg_loss:.4f}")

# 5) Evaluation
model.eval()
with torch.no_grad():
    y_prob = model(X_test_t).cpu().numpy().flatten()
    y_pred = (y_prob >= 0.5).astype(int)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_prob))

Epoch  1: train loss = 0.4883
Epoch  2: train loss = 0.4326
Epoch  3: train loss = 0.4096
Epoch  4: train loss = 0.3909
Epoch  5: train loss = 0.3716
Epoch  6: train loss = 0.3501
Epoch  7: train loss = 0.3382
Epoch  8: train loss = 0.3268
Epoch  9: train loss = 0.3035
Epoch 10: train loss = 0.2902
Epoch 11: train loss = 0.2670
Epoch 12: train loss = 0.2486
Epoch 13: train loss = 0.2466
Epoch 14: train loss = 0.2280
Epoch 15: train loss = 0.2109
Epoch 16: train loss = 0.1982
Epoch 17: train loss = 0.1960
Epoch 18: train loss = 0.1869
Epoch 19: train loss = 0.1776
Epoch 20: train loss = 0.1740
Epoch 21: train loss = 0.1530
Epoch 22: train loss = 0.1395
Epoch 23: train loss = 0.1286
Epoch 24: train loss = 0.1288
Epoch 25: train loss = 0.1171
Epoch 26: train loss = 0.1268
Epoch 27: train loss = 0.1193
Epoch 28: train loss = 0.1028
Epoch 29: train loss = 0.1005
Epoch 30: train loss = 0.1140

Classification Report:
              precision    recall  f1-score   support

           0       0.

: 

In [25]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest  = xgb.DMatrix(X_test,  label=y_test)


scale_pos_weight = float((y_train == 0).sum()) / (y_train == 1).sum()


params = {
    "objective":        "binary:logistic",
    "eval_metric":      "auc",
    "scale_pos_weight": scale_pos_weight,
    "tree_method":      "hist",       
    "grow_policy":      "lossguide",  
    "max_depth":        6,
    "learning_rate":    0.1,
    "subsample":        0.8,
    "colsample_bytree": 0.8,
    "random_state":     42,
    "verbosity":        1
}


cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=1000,
    nfold=5,
    early_stopping_rounds=20,
    metrics="auc",
    seed=42,
    as_pandas=True,
    verbose_eval=50
)
best_rounds = len(cv_results)
print(f"Optimal boosting rounds: {best_rounds}")


bst = xgb.train(
    params,
    dtrain,
    num_boost_round=best_rounds
)


y_prob = bst.predict(dtest)
y_pred = (y_prob >= 0.5).astype(int)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_prob))

[0]	train-auc:0.79091+0.01652	test-auc:0.58206+0.02945
[50]	train-auc:1.00000+0.00000	test-auc:0.69692+0.02358
[100]	train-auc:1.00000+0.00000	test-auc:0.70410+0.02146
[150]	train-auc:1.00000+0.00000	test-auc:0.71121+0.01732
[200]	train-auc:1.00000+0.00000	test-auc:0.71543+0.01728
[206]	train-auc:1.00000+0.00000	test-auc:0.71560+0.01736
Optimal boosting rounds: 188

Classification Report:
              precision    recall  f1-score   support

           0       0.30      0.18      0.22       222
           1       0.79      0.89      0.84       793

    accuracy                           0.73      1015
   macro avg       0.55      0.53      0.53      1015
weighted avg       0.69      0.73      0.70      1015

Test ROC AUC: 0.5895618190700158


In [16]:
# 3) Convert to tensors and move to device
def to_tensor(x, y):
    xt = torch.from_numpy(x).float().to(device)
    yt = torch.from_numpy(y).float().unsqueeze(1).to(device)
    return xt, yt

X_tr_t, y_tr_t = to_tensor(X_train, y_train)
X_val_t, y_val_t = to_tensor(X_val, y_val)
X_test_t, y_test_t = to_tensor(X_test, y_test)

# 4) DataLoaders
batch_size = 8
train_ds = TensorDataset(X_tr_t, y_tr_t)
val_ds   = TensorDataset(X_val_t, y_val_t)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)
val_dl   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False)

# 5) Model definition
class ShallowMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim=256, dropout=0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

model = ShallowMLP(input_dim=X_train.shape[1]).to(device)

# 6) Optimizer and loss
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6, weight_decay=1e-5)
criterion = nn.BCELoss()

# 7) Training with validation
n_epochs = 100
for epoch in range(1, n_epochs + 1):
    # Training
    model.train()
    total_tr_loss = 0.0
    for xb, yb in train_dl:
        optimizer.zero_grad()
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        total_tr_loss += loss.item() * xb.size(0)
    avg_tr_loss = total_tr_loss / len(train_dl.dataset)

    # Validation
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for xb, yb in val_dl:
            pred = model(xb)
            loss = criterion(pred, yb)
            total_val_loss += loss.item() * xb.size(0)
    avg_val_loss = total_val_loss / len(val_dl.dataset)

    print(f"Epoch {epoch:2d}: train_loss = {avg_tr_loss:.4f}, val_loss = {avg_val_loss:.4f}")

# 8) Final evaluation on test set
model.eval()
with torch.no_grad():
    y_prob = model(X_test_t).cpu().numpy().flatten()
    y_pred = (y_prob >= 0.5).astype(int)

print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_prob))


Epoch  1: train_loss = 0.6948, val_loss = 0.6883
Epoch  2: train_loss = 0.6825, val_loss = 0.6822
Epoch  3: train_loss = 0.6719, val_loss = 0.6773
Epoch  4: train_loss = 0.6742, val_loss = 0.6804
Epoch  5: train_loss = 0.6629, val_loss = 0.6770
Epoch  6: train_loss = 0.6625, val_loss = 0.6746
Epoch  7: train_loss = 0.6618, val_loss = 0.6742
Epoch  8: train_loss = 0.6602, val_loss = 0.6722
Epoch  9: train_loss = 0.6564, val_loss = 0.6716
Epoch 10: train_loss = 0.6513, val_loss = 0.6738
Epoch 11: train_loss = 0.6584, val_loss = 0.6708
Epoch 12: train_loss = 0.6521, val_loss = 0.6695
Epoch 13: train_loss = 0.6503, val_loss = 0.6709
Epoch 14: train_loss = 0.6491, val_loss = 0.6687
Epoch 15: train_loss = 0.6484, val_loss = 0.6727
Epoch 16: train_loss = 0.6481, val_loss = 0.6679
Epoch 17: train_loss = 0.6476, val_loss = 0.6667
Epoch 18: train_loss = 0.6468, val_loss = 0.6673
Epoch 19: train_loss = 0.6438, val_loss = 0.6681
Epoch 20: train_loss = 0.6462, val_loss = 0.6681
Epoch 21: train_loss

In [108]:
# 3) Convert to tensors and move to device
def to_tensor(x, y):
    xt = torch.from_numpy(x).float().to(device)
    yt = torch.from_numpy(y).float().unsqueeze(1).to(device)
    return xt, yt

X_tr_t, y_tr_t = to_tensor(X_train, y_train)
X_val_t, y_val_t = to_tensor(X_val, y_val)
X_test_t, y_test_t = to_tensor(X_test, y_test)

# 4) DataLoaders
batch_size = 8
train_ds = TensorDataset(X_tr_t, y_tr_t)
val_ds   = TensorDataset(X_val_t, y_val_t)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)
val_dl   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False)


pos = np.sum(y_train == 1)
neg = np.sum(y_train == 0)
pos_weight = torch.tensor(neg / pos, dtype=torch.float).to(device)


# 5) Model definition
class ShallowMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, dropout=0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

model = ShallowMLP(input_dim=X_train.shape[1]).to(device)

# 6) Optimizer and loss
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=1e-5)
# criterion = nn.BCELoss()
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)


# 7) Training with validation
n_epochs = 120
for epoch in range(1, n_epochs + 1):
    # Training
    model.train()
    total_tr_loss = 0.0
    for xb, yb in train_dl:
        optimizer.zero_grad()
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        total_tr_loss += loss.item() * xb.size(0)
    avg_tr_loss = total_tr_loss / len(train_dl.dataset)

    # Validation
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for xb, yb in val_dl:
            pred = model(xb)
            loss = criterion(pred, yb)
            total_val_loss += loss.item() * xb.size(0)
    avg_val_loss = total_val_loss / len(val_dl.dataset)

    print(f"Epoch {epoch:2d}: train_loss = {avg_tr_loss:.4f}, val_loss = {avg_val_loss:.4f}")

# 8) Final evaluation on test set
model.eval()
with torch.no_grad():
    y_prob = model(X_test_t).cpu().numpy().flatten()
    y_pred = (y_prob >= 0.5).astype(int)

print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_prob))

Epoch  1: train_loss = 0.2867, val_loss = 0.2971
Epoch  2: train_loss = 0.2781, val_loss = 0.2948
Epoch  3: train_loss = 0.2746, val_loss = 0.2948
Epoch  4: train_loss = 0.2709, val_loss = 0.2941
Epoch  5: train_loss = 0.2699, val_loss = 0.2941
Epoch  6: train_loss = 0.2676, val_loss = 0.2926
Epoch  7: train_loss = 0.2661, val_loss = 0.2921
Epoch  8: train_loss = 0.2664, val_loss = 0.2928
Epoch  9: train_loss = 0.2647, val_loss = 0.2922
Epoch 10: train_loss = 0.2639, val_loss = 0.2918
Epoch 11: train_loss = 0.2616, val_loss = 0.2929
Epoch 12: train_loss = 0.2608, val_loss = 0.2946
Epoch 13: train_loss = 0.2593, val_loss = 0.2928
Epoch 14: train_loss = 0.2595, val_loss = 0.2940
Epoch 15: train_loss = 0.2578, val_loss = 0.2954
Epoch 16: train_loss = 0.2572, val_loss = 0.2954
Epoch 17: train_loss = 0.2563, val_loss = 0.2979
Epoch 18: train_loss = 0.2554, val_loss = 0.2972
Epoch 19: train_loss = 0.2561, val_loss = 0.2954
Epoch 20: train_loss = 0.2542, val_loss = 0.2954
Epoch 21: train_loss

KeyboardInterrupt: 

In [15]:
# 3 layer mlp

def to_tensor_dataset(X, y):
    Xt = torch.from_numpy(X).float().to(device)
    yt = torch.from_numpy(y).float().unsqueeze(1).to(device)
    return TensorDataset(Xt, yt)

train_ds = to_tensor_dataset(X_train, y_train)
val_ds   = to_tensor_dataset(X_val, y_val)
test_ds  = to_tensor_dataset(X_test, y_test)

batch_size = 32
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)
val_dl   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False)
test_dl  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False)

# 3) Define the 3-layer MLP
class ThreeLayerMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim=256, dropout=0.5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),

            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),

            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),

            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)

model = ThreeLayerMLP(input_dim=X_train.shape[1]).to(device)

# 4) Optimizer and loss
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6, weight_decay=1e-5)
criterion = nn.BCELoss()

# 5) Training & Validation Loop
n_epochs = 300
best_val_loss = float('inf')

for epoch in range(1, n_epochs+1):
    # -- Training
    model.train()
    train_loss = 0.0
    for xb, yb in train_dl:
        preds = model(xb)
        loss = criterion(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * xb.size(0)
    train_loss /= len(train_dl.dataset)

    # -- Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for xb, yb in val_dl:
            preds = model(xb)
            loss = criterion(preds, yb)
            val_loss += loss.item() * xb.size(0)
    val_loss /= len(val_dl.dataset)

    print(f"Epoch {epoch:2d}  Train Loss: {train_loss:.4f}  Val Loss: {val_loss:.4f}")

    # Optional: save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pth")

# 6) Load best model and test evaluation
model.load_state_dict(torch.load("best_model.pth"))
model.eval()

y_probs = []
y_true  = []
with torch.no_grad():
    for xb, yb in test_dl:
        probs = model(xb)
        y_probs.extend(probs.cpu().numpy().flatten().tolist())
        y_true .extend(yb.cpu().numpy().flatten().tolist())

y_pred = (np.array(y_probs) >= 0.5).astype(int)

print("\nTest Classification Report:")
print(classification_report(y_true, y_pred))
print("Test ROC AUC:", roc_auc_score(y_true, y_probs))


Epoch  1  Train Loss: 0.7216  Val Loss: 0.6941
Epoch  2  Train Loss: 0.7179  Val Loss: 0.6926
Epoch  3  Train Loss: 0.7176  Val Loss: 0.6924
Epoch  4  Train Loss: 0.7203  Val Loss: 0.6922
Epoch  5  Train Loss: 0.7219  Val Loss: 0.6922
Epoch  6  Train Loss: 0.7187  Val Loss: 0.6913
Epoch  7  Train Loss: 0.7173  Val Loss: 0.6910
Epoch  8  Train Loss: 0.7154  Val Loss: 0.6907
Epoch  9  Train Loss: 0.7208  Val Loss: 0.6906
Epoch 10  Train Loss: 0.7191  Val Loss: 0.6903
Epoch 11  Train Loss: 0.7139  Val Loss: 0.6901
Epoch 12  Train Loss: 0.7100  Val Loss: 0.6900
Epoch 13  Train Loss: 0.7194  Val Loss: 0.6886
Epoch 14  Train Loss: 0.7128  Val Loss: 0.6889
Epoch 15  Train Loss: 0.7158  Val Loss: 0.6890
Epoch 16  Train Loss: 0.7144  Val Loss: 0.6886
Epoch 17  Train Loss: 0.7136  Val Loss: 0.6879
Epoch 18  Train Loss: 0.7173  Val Loss: 0.6879
Epoch 19  Train Loss: 0.7124  Val Loss: 0.6867
Epoch 20  Train Loss: 0.7065  Val Loss: 0.6871
Epoch 21  Train Loss: 0.7079  Val Loss: 0.6880
Epoch 22  Tra