In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from scipy import sparse
from scipy.stats import ttest_ind
from sklearn.datasets import dump_svmlight_file
from sklearn.feature_selection import (
    SelectKBest,
    f_classif,
    mutual_info_classif,
    SelectFromModel,
    RFE
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix
import xgboost as xgb


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def load_data(npz_paths: str, data_type: str, threshold=0.5, center=0):
    upper_threshold = threshold + center
    lower_threshold = -threshold + center
    if data_type not in ["X_mean", "X_max", "X_concat"]:
        raise Exception("data type in valid")

    X_list = []
    y_list = []

    for npz_path in npz_paths:
    
        base = os.path.splitext(os.path.basename(npz_path))[0]      
        csv_path = os.path.join(
            os.path.dirname(npz_path),
            base + "_meta.csv"                                       
        )


        data = np.load(npz_path, allow_pickle=True)
        X = data[data_type]      # (N_docs, 2*D)
        # X_concat = data["X_mean"]
        tids = data["transcriptids"]    


        meta = pd.read_csv(csv_path)

        meta_unique = (
            meta[["transcriptid", "SUESCORE", "label"]]
            .drop_duplicates(subset="transcriptid", keep="first")
            .set_index("transcriptid")
        )

        mask_ids = np.isin(tids, meta_unique.index)
        X_filt = X[mask_ids]
        tids_filt = np.array(tids)[mask_ids]


        lab_df = meta.assign(
            label=lambda df: df.SUESCORE.map(
                lambda s: 1 if s >= upper_threshold else (0 if s <= lower_threshold else np.nan)
            )
        )

        # meta['label'] = np.nan

        # set label = 1 where SUESCORE > threshold
        meta.loc[meta['SUESCORE'] >= upper_threshold, 'label'] = 1

        # set label = 0 where SUESCORE < -threshold
        meta.loc[meta['SUESCORE'] <= lower_threshold, 'label'] = 0

        mask_label = lab_df.label.notna().values
        # apply the same mask in the same order as the CSV, so we use .loc on lab_df
        # but first filter lab_df to only those transcriptids in tids_filt
        Xc, y = X_filt[mask_label], meta.loc[mask_label, "label"].astype(int).values
        
        # now align X and y
        # X_final = X_filt[lab_sub.label.notna()]
        # y_final = lab_sub.label.astype(int).values

        # collect
        X_list.append(Xc)
        y_list.append(y)

    # 2. concatenate all files together
    Xc = np.vstack(X_list)   # shape: (sum_i N_i, 2*D)
    y  = np.concatenate(y_list)  # shape: (sum_i N_i,)

    print("Combined Xc shape:", Xc.shape)
    print("Combined y shape: ", y.shape)

    return Xc, y

In [4]:
def downsample_balance(Xc_unbalanced, y_unbalanced):
    # forced resampling
    idx0 = np.where(y_unbalanced == 0)[0]
    idx1 = np.where(y_unbalanced == 1)[0]

    n = min(len(idx0), len(idx1))

    sel0 = np.random.choice(idx0, size=n, replace=False)
    sel1 = np.random.choice(idx1, size=n, replace=False)

    sel = np.concatenate([sel0, sel1])
    np.random.shuffle(sel)

    # slice out your balanced subset
    Xc_out = Xc_unbalanced[sel]
    y_out = y_unbalanced[sel]

    print("Balanced X shape:", Xc_out.shape)
    print("Balanced y counts:", np.bincount(y_out))
    return Xc_out, y_out

In [5]:
train_npz_paths = [
    "./data/doc_features/transcript_componenttext_2012_1_features.npz",    
    "./data/doc_features/transcript_componenttext_2012_2_features.npz",
    "./data/doc_features/transcript_componenttext_2013_1_features.npz",    
    "./data/doc_features/transcript_componenttext_2013_2_features.npz",
]

val_npz_paths = [
    "./data/doc_features/transcript_componenttext_2014_1_features.npz",
    "./data/doc_features/transcript_componenttext_2014_2_features.npz",
]

# test_npz_paths = [

#     "./data/doc_features/transcript_componenttext_2014_1_features.npz",
# ]


In [13]:
Xc, y = load_data(train_npz_paths, "X_max", threshold=0.5)
X_val_all_feat, y_val = load_data(val_npz_paths, "X_max", threshold=0.5)
Xc2, y2 = load_data(train_npz_paths, "X_mean", threshold=0.5)
X_val_all_feat2, y_val2 = load_data(val_npz_paths, "X_mean", threshold=0.5)

Xc = np.concatenate([Xc, Xc2], axis=1)
X_val_all_feat = np.concatenate([X_val_all_feat, X_val_all_feat2], axis=1)

X_val_all_feat, X_test_all_feat, y_val, y_test = train_test_split(
    X_val_all_feat, 
    y_val, 
    test_size=0.5,       # puts half into X_test/y_test
    random_state=42,     # for reproducibility
    # stratify=y_val       # if you want to preserve class proportions
)


Combined Xc shape: (4821, 16384)
Combined y shape:  (4821,)
Combined Xc shape: (2508, 16384)
Combined y shape:  (2508,)
Combined Xc shape: (4821, 16384)
Combined y shape:  (4821,)
Combined Xc shape: (2508, 16384)
Combined y shape:  (2508,)


In [None]:
np.concatenate([Xc, Xc2], axis=1)

(4821, 32768)

In [19]:
Xc, y = load_data(train_npz_paths, "X_mean", threshold=0.5)
X_val_all_feat, y_val = load_data(val_npz_paths, "X_mean", threshold=0.5)

# Xc, y = load_data(train_npz_paths, "X_concat", threshold=0.1)
# X_val_all_feat, y_val = load_data(val_npz_paths, "X_concat", threshold=0.1)
# X_test_all_feat, y_test = load_data(test_npz_paths, "X_mean")
# Split X_val_all_feat, y_val into two equal parts:
X_val_all_feat, X_test_all_feat, y_val, y_test = train_test_split(
    X_val_all_feat, 
    y_val, 
    test_size=0.5,       # puts half into X_test/y_test
    random_state=42,     # for reproducibility
    # stratify=y_val       # if you want to preserve class proportions
)

Combined Xc shape: (4821, 16384)
Combined y shape:  (4821,)
Combined Xc shape: (2508, 16384)
Combined y shape:  (2508,)


In [161]:
X_val_all_feat.shape

(1422, 16384)

In [162]:
y_val.shape

(1422,)

In [202]:
# optional downsampling for balancing data
Xc, y = downsample_balance(Xc, y)
X_val_all_feat, y_val = downsample_balance(X_val_all_feat, y_val)
X_test_all_feat, y_test = downsample_balance(X_test_all_feat, y_test)

Balanced X shape: (2170, 16384)
Balanced y counts: [1085 1085]
Balanced X shape: (574, 16384)
Balanced y counts: [287 287]
Balanced X shape: (498, 16384)
Balanced y counts: [249 249]


In [15]:

def rank_and_print(scores: np.ndarray, D: int, method_name: str, top_k: int = 1000):
    """
    Print the top_k features by score, separating features into 'mean' and 'max' halves.
    """
    ranked = np.argsort(-scores)
    print(f"\n=== {method_name} top {top_k} features ===")
    for rank, idx in enumerate(ranked[:top_k], start=1):
        part = 'mean' if idx < D else 'max'
        feat_id = idx if idx < D else idx - D
        score = scores[idx]
        print(f"Rank {rank:4d}: {part} feature #{feat_id} (score = {score:.6f})")

    return ranked

In [16]:
scaler = StandardScaler()
Xs = scaler.fit_transform(Xc)

D = Xc.shape[1] // 2

X_pos, X_neg = Xs[y == 1], Xs[y == 0]
t_stats = np.abs((X_pos.mean(0) - X_neg.mean(0)) /
                 np.sqrt(X_pos.var(0) / len(X_pos) + X_neg.var(0) / len(X_neg)))
ranked_idx = rank_and_print(t_stats, D, "T-test")


=== T-test top 1000 features ===
Rank    1: mean feature #4600 (score = 8.956881)
Rank    2: mean feature #15085 (score = 8.613231)
Rank    3: mean feature #15621 (score = 8.577414)
Rank    4: mean feature #1541 (score = 8.481013)
Rank    5: max feature #10424 (score = 8.281642)
Rank    6: max feature #12865 (score = 8.211435)
Rank    7: mean feature #10010 (score = 8.118495)
Rank    8: mean feature #12299 (score = 8.091381)
Rank    9: max feature #14370 (score = 8.071714)
Rank   10: max feature #15621 (score = 7.978019)
Rank   11: mean feature #10560 (score = 7.972969)
Rank   12: max feature #10560 (score = 7.924379)
Rank   13: max feature #10040 (score = 7.894786)
Rank   14: max feature #6615 (score = 7.783951)
Rank   15: max feature #8430 (score = 7.782514)
Rank   16: mean feature #15681 (score = 7.754803)
Rank   17: max feature #16278 (score = 7.738749)
Rank   18: max feature #1541 (score = 7.637256)
Rank   19: max feature #12299 (score = 7.611466)
Rank   20: max feature #1712 (sc

  t_stats = np.abs((X_pos.mean(0) - X_neg.mean(0)) /


In [20]:
scaler = StandardScaler()
Xs = scaler.fit_transform(Xc)

skb = SelectKBest(score_func=f_classif, k=1000)
skb.fit(Xc, y)
f_scores = skb.scores_
ranked_idx = rank_and_print(f_scores, D, "F-test")



=== F-test top 1000 features ===
Rank    1: mean feature #1541 (score = 90.344655)
Rank    2: mean feature #10424 (score = 85.820289)
Rank    3: mean feature #15085 (score = 84.318700)
Rank    4: mean feature #10515 (score = 80.515695)
Rank    5: mean feature #12299 (score = 79.622996)
Rank    6: mean feature #7357 (score = 76.125641)
Rank    7: mean feature #1381 (score = 74.895672)
Rank    8: mean feature #11637 (score = 72.995094)
Rank    9: mean feature #14370 (score = 72.390726)
Rank   10: mean feature #6703 (score = 71.805893)
Rank   11: mean feature #15621 (score = 70.357265)
Rank   12: mean feature #255 (score = 69.605259)
Rank   13: mean feature #10769 (score = 68.638974)
Rank   14: mean feature #1520 (score = 68.432080)
Rank   15: mean feature #3471 (score = 68.423135)
Rank   16: mean feature #10656 (score = 68.274496)
Rank   17: mean feature #9095 (score = 66.369306)
Rank   18: mean feature #13882 (score = 66.335655)
Rank   19: mean feature #1405 (score = 65.870977)
Rank   

  f = msb / msw


In [52]:
# 3) Mutual Information (filter)
mi_scores = mutual_info_classif(Xc, y, discrete_features=False)
ranked_idx = rank_and_print(mi_scores, D, "Mutual Information")



=== Mutual Information top 1000 features ===
Rank    1: max feature #7768 (score = 0.000000)
Rank    2: mean feature #3012 (score = 0.002506)
Rank    3: max feature #780 (score = 0.000000)
Rank    4: max feature #827 (score = 0.000000)
Rank    5: mean feature #7082 (score = 0.000000)
Rank    6: mean feature #4796 (score = 0.000647)
Rank    7: mean feature #1241 (score = 0.002696)
Rank    8: mean feature #6604 (score = 0.000000)
Rank    9: mean feature #6038 (score = 0.011501)
Rank   10: max feature #1904 (score = 0.000000)
Rank   11: max feature #7221 (score = 0.000000)
Rank   12: mean feature #2991 (score = 0.000000)
Rank   13: max feature #1454 (score = 0.000000)
Rank   14: mean feature #300 (score = 0.004962)
Rank   15: max feature #4406 (score = 0.000000)
Rank   16: mean feature #2017 (score = 0.007202)
Rank   17: mean feature #2371 (score = 0.000000)
Rank   18: mean feature #373 (score = 0.003534)
Rank   19: mean feature #6182 (score = 0.001276)
Rank   20: max feature #4204 (scor

array([  660, 13495,  1043, ...,  7569,  7443,  8191])

In [None]:
# 4) L1-regularized Logistic Regression (embedded)
scaler = StandardScaler()
Xs = scaler.fit_transform(Xc)

lr = LogisticRegression(penalty='l1', solver='saga', C=1.0, max_iter=5000)
sfm_lr = SelectFromModel(estimator=lr, max_features=1000)
sfm_lr.fit(Xs, y)
# Get absolute coefficients as scores
lr_coef_scores = np.abs(sfm_lr.estimator_.coef_).flatten()
ranked_idx = rank_and_print(lr_coef_scores, D, "L1-Logistic")

In [21]:
top_idx = ranked_idx[:1000]
X_test = X_test_all_feat[:, top_idx]
X_val = X_val_all_feat[:, top_idx]
X_top = Xc[:, top_idx]      

X_train = X_top
y_train = y


# X_test = X_test_all_feat[:, :]
# X_val = X_val_all_feat[:, :]
# X_top = Xc[:, :]      

# X_train = X_top
# y_train = y


In [29]:
# 5) Train with L1 logistic regression & balanced class weights
clf = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty="l1",
        solver="saga",
        # class_weight="balanced",
        C=1.0,
        max_iter=7000,
        random_state=42
    )
)
clf.fit(X_train, y_train)
clf.fit(X_train, y_train)

# 6) Evaluate
y_pred   = clf.predict(X_test)
y_probs  = clf.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_probs))

# 7) Inspect which of your top-1000 actually got nonzero weights
lr = clf.named_steps["logisticregression"]
coefs = lr.coef_.ravel()
nz    = np.where(coefs != 0)[0]



              precision    recall  f1-score   support

           0       0.54      0.55      0.55       268
           1       0.55      0.54      0.54       268

    accuracy                           0.54       536
   macro avg       0.54      0.54      0.54       536
weighted avg       0.54      0.54      0.54       536

ROC AUC: 0.5699209177990644


In [22]:

param_grid = {"logisticregression__C": [0.0001, 0.001, 0.01, 0.1]}

pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty="l2",
        solver="saga",    
        # solver="liblinear",    
        # class_weight="balanced",
        max_iter=10000,
        random_state=42
    )
)

search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)
search.fit(X_train, y_train)

print("Best C (inverse reg. strength):", search.best_params_["logisticregression__C"])
print("CV ROC AUC:", search.best_score_)


best_clf = search.best_estimator_
y_pred_probs = best_clf.predict_proba(X_test)[:, 1]
y_pred       = best_clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_pred_probs))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best C (inverse reg. strength): 0.001
CV ROC AUC: 0.712782018435753
              precision    recall  f1-score   support

           0       0.38      0.09      0.15       249
           1       0.81      0.96      0.88      1005

    accuracy                           0.79      1254
   macro avg       0.59      0.53      0.51      1254
weighted avg       0.72      0.79      0.73      1254

Test ROC AUC: 0.6285160542668184


In [186]:

param_grid = {"logisticregression__C": [0.01, 0.1, 1, 10, 100]}

pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty="l2",
        solver="saga",    
        # solver="liblinear",    
        # class_weight="balanced",
        max_iter=7000,
        random_state=42
    )
)

search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)
search.fit(X_train, y_train)

print("Best C (inverse reg. strength):", search.best_params_["logisticregression__C"])
print("CV ROC AUC:", search.best_score_)


best_clf = search.best_estimator_
y_pred_probs = best_clf.predict_proba(X_test)[:, 1]
y_pred       = best_clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_pred_probs))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


KeyboardInterrupt: 

In [218]:

param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10]}

pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty="l2",
        solver="saga",    
        # solver="liblinear",    
        # class_weight="balanced",
        max_iter=6000,
        random_state=42
    )
)

search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)
search.fit(X_train, y_train)

print("Best C (inverse reg. strength):", search.best_params_["logisticregression__C"])
print("CV ROC AUC:", search.best_score_)


best_clf = search.best_estimator_
y_pred_probs = best_clf.predict_proba(X_test)[:, 1]
y_pred       = best_clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_pred_probs))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best C (inverse reg. strength): 0.001
CV ROC AUC: 0.7387063289671826
              precision    recall  f1-score   support

           0       0.33      0.09      0.14       249
           1       0.81      0.96      0.88      1005

    accuracy                           0.78      1254
   macro avg       0.57      0.52      0.51      1254
weighted avg       0.71      0.78      0.73      1254

Test ROC AUC: 0.5974305180922695


In [213]:

X_train_t = torch.from_numpy(X_train).float().to(device)
y_train_t = torch.from_numpy(y_train).float().unsqueeze(1).to(device)
X_test_t  = torch.from_numpy(X_test).float().to(device)
y_test_t  = torch.from_numpy(y_test).float().unsqueeze(1).to(device)

# DataLoader
train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, drop_last=True)


class ShallowMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim=256, dropout=0.4):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

model = ShallowMLP(input_dim=X_top.shape[1]).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)
criterion = nn.BCELoss()

# 4) Training loop
n_epochs = 50
for epoch in range(1, n_epochs+1):
    model.train()
    total_loss = 0.0
    for xb, yb in train_dl:
        pred = model(xb)
        loss = criterion(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    avg_loss = total_loss / len(train_dl.dataset)
    print(f"Epoch {epoch:2d}: train loss = {avg_loss:.4f}")

# 5) Evaluation
model.eval()
with torch.no_grad():
    y_prob = model(X_test_t).cpu().numpy().flatten()
    y_pred = (y_prob >= 0.5).astype(int)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_prob))

Epoch  1: train loss = 0.5550
Epoch  2: train loss = 0.4773
Epoch  3: train loss = 0.4398
Epoch  4: train loss = 0.4152
Epoch  5: train loss = 0.3894
Epoch  6: train loss = 0.3730
Epoch  7: train loss = 0.3489
Epoch  8: train loss = 0.3264
Epoch  9: train loss = 0.3141
Epoch 10: train loss = 0.2901
Epoch 11: train loss = 0.2751
Epoch 12: train loss = 0.2518
Epoch 13: train loss = 0.2413
Epoch 14: train loss = 0.2234
Epoch 15: train loss = 0.2052
Epoch 16: train loss = 0.1906
Epoch 17: train loss = 0.1833
Epoch 18: train loss = 0.1690
Epoch 19: train loss = 0.1585
Epoch 20: train loss = 0.1441
Epoch 21: train loss = 0.1336
Epoch 22: train loss = 0.1252
Epoch 23: train loss = 0.1164
Epoch 24: train loss = 0.1077
Epoch 25: train loss = 0.1022
Epoch 26: train loss = 0.0960
Epoch 27: train loss = 0.0844
Epoch 28: train loss = 0.0827
Epoch 29: train loss = 0.0752
Epoch 30: train loss = 0.0739
Epoch 31: train loss = 0.0658
Epoch 32: train loss = 0.0627
Epoch 33: train loss = 0.0559
Epoch 34: 

In [25]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest  = xgb.DMatrix(X_test,  label=y_test)


scale_pos_weight = float((y_train == 0).sum()) / (y_train == 1).sum()


params = {
    "objective":        "binary:logistic",
    "eval_metric":      "auc",
    "scale_pos_weight": scale_pos_weight,
    "tree_method":      "hist",       
    "grow_policy":      "lossguide",  
    "max_depth":        6,
    "learning_rate":    0.1,
    "subsample":        0.8,
    "colsample_bytree": 0.8,
    "random_state":     42,
    "verbosity":        1
}


cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=1000,
    nfold=5,
    early_stopping_rounds=20,
    metrics="auc",
    seed=42,
    as_pandas=True,
    verbose_eval=50
)
best_rounds = len(cv_results)
print(f"Optimal boosting rounds: {best_rounds}")


bst = xgb.train(
    params,
    dtrain,
    num_boost_round=best_rounds
)


y_prob = bst.predict(dtest)
y_pred = (y_prob >= 0.5).astype(int)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_prob))

[0]	train-auc:0.79091+0.01652	test-auc:0.58206+0.02945
[50]	train-auc:1.00000+0.00000	test-auc:0.69692+0.02358
[100]	train-auc:1.00000+0.00000	test-auc:0.70410+0.02146
[150]	train-auc:1.00000+0.00000	test-auc:0.71121+0.01732
[200]	train-auc:1.00000+0.00000	test-auc:0.71543+0.01728
[206]	train-auc:1.00000+0.00000	test-auc:0.71560+0.01736
Optimal boosting rounds: 188

Classification Report:
              precision    recall  f1-score   support

           0       0.30      0.18      0.22       222
           1       0.79      0.89      0.84       793

    accuracy                           0.73      1015
   macro avg       0.55      0.53      0.53      1015
weighted avg       0.69      0.73      0.70      1015

Test ROC AUC: 0.5895618190700158


In [23]:
# 3 layer mlp

def to_tensor_dataset(X, y):
    Xt = torch.from_numpy(X).float().to(device)
    yt = torch.from_numpy(y).float().unsqueeze(1).to(device)
    return TensorDataset(Xt, yt)

train_ds = to_tensor_dataset(X_train, y_train)
val_ds   = to_tensor_dataset(X_val, y_val)
test_ds  = to_tensor_dataset(X_test, y_test)

batch_size = 32
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)
val_dl   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False)
test_dl  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False)

pos = np.sum(y_train == 1)
neg = np.sum(y_train == 0)
pos_weight = torch.tensor(neg / pos, dtype=torch.float).to(device)

# 5) Model definition
class ShallowMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim=256, dropout=0.5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

model = ShallowMLP(input_dim=X_train.shape[1]).to(device)

# 4) Optimizer and loss
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6, weight_decay=1e-5)
criterion = nn.BCELoss()
# criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# 5) Training & Validation Loop
n_epochs = 300
best_val_loss = float('inf')

for epoch in range(1, n_epochs+1):
    # -- Training
    model.train()
    train_loss = 0.0
    for xb, yb in train_dl:
        preds = model(xb)
        loss = criterion(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * xb.size(0)
    train_loss /= len(train_dl.dataset)

    # -- Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for xb, yb in val_dl:
            preds = model(xb)
            loss = criterion(preds, yb)
            val_loss += loss.item() * xb.size(0)
    val_loss /= len(val_dl.dataset)

    print(f"Epoch {epoch:2d}  Train Loss: {train_loss:.4f}  Val Loss: {val_loss:.4f}")

    # Optional: save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pth")

# 6) Load best model and test evaluation
model.load_state_dict(torch.load("best_model.pth"))
model.eval()

y_probs = []
y_true  = []
with torch.no_grad():
    for xb, yb in test_dl:
        probs = model(xb)
        y_probs.extend(probs.cpu().numpy().flatten().tolist())
        y_true .extend(yb.cpu().numpy().flatten().tolist())

y_pred = (np.array(y_probs) >= 0.5).astype(int)

print("\nTest Classification Report:")
print(classification_report(y_true, y_pred))
print("Test ROC AUC:", roc_auc_score(y_true, y_probs))


Epoch  1  Train Loss: 0.7287  Val Loss: 0.7052
Epoch  2  Train Loss: 0.7222  Val Loss: 0.6993
Epoch  3  Train Loss: 0.7098  Val Loss: 0.7035
Epoch  4  Train Loss: 0.7063  Val Loss: 0.6919
Epoch  5  Train Loss: 0.7005  Val Loss: 0.6840
Epoch  6  Train Loss: 0.6916  Val Loss: 0.6746
Epoch  7  Train Loss: 0.6873  Val Loss: 0.6716
Epoch  8  Train Loss: 0.6853  Val Loss: 0.6692
Epoch  9  Train Loss: 0.6779  Val Loss: 0.6602
Epoch 10  Train Loss: 0.6739  Val Loss: 0.6603
Epoch 11  Train Loss: 0.6646  Val Loss: 0.6576
Epoch 12  Train Loss: 0.6642  Val Loss: 0.6516
Epoch 13  Train Loss: 0.6611  Val Loss: 0.6547
Epoch 14  Train Loss: 0.6503  Val Loss: 0.6446
Epoch 15  Train Loss: 0.6475  Val Loss: 0.6369
Epoch 16  Train Loss: 0.6485  Val Loss: 0.6352
Epoch 17  Train Loss: 0.6489  Val Loss: 0.6302
Epoch 18  Train Loss: 0.6424  Val Loss: 0.6334
Epoch 19  Train Loss: 0.6356  Val Loss: 0.6293
Epoch 20  Train Loss: 0.6369  Val Loss: 0.6220
Epoch 21  Train Loss: 0.6317  Val Loss: 0.6224
Epoch 22  Tra

In [24]:
# 3 layer mlp

def to_tensor_dataset(X, y):
    Xt = torch.from_numpy(X).float().to(device)
    yt = torch.from_numpy(y).float().unsqueeze(1).to(device)
    return TensorDataset(Xt, yt)

train_ds = to_tensor_dataset(X_train, y_train)
val_ds   = to_tensor_dataset(X_val, y_val)
test_ds  = to_tensor_dataset(X_test, y_test)

batch_size = 32
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)
val_dl   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False)
test_dl  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False)

# 3) Define the 3-layer MLP
class ThreeLayerMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim=256, dropout=0.5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),

            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),

            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),

            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)

model = ThreeLayerMLP(input_dim=X_train.shape[1]).to(device)

# 4) Optimizer and loss
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6, weight_decay=1e-5)
criterion = nn.BCELoss()

# 5) Training & Validation Loop
n_epochs = 500
best_val_loss = float('inf')

for epoch in range(1, n_epochs+1):
    # -- Training
    model.train()
    train_loss = 0.0
    for xb, yb in train_dl:
        preds = model(xb)
        loss = criterion(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * xb.size(0)
    train_loss /= len(train_dl.dataset)

    # -- Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for xb, yb in val_dl:
            preds = model(xb)
            loss = criterion(preds, yb)
            val_loss += loss.item() * xb.size(0)
    val_loss /= len(val_dl.dataset)

    print(f"Epoch {epoch:2d}  Train Loss: {train_loss:.4f}  Val Loss: {val_loss:.4f}")

    # Optional: save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pth")

# 6) Load best model and test evaluation
model.load_state_dict(torch.load("best_model.pth"))
model.eval()

y_probs = []
y_true  = []
with torch.no_grad():
    for xb, yb in test_dl:
        probs = model(xb)
        y_probs.extend(probs.cpu().numpy().flatten().tolist())
        y_true .extend(yb.cpu().numpy().flatten().tolist())

y_pred = (np.array(y_probs) >= 0.5).astype(int)

print("\nTest Classification Report:")
print(classification_report(y_true, y_pred))
print("Test ROC AUC:", roc_auc_score(y_true, y_probs))


Epoch  1  Train Loss: 0.6794  Val Loss: 0.6720
Epoch  2  Train Loss: 0.6789  Val Loss: 0.6685
Epoch  3  Train Loss: 0.6763  Val Loss: 0.6645
Epoch  4  Train Loss: 0.6725  Val Loss: 0.6687
Epoch  5  Train Loss: 0.6671  Val Loss: 0.6643
Epoch  6  Train Loss: 0.6626  Val Loss: 0.6624
Epoch  7  Train Loss: 0.6660  Val Loss: 0.6556
Epoch  8  Train Loss: 0.6610  Val Loss: 0.6592
Epoch  9  Train Loss: 0.6593  Val Loss: 0.6531
Epoch 10  Train Loss: 0.6511  Val Loss: 0.6602
Epoch 11  Train Loss: 0.6574  Val Loss: 0.6512
Epoch 12  Train Loss: 0.6480  Val Loss: 0.6512
Epoch 13  Train Loss: 0.6436  Val Loss: 0.6455
Epoch 14  Train Loss: 0.6464  Val Loss: 0.6457
Epoch 15  Train Loss: 0.6451  Val Loss: 0.6486
Epoch 16  Train Loss: 0.6426  Val Loss: 0.6466
Epoch 17  Train Loss: 0.6390  Val Loss: 0.6403
Epoch 18  Train Loss: 0.6357  Val Loss: 0.6386
Epoch 19  Train Loss: 0.6340  Val Loss: 0.6384
Epoch 20  Train Loss: 0.6348  Val Loss: 0.6393
Epoch 21  Train Loss: 0.6279  Val Loss: 0.6362
Epoch 22  Tra

In [12]:
X_train_csr = sparse.csr_matrix(X_train)  # compress zeros
X_val_csr   = sparse.csr_matrix(X_val)
X_test_csr  = sparse.csr_matrix(X_test)

dump_svmlight_file(X_train_csr, y_train, "train.svm")
dump_svmlight_file(X_val_csr,   y_val,   "val.svm")
dump_svmlight_file(X_test_csr,  y_test,  "test.svm")


# --- 2. Build DMatrix (treat 0.0 as missing) ---
dtrain = xgb.DMatrix("train.svm?format=libsvm#train.cache", missing=0.0)
dval   = xgb.DMatrix("val.svm?format=libsvm#val.cache",     missing=0.0)
dtest  = xgb.DMatrix("test.svm?format=libsvm#test.cache",   missing=0.0)  

params = {
    'objective':        'binary:logistic',
    'eval_metric':      'auc',
    'tree_method':      'hist',
    'max_depth':        8,
    'eta':              0.1,
    'subsample':        0.8,
    'colsample_bytree': 0.8,
    'lambda':           1.0,
    'alpha':            0.5,
    'nthread':          4      
}

watchlist = [(dtrain, 'train'), (dval, 'eval')]

bst = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=20,
    verbose_eval=50
)

# --- 4. Predict & evaluate on test set ---
y_proba = bst.predict(dtest)               # shape (n_test,)
y_pred  = (y_proba > 0.5).astype(int)

acc    = accuracy_score(y_test, y_pred)
auc    = roc_auc_score(y_test, y_proba)
report = classification_report(y_test, y_pred)
cm     = confusion_matrix(y_test, y_pred)

print(f"Test Accuracy : {acc:.4f}")
print(f"Test ROC AUC  : {auc:.4f}\n")
print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(cm)



[0]	train-auc:0.86449	eval-auc:0.48788
[50]	train-auc:1.00000	eval-auc:0.58003
[74]	train-auc:1.00000	eval-auc:0.57466
Test Accuracy : 0.5279
Test ROC AUC  : 0.5431

Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.52      0.53       358
           1       0.53      0.53      0.53       358

    accuracy                           0.53       716
   macro avg       0.53      0.53      0.53       716
weighted avg       0.53      0.53      0.53       716

Confusion Matrix:
[[187 171]
 [167 191]]
