In [26]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from scipy.stats import ttest_ind
from scipy import sparse
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
import xgboost as xgb


In [2]:
# 1) Load concat features and metadata
data = np.load("./data/doc_features/transcript_componenttext_2012_2_features.npz", allow_pickle=True)

# Xc = data["X_concat"]
Xc = data["X_max"]        # shape (N_docs, 2*D)
tids = data["transcriptids"]  # same order
meta = pd.read_csv("./data/doc_features/transcript_componenttext_2012_2_features_meta.csv")

meta_unique = (
    meta[["transcriptid", "SUESCORE", "label"]]
    .drop_duplicates(subset="transcriptid", keep="first")
    .set_index("transcriptid")
)

# 3) build a mask over tids
mask = np.isin(tids, meta_unique.index)

Xc = Xc[mask]
tids = tids[mask]

# 2) Build labels and mask
meta = meta.assign(label=lambda df: df.SUESCORE.map(lambda s: 1 if s>=0.5 else (0 if s<=-0.5 else np.nan)))
mask = meta.label.notna().values
X_test_all_feat, y_test = Xc[mask], meta.loc[mask, "label"].astype(int).values

In [3]:
# 1) Load concat features and metadata
data = np.load("./data/doc_features/transcript_componenttext_2012_1_features.npz", allow_pickle=True)

Xc = data["X_max"]        # shape (N_docs, 2*D)
tids = data["transcriptids"]  # same order
meta = pd.read_csv("./data/doc_features/transcript_componenttext_2012_1_features_meta.csv")

meta_unique = (
    meta[["transcriptid", "SUESCORE", "label"]]
    .drop_duplicates(subset="transcriptid", keep="first")
    .set_index("transcriptid")
)

# 3) build a mask over tids
mask = np.isin(tids, meta_unique.index)

Xc = Xc[mask]
tids = tids[mask]

# 2) Build labels and mask
meta = meta.assign(label=lambda df: df.SUESCORE.map(lambda s: 1 if s>=0.5 else (0 if s<=-0.5 else np.nan)))
mask = meta.label.notna().values
X_val_all_feat, y_val = Xc[mask], meta.loc[mask, "label"].astype(int).values

In [4]:
npz_paths = [
    "./data/doc_features/transcript_componenttext_2010_1_features.npz",
    "./data/doc_features/transcript_componenttext_2010_2_features.npz",
    "./data/doc_features/transcript_componenttext_2011_1_features.npz",
    "./data/doc_features/transcript_componenttext_2011_2_features.npz",
    # "./data/doc_features/transcript_componenttext_2012_1_features.npz",
]

In [5]:
X_list = []
y_list = []

for npz_path in npz_paths:
 
    base = os.path.splitext(os.path.basename(npz_path))[0]      
    csv_path = os.path.join(
        os.path.dirname(npz_path),
        base + "_meta.csv"                                       
    )


    data = np.load(npz_path, allow_pickle=True)
    # X_concat = data["X_concat"]      # (N_docs, 2*D)
    X_concat = data["X_max"]
    tids = data["transcriptids"]    


    meta = pd.read_csv(csv_path)

    meta_unique = (
        meta[["transcriptid", "SUESCORE", "label"]]
        .drop_duplicates(subset="transcriptid", keep="first")
        .set_index("transcriptid")
    )

    mask_ids = np.isin(tids, meta_unique.index)
    X_filt = X_concat[mask_ids]
    tids_filt = np.array(tids)[mask_ids]


    lab_df = meta.assign(
        label=lambda df: df.SUESCORE.map(
            lambda s: 1 if s >= 0.5 else (0 if s <= -0.5 else np.nan)
        )
    )
    mask_label = lab_df.label.notna().values
    # apply the same mask in the same order as the CSV, so we use .loc on lab_df
    # but first filter lab_df to only those transcriptids in tids_filt
    Xc, y = X_filt[mask_label], meta.loc[mask_label, "label"].astype(int).values
    
    # now align X and y
    # X_final = X_filt[lab_sub.label.notna()]
    # y_final = lab_sub.label.astype(int).values

    # collect
    X_list.append(Xc)
    y_list.append(y)

# 2. concatenate all files together
Xc = np.vstack(X_list)   # shape: (sum_i N_i, 2*D)
y  = np.concatenate(y_list)  # shape: (sum_i N_i,)

print("Combined Xc shape:", Xc.shape)
print("Combined y shape: ", y.shape)

Combined Xc shape: (3561, 16384)
Combined y shape:  (3561,)


In [5]:
# forced resampling
idx0 = np.where(y == 0)[0]
idx1 = np.where(y == 1)[0]

n = min(len(idx0), len(idx1))

sel0 = np.random.choice(idx0, size=n, replace=False)
sel1 = np.random.choice(idx1, size=n, replace=False)

sel = np.concatenate([sel0, sel1])
np.random.shuffle(sel)

# slice out your balanced subset
Xc = Xc[sel]
y = y[sel]

print("Balanced X shape:", Xc.shape)
print("Balanced y counts:", np.bincount(y))

Balanced X shape: (2040, 32768)
Balanced y counts: [1020 1020]


In [6]:


# meta = meta.assign(label=lambda df: df.SUESCORE.map(lambda s: 1 if s>=0.5 else (0 if s<=-0.5 else np.nan)))
# mask = meta.label.notna().values
# Xc, y = Xc_aligned[mask], meta.loc[mask, "label"].astype(int).values

D2 = Xc.shape[1]
D = D2 // 2
X_pos, X_neg = Xc[y==1], Xc[y==0]
t_stats = np.abs((X_pos.mean(0) - X_neg.mean(0)) /
                 np.sqrt(X_pos.var(0)/len(X_pos) + X_neg.var(0)/len(X_neg)))

ranked_idx = np.argsort(-t_stats)


for rank, idx in enumerate(ranked_idx[:1000], start=1):
    # print(idx)
    part = "mean" if idx < D else "max"
    # print(idx)
    # print(D)
    feat_id = idx if idx < D else idx-D
    t_val   = t_stats[idx]
    print(f"Rank {rank:2d}: {part!r} feature #{feat_id} (t = {t_val:.2f})")



Rank  1: 'max' feature #3081 (t = 8.83)
Rank  2: 'max' feature #7546 (t = 8.71)
Rank  3: 'max' feature #3581 (t = 8.57)
Rank  4: 'max' feature #6700 (t = 8.34)
Rank  5: 'mean' feature #3585 (t = 8.33)
Rank  6: 'mean' feature #1881 (t = 8.12)
Rank  7: 'max' feature #4620 (t = 8.06)
Rank  8: 'max' feature #7429 (t = 7.78)
Rank  9: 'max' feature #5353 (t = 7.78)
Rank 10: 'max' feature #5888 (t = 7.76)
Rank 11: 'max' feature #5425 (t = 7.68)
Rank 12: 'mean' feature #2366 (t = 7.65)
Rank 13: 'max' feature #2867 (t = 7.63)
Rank 14: 'max' feature #188 (t = 7.52)
Rank 15: 'mean' feature #1547 (t = 7.45)
Rank 16: 'mean' feature #1211 (t = 7.22)
Rank 17: 'max' feature #3929 (t = 7.11)
Rank 18: 'max' feature #8086 (t = 7.07)
Rank 19: 'max' feature #4883 (t = 7.04)
Rank 20: 'mean' feature #5240 (t = 6.95)
Rank 21: 'mean' feature #2845 (t = 6.94)
Rank 22: 'max' feature #6893 (t = 6.88)
Rank 23: 'max' feature #4107 (t = 6.84)
Rank 24: 'mean' feature #4983 (t = 6.80)
Rank 25: 'max' feature #4808 (t =

  t_stats = np.abs((X_pos.mean(0) - X_neg.mean(0)) /


In [30]:
top_idx = ranked_idx[:1000]
X_test = X_test_all_feat[:, top_idx]
X_val = X_val_all_feat[:, top_idx]
X_top = Xc[:, top_idx]      

X_train = X_top
y_train = y


In [47]:
# 5) Train with L1 logistic regression & balanced class weights
clf = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty="l1",
        solver="saga",
        # class_weight="balanced",
        C=1.0,
        max_iter=2000,
        random_state=42
    )
)
clf.fit(X_train, y_train)
clf.fit(X_train, y_train)

# 6) Evaluate
y_pred   = clf.predict(X_test)
y_probs  = clf.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_probs))

# 7) Inspect which of your top-1000 actually got nonzero weights
lr = clf.named_steps["logisticregression"]
coefs = lr.coef_.ravel()
nz    = np.where(coefs != 0)[0]



              precision    recall  f1-score   support

           0       0.25      0.26      0.26       222
           1       0.79      0.79      0.79       793

    accuracy                           0.67      1015
   macro avg       0.52      0.52      0.52      1015
weighted avg       0.67      0.67      0.67      1015

ROC AUC: 0.5416141235813366


In [33]:

param_grid = {"logisticregression__C": [0.01, 0.1, 1, 10, 100]}

pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty="l2",
        solver="saga",    
        # solver="liblinear",    
        # class_weight="balanced",
        max_iter=7000,
        random_state=42
    )
)

search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)

# X_sparse = sparse.csr_matrix(X_train) 
search.fit(X_train, y_train)

print("Best C (inverse reg. strength):", search.best_params_["logisticregression__C"])
print("CV ROC AUC:", search.best_score_)


best_clf = search.best_estimator_
y_pred_probs = best_clf.predict_proba(X_test)[:, 1]
y_pred       = best_clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_pred_probs))

Fitting 5 folds for each of 5 candidates, totalling 25 fits




Best C (inverse reg. strength): 0.01
CV ROC AUC: 0.7284655699201593
              precision    recall  f1-score   support

           0       0.32      0.21      0.25       222
           1       0.80      0.87      0.83       793

    accuracy                           0.73      1015
   macro avg       0.56      0.54      0.54      1015
weighted avg       0.69      0.73      0.71      1015

Test ROC AUC: 0.5598195926064778


In [None]:
# 1) Build your sparse training matrix once
X_sparse = sparse.csr_matrix(X_train)

# 2) Swap out StandardScaler (which densifies) for MaxAbsScaler
#    or StandardScaler(with_mean=False) so we never convert to dense.
# 3) Change penalty to "l1" (saga supports L1) to get a sparse model.
param_grid = {
    "logisticregression__C": [0.01, 0.1, 1, 10, 100]
}

pipeline = make_pipeline(
    MaxAbsScaler(),  
    LogisticRegression(
        penalty="l1",
        solver="saga",
        max_iter=7000,
        random_state=42,
        n_jobs=-1
    )
)

search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)

# 4) Fit on your CSR matrix
search.fit(X_sparse, y_train)

print("Best C (inverse reg. strength):", search.best_params_["logisticregression__C"])
print("CV ROC AUC:", search.best_score_)

# 5) Evaluate on your (also sparse) test set
X_test_sparse = sparse.csr_matrix(X_test)
best_clf = search.best_estimator_
y_pred_probs = best_clf.predict_proba(X_test_sparse)[:, 1]
y_pred       = best_clf.predict(X_test_sparse)

print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_pred_probs))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


Traceback (most recent call last):
  File "/home/airlay88/surprise_sae/venv/lib/python3.10/site-packages/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_comm.py", line 422, in _on_run
    cmd.send(self.sock)
  File "/home/airlay88/surprise_sae/venv/lib/python3.10/site-packages/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_net_command.py", line 111, in send
    sock.sendall(("Content-Length: %s\r\n\r\n" % len(as_bytes)).encode("ascii"))
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/airlay88/surprise_sae/venv/lib/python3.10/site-packages/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_daemon_thread.py", line 53, in run
    self._on_run()
  File "/home/airlay88/surprise_sae/venv/lib/python3.10/site-packages/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_comm.py", line 432, in _on_run
    self.py_db.dispose_and_kill_all_pydevd_threads()
  File "/home/airlay88/surprise_sae/venv/lib

Best C (inverse reg. strength): 1
CV ROC AUC: 0.7155618941404921
              precision    recall  f1-score   support

           0       0.27      0.15      0.19       222
           1       0.79      0.89      0.84       793

    accuracy                           0.73      1015
   macro avg       0.53      0.52      0.51      1015
weighted avg       0.68      0.73      0.69      1015

Test ROC AUC: 0.5670449768810425


In [46]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_t = torch.from_numpy(X_train).float().to(device)
y_train_t = torch.from_numpy(y_train).float().unsqueeze(1).to(device)
X_test_t  = torch.from_numpy(X_test).float().to(device)
y_test_t  = torch.from_numpy(y_test).float().unsqueeze(1).to(device)

# DataLoader
train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, drop_last=True)


class ShallowMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim=256, dropout=0.5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

model = ShallowMLP(input_dim=X_top.shape[1]).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
criterion = nn.BCELoss()

# 4) Training loop
n_epochs = 30
for epoch in range(1, n_epochs+1):
    model.train()
    total_loss = 0.0
    for xb, yb in train_dl:
        pred = model(xb)
        loss = criterion(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    avg_loss = total_loss / len(train_dl.dataset)
    print(f"Epoch {epoch:2d}: train loss = {avg_loss:.4f}")

# 5) Evaluation
model.eval()
with torch.no_grad():
    y_prob = model(X_test_t).cpu().numpy().flatten()
    y_pred = (y_prob >= 0.5).astype(int)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_prob))

Epoch  1: train loss = 0.4804
Epoch  2: train loss = 0.4061
Epoch  3: train loss = 0.3606
Epoch  4: train loss = 0.3332
Epoch  5: train loss = 0.2996
Epoch  6: train loss = 0.2704
Epoch  7: train loss = 0.2355
Epoch  8: train loss = 0.2111
Epoch  9: train loss = 0.1851
Epoch 10: train loss = 0.1704
Epoch 11: train loss = 0.1536
Epoch 12: train loss = 0.1352
Epoch 13: train loss = 0.1236
Epoch 14: train loss = 0.1108
Epoch 15: train loss = 0.1030
Epoch 16: train loss = 0.0836
Epoch 17: train loss = 0.0814
Epoch 18: train loss = 0.0670
Epoch 19: train loss = 0.0660
Epoch 20: train loss = 0.0612
Epoch 21: train loss = 0.0560
Epoch 22: train loss = 0.0518
Epoch 23: train loss = 0.0595
Epoch 24: train loss = 0.0611
Epoch 25: train loss = 0.0514
Epoch 26: train loss = 0.0486
Epoch 27: train loss = 0.0483
Epoch 28: train loss = 0.0520
Epoch 29: train loss = 0.0522
Epoch 30: train loss = 0.0445

Classification Report:
              precision    recall  f1-score   support

           0       0.

In [49]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest  = xgb.DMatrix(X_test,  label=y_test)


scale_pos_weight = float((y_train == 0).sum()) / (y_train == 1).sum()


params = {
    "objective":        "binary:logistic",
    "eval_metric":      "auc",
    "scale_pos_weight": scale_pos_weight,
    "tree_method":      "hist",       
    "grow_policy":      "lossguide",  
    "max_depth":        6,
    "learning_rate":    0.1,
    "subsample":        0.8,
    "colsample_bytree": 0.8,
    "random_state":     42,
    "verbosity":        1
}


cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=1000,
    nfold=5,
    early_stopping_rounds=20,
    metrics="auc",
    seed=42,
    as_pandas=True,
    verbose_eval=50
)
best_rounds = len(cv_results)
print(f"Optimal boosting rounds: {best_rounds}")


bst = xgb.train(
    params,
    dtrain,
    num_boost_round=best_rounds
)


y_prob = bst.predict(dtest)
y_pred = (y_prob >= 0.5).astype(int)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_prob))

[0]	train-auc:0.79059+0.01417	test-auc:0.59616+0.04367
[50]	train-auc:0.99999+0.00001	test-auc:0.69730+0.02020
[100]	train-auc:1.00000+0.00000	test-auc:0.70976+0.02243
[131]	train-auc:1.00000+0.00000	test-auc:0.71024+0.02205
Optimal boosting rounds: 112

Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.23      0.27       222
           1       0.80      0.87      0.83       793

    accuracy                           0.73      1015
   macro avg       0.57      0.55      0.55      1015
weighted avg       0.70      0.73      0.71      1015

Test ROC AUC: 0.5972075480272201


In [38]:
# 3) Convert to tensors and move to device
def to_tensor(x, y):
    xt = torch.from_numpy(x).float().to(device)
    yt = torch.from_numpy(y).float().unsqueeze(1).to(device)
    return xt, yt

X_tr_t, y_tr_t = to_tensor(X_train, y_train)
X_val_t, y_val_t = to_tensor(X_val, y_val)
X_test_t, y_test_t = to_tensor(X_test, y_test)

# 4) DataLoaders
batch_size = 32
train_ds = TensorDataset(X_tr_t, y_tr_t)
val_ds   = TensorDataset(X_val_t, y_val_t)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)
val_dl   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False)

# 5) Model definition
class ShallowMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim=256, dropout=0.5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

model = ShallowMLP(input_dim=X_train.shape[1]).to(device)

# 6) Optimizer and loss
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
criterion = nn.BCELoss()

# 7) Training with validation
n_epochs = 20
for epoch in range(1, n_epochs + 1):
    # Training
    model.train()
    total_tr_loss = 0.0
    for xb, yb in train_dl:
        optimizer.zero_grad()
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        total_tr_loss += loss.item() * xb.size(0)
    avg_tr_loss = total_tr_loss / len(train_dl.dataset)

    # Validation
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for xb, yb in val_dl:
            pred = model(xb)
            loss = criterion(pred, yb)
            total_val_loss += loss.item() * xb.size(0)
    avg_val_loss = total_val_loss / len(val_dl.dataset)

    print(f"Epoch {epoch:2d}: train_loss = {avg_tr_loss:.4f}, val_loss = {avg_val_loss:.4f}")

# 8) Final evaluation on test set
model.eval()
with torch.no_grad():
    y_prob = model(X_test_t).cpu().numpy().flatten()
    y_pred = (y_prob >= 0.5).astype(int)

print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_prob))


Epoch  1: train_loss = 0.4802, val_loss = 0.8715
Epoch  2: train_loss = 0.3954, val_loss = 0.5293
Epoch  3: train_loss = 0.3588, val_loss = 1.3259
Epoch  4: train_loss = 0.3188, val_loss = 4.0063
Epoch  5: train_loss = 0.2955, val_loss = 10.1170
Epoch  6: train_loss = 0.2569, val_loss = 15.0498
Epoch  7: train_loss = 0.2334, val_loss = 15.9916
Epoch  8: train_loss = 0.2077, val_loss = 20.1106
Epoch  9: train_loss = 0.1821, val_loss = 26.2765
Epoch 10: train_loss = 0.1627, val_loss = 27.4486
Epoch 11: train_loss = 0.1380, val_loss = 36.5073
Epoch 12: train_loss = 0.1262, val_loss = 41.6051
Epoch 13: train_loss = 0.1112, val_loss = 41.5592
Epoch 14: train_loss = 0.1124, val_loss = 42.2641
Epoch 15: train_loss = 0.0900, val_loss = 52.6739
Epoch 16: train_loss = 0.0947, val_loss = 46.9765
Epoch 17: train_loss = 0.0870, val_loss = 57.6871
Epoch 18: train_loss = 0.0766, val_loss = 49.2754
Epoch 19: train_loss = 0.0774, val_loss = 64.2224
Epoch 20: train_loss = 0.0720, val_loss = 66.2575

Cla