In [8]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from scipy.stats import ttest_ind
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score


In [3]:
npz_paths = [
    "./data/doc_features/transcript_componenttext_2010_1_features.npz",
    "./data/doc_features/transcript_componenttext_2010_2_features.npz",
    "./data/doc_features/transcript_componenttext_2011_1_features.npz",
    "./data/doc_features/transcript_componenttext_2011_2_features.npz",
    "./data/doc_features/transcript_componenttext_2012_1_features.npz",
    "./data/doc_features/transcript_componenttext_2012_2_features.npz"
]

In [4]:
X_list = []
y_list = []

for npz_path in npz_paths:
 
    base = os.path.splitext(os.path.basename(npz_path))[0]      
    csv_path = os.path.join(
        os.path.dirname(npz_path),
        base + "_meta.csv"                                       
    )


    data = np.load(npz_path, allow_pickle=True)
    X_concat = data["X_concat"]      # (N_docs, 2*D)
    tids = data["transcriptids"]    


    meta = pd.read_csv(csv_path)

    meta_unique = (
        meta[["transcriptid", "SUESCORE", "label"]]
        .drop_duplicates(subset="transcriptid", keep="first")
        .set_index("transcriptid")
    )

    mask_ids = np.isin(tids, meta_unique.index)
    X_filt = X_concat[mask_ids]
    tids_filt = np.array(tids)[mask_ids]


    lab_df = meta.assign(
        label=lambda df: df.SUESCORE.map(
            lambda s: 1 if s >= 0.5 else (0 if s <= -0.5 else np.nan)
        )
    )
    mask_label = lab_df.label.notna().values
    # apply the same mask in the same order as the CSV, so we use .loc on lab_df
    # but first filter lab_df to only those transcriptids in tids_filt
    Xc, y = X_filt[mask_label], meta.loc[mask_label, "label"].astype(int).values
    
    # now align X and y
    # X_final = X_filt[lab_sub.label.notna()]
    # y_final = lab_sub.label.astype(int).values

    # collect
    X_list.append(Xc)
    y_list.append(y)

# 2. concatenate all files together
Xc = np.vstack(X_list)   # shape: (sum_i N_i, 2*D)
y  = np.concatenate(y_list)  # shape: (sum_i N_i,)

print("Combined Xc shape:", Xc.shape)
print("Combined y shape: ", y.shape)

Combined Xc shape: (5933, 32768)
Combined y shape:  (5933,)


In [None]:
# # forced resampling
# idx0 = np.where(y == 0)[0]
# idx1 = np.where(y == 1)[0]

# n = min(len(idx0), len(idx1))

# sel0 = np.random.choice(idx0, size=n, replace=False)
# sel1 = np.random.choice(idx1, size=n, replace=False)

# sel = np.concatenate([sel0, sel1])
# np.random.shuffle(sel)

# # slice out your balanced subset
# Xc = Xc[sel]
# y = y[sel]

# print("Balanced X shape:", Xc.shape)
# print("Balanced y counts:", np.bincount(y))

Balanced X shape: (2484, 32768)
Balanced y counts: [1242 1242]


In [5]:

# # 2) Build labels and mask
# meta = meta.assign(label=lambda df: df.SUESCORE.map(lambda s: 1 if s>=0.5 else (0 if s<=-0.5 else np.nan)))
# mask = meta.label.notna().values
# Xc, y = Xc_aligned[mask], meta.loc[mask, "label"].astype(int).values

# 3A) Univariate t‐tests on each of the 2D features
D2 = Xc.shape[1]
D = D2 // 2
X_pos, X_neg = Xc[y==1], Xc[y==0]
t_stats = np.abs((X_pos.mean(0) - X_neg.mean(0)) /
                 np.sqrt(X_pos.var(0)/len(X_pos) + X_neg.var(0)/len(X_neg)))
# rank features
ranked_idx = np.argsort(-t_stats)

# Inspect top‐10:
for rank, idx in enumerate(ranked_idx[:1000], start=1):
    # print(idx)
    part = "mean" if idx < D else "max"
    # print(idx)
    # print(D)
    feat_id = idx if idx < D else idx-D
    t_val   = t_stats[idx]
    print(f"Rank {rank:2d}: {part!r} feature #{feat_id} (t = {t_val:.2f})")



Rank  1: 'mean' feature #13877 (t = 11.22)
Rank  2: 'mean' feature #8430 (t = 9.67)
Rank  3: 'max' feature #15621 (t = 9.66)
Rank  4: 'max' feature #15738 (t = 9.33)
Rank  5: 'mean' feature #12663 (t = 9.33)
Rank  6: 'mean' feature #8380 (t = 9.29)
Rank  7: 'mean' feature #14370 (t = 9.20)
Rank  8: 'mean' feature #3147 (t = 9.09)
Rank  9: 'mean' feature #5612 (t = 9.09)
Rank 10: 'mean' feature #9813 (t = 8.98)
Rank 11: 'max' feature #11273 (t = 8.96)
Rank 12: 'max' feature #14892 (t = 8.92)
Rank 13: 'max' feature #15085 (t = 8.88)
Rank 14: 'mean' feature #10424 (t = 8.85)
Rank 15: 'mean' feature #13118 (t = 8.77)
Rank 16: 'max' feature #13000 (t = 8.73)
Rank 17: 'max' feature #8380 (t = 8.66)
Rank 18: 'mean' feature #1520 (t = 8.59)
Rank 19: 'max' feature #1881 (t = 8.57)
Rank 20: 'max' feature #10010 (t = 8.56)
Rank 21: 'max' feature #4983 (t = 8.54)
Rank 22: 'mean' feature #15621 (t = 8.53)
Rank 23: 'mean' feature #8823 (t = 8.46)
Rank 24: 'mean' feature #1712 (t = 8.38)
Rank 25: 'me

  t_stats = np.abs((X_pos.mean(0) - X_neg.mean(0)) /


In [6]:
top_idx = ranked_idx[:1000]
X_top = Xc[:, top_idx]      

# 4) Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_top, y, stratify=y, test_size=0.2, random_state=42
)


In [None]:
# 5) Train with L1 logistic regression & balanced class weights
clf = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty="l1",
        solver="saga",
        # class_weight="balanced",
        C=1.0,
        max_iter=2000,
        random_state=42
    )
)
clf.fit(X_train, y_train)
clf.fit(X_train, y_train)

# 6) Evaluate
y_pred   = clf.predict(X_test)
y_probs  = clf.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_probs))

# 7) Inspect which of your top-1000 actually got nonzero weights
lr = clf.named_steps["logisticregression"]
coefs = lr.coef_.ravel()
nz    = np.where(coefs != 0)[0]

print(f"\nOut of your 1000 features, {len(nz)} have nonzero weight.")
print("Nonzero feature indices (within the top-1000 list):", nz)

In [7]:

param_grid = {"logisticregression__C": [0.01, 0.1, 1, 10, 100]}

pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty="l2",
        solver="saga",         
        class_weight="balanced",
        max_iter=2000,
        random_state=42
    )
)

search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)
search.fit(X_train, y_train)

print("Best C (inverse reg. strength):", search.best_params_["logisticregression__C"])
print("CV ROC AUC:", search.best_score_)


best_clf = search.best_estimator_
y_pred_probs = best_clf.predict_proba(X_test)[:, 1]
y_pred       = best_clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_pred_probs))

Fitting 5 folds for each of 5 candidates, totalling 25 fits




Best C (inverse reg. strength): 0.01
CV ROC AUC: 0.6935088208936653
              precision    recall  f1-score   support

           0       0.34      0.58      0.43       248
           1       0.86      0.70      0.77       939

    accuracy                           0.67      1187
   macro avg       0.60      0.64      0.60      1187
weighted avg       0.75      0.67      0.70      1187

Test ROC AUC: 0.699925280840977


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_t = torch.from_numpy(X_train).float().to(device)
y_train_t = torch.from_numpy(y_train).float().unsqueeze(1).to(device)
X_test_t  = torch.from_numpy(X_test).float().to(device)
y_test_t  = torch.from_numpy(y_test).float().unsqueeze(1).to(device)

# DataLoader
train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, drop_last=True)


class ShallowMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim=256, dropout=0.5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

model = ShallowMLP(input_dim=X_top.shape[1]).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
criterion = nn.BCELoss()

# 4) Training loop
n_epochs = 50
for epoch in range(1, n_epochs+1):
    model.train()
    total_loss = 0.0
    for xb, yb in train_dl:
        pred = model(xb)
        loss = criterion(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    avg_loss = total_loss / len(train_dl.dataset)
    print(f"Epoch {epoch:2d}: train loss = {avg_loss:.4f}")

# 5) Evaluation
model.eval()
with torch.no_grad():
    y_prob = model(X_test_t).cpu().numpy().flatten()
    y_pred = (y_prob >= 0.5).astype(int)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_prob))

Epoch  1: train loss = 0.5063
Epoch  2: train loss = 0.4638
Epoch  3: train loss = 0.4405
Epoch  4: train loss = 0.4231
Epoch  5: train loss = 0.4129
Epoch  6: train loss = 0.3968
Epoch  7: train loss = 0.3788
Epoch  8: train loss = 0.3615
Epoch  9: train loss = 0.3524
Epoch 10: train loss = 0.3356
Epoch 11: train loss = 0.3155
Epoch 12: train loss = 0.3109
Epoch 13: train loss = 0.2932
Epoch 14: train loss = 0.2752
Epoch 15: train loss = 0.2571
Epoch 16: train loss = 0.2478
Epoch 17: train loss = 0.2299
Epoch 18: train loss = 0.2304
Epoch 19: train loss = 0.2088
Epoch 20: train loss = 0.1959
Epoch 21: train loss = 0.1965
Epoch 22: train loss = 0.1854
Epoch 23: train loss = 0.1789
Epoch 24: train loss = 0.1545
Epoch 25: train loss = 0.1550
Epoch 26: train loss = 0.1547
Epoch 27: train loss = 0.1497
Epoch 28: train loss = 0.1436
Epoch 29: train loss = 0.1326
Epoch 30: train loss = 0.1396
Epoch 31: train loss = 0.1207
Epoch 32: train loss = 0.1241
Epoch 33: train loss = 0.1290
Epoch 34: 

: 