In [1]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score


In [4]:
# 1) Load concat features and metadata
data = np.load("./data/doc_features/transcript_componenttext_2010_1_features.npz", allow_pickle=True)

Xc = data["X_concat"]        # shape (N_docs, 2*D)
tids = data["transcriptids"]  # same order
meta = pd.read_csv("./data/doc_features/transcript_componenttext_2010_1_features_meta.csv")

meta_unique = (
    meta[["transcriptid", "SUESCORE", "label"]]
    .drop_duplicates(subset="transcriptid", keep="first")
    .set_index("transcriptid")
)

# 3) build a mask over tids
mask = np.isin(tids, meta_unique.index)

Xc = Xc[mask]
tids = tids[mask]

# 2) Build labels and mask
meta = meta.assign(label=lambda df: df.SUESCORE.map(lambda s: 1 if s>=0.5 else (0 if s<=-0.5 else np.nan)))
mask = meta.label.notna().values
Xc, y = Xc[mask], meta.loc[mask, "label"].astype(int).values

In [3]:

# # 2) Build labels and mask
# meta = meta.assign(label=lambda df: df.SUESCORE.map(lambda s: 1 if s>=0.5 else (0 if s<=-0.5 else np.nan)))
# mask = meta.label.notna().values
# Xc, y = Xc_aligned[mask], meta.loc[mask, "label"].astype(int).values

# 3A) Univariate t‐tests on each of the 2D features
D2 = Xc.shape[1]
D = D2 // 2
X_pos, X_neg = Xc[y==1], Xc[y==0]
t_stats = np.abs((X_pos.mean(0) - X_neg.mean(0)) /
                 np.sqrt(X_pos.var(0)/len(X_pos) + X_neg.var(0)/len(X_neg)))
# rank features
ranked_idx = np.argsort(-t_stats)

# Inspect top‐10:
for rank, idx in enumerate(ranked_idx[:1000], start=1):
    # print(idx)
    part = "mean" if idx < D else "max"
    # print(idx)
    # print(D)
    feat_id = idx if idx < D else idx-D
    t_val   = t_stats[idx]
    print(f"Rank {rank:2d}: {part!r} feature #{feat_id} (t = {t_val:.2f})")



Rank  1: 'mean' feature #14772 (t = 5.92)
Rank  2: 'mean' feature #13847 (t = 5.40)
Rank  3: 'mean' feature #2306 (t = 5.39)
Rank  4: 'mean' feature #5230 (t = 5.37)
Rank  5: 'max' feature #5048 (t = 5.27)
Rank  6: 'mean' feature #2770 (t = 5.21)
Rank  7: 'mean' feature #14413 (t = 5.16)
Rank  8: 'mean' feature #12414 (t = 5.16)
Rank  9: 'mean' feature #13244 (t = 5.03)
Rank 10: 'mean' feature #12644 (t = 5.02)
Rank 11: 'mean' feature #2440 (t = 4.96)
Rank 12: 'max' feature #12350 (t = 4.94)
Rank 13: 'max' feature #4741 (t = 4.93)
Rank 14: 'mean' feature #2281 (t = 4.93)
Rank 15: 'mean' feature #3697 (t = 4.89)
Rank 16: 'mean' feature #2305 (t = 4.83)
Rank 17: 'mean' feature #2448 (t = 4.80)
Rank 18: 'max' feature #4660 (t = 4.80)
Rank 19: 'max' feature #11145 (t = 4.75)
Rank 20: 'max' feature #10223 (t = 4.71)
Rank 21: 'mean' feature #14892 (t = 4.70)
Rank 22: 'max' feature #15016 (t = 4.69)
Rank 23: 'mean' feature #9833 (t = 4.66)
Rank 24: 'max' feature #13065 (t = 4.65)
Rank 25: 'me

  t_stats = np.abs((X_pos.mean(0) - X_neg.mean(0)) /


In [None]:
# 3B) L1‐regularized logistic regression
# clf = make_pipeline(
#     StandardScaler(),
#     LogisticRegression(penalty="l1", solver="saga", C=1.0, max_iter=2000)
# )
# clf.fit(Xc, y)
# coef = clf.named_steps["logisticregression"].coef_.ravel()

# # Get nonzero weights and sort
# nz = np.where(coef != 0)[0]
# nz_sorted = sorted(nz, key=lambda i: -abs(coef[i]))

# # Top‐5 L1 features:
# for rank, idx in enumerate(nz_sorted[:1000], 1):
#     part = "mean" if idx < D else "max"
#     feat_id = idx if idx < D else idx-D
#     print(f"L1 Rank {rank:2d}: {part!r} feature #{feat_id} (weight = {coef[idx]:.4f})")

In [None]:
top_idx = ranked_idx[:1000]
top_idx = ranked_idx[:1000]
X_top = Xc[:, top_idx]      

# 4) Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_top, y, stratify=y, test_size=0.2, random_state=42
)


In [None]:
# 5) Train with L1 logistic regression & balanced class weights
clf = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty="l1",
        solver="saga",
        class_weight="balanced",
        C=1.0,
        max_iter=2000,
        random_state=42
    )
)
clf.fit(X_train, y_train)
clf.fit(X_train, y_train)

# 6) Evaluate
y_pred   = clf.predict(X_test)
y_probs  = clf.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_probs))

# 7) Inspect which of your top-1000 actually got nonzero weights
lr = clf.named_steps["logisticregression"]
coefs = lr.coef_.ravel()
nz    = np.where(coefs != 0)[0]

print(f"\nOut of your 1000 features, {len(nz)} have nonzero weight.")
print("Nonzero feature indices (within the top-1000 list):", nz)

              precision    recall  f1-score   support

           0       0.52      0.64      0.57        25
           1       0.93      0.88      0.90       126

    accuracy                           0.84       151
   macro avg       0.72      0.76      0.74       151
weighted avg       0.86      0.84      0.85       151

ROC AUC: 0.9092063492063491

Out of your 1000 features, 523 have nonzero weight.
Nonzero feature indices (within the top-1000 list): [  2   4   8  10  12  14  15  16  17  19  20  21  22  23  24  25  27  28
  29  30  31  33  34  36  39  42  43  44  50  52  53  54  55  56  58  61
  64  68  69  70  71  75  82  83  84  85  87  90  92  98  99 100 101 103
 104 106 110 112 114 118 120 123 126 128 131 132 133 134 135 137 138 140
 142 143 146 147 149 151 153 154 156 161 163 165 170 172 177 179 180 182
 183 184 186 187 189 192 193 195 196 198 201 204 209 210 212 213 215 216
 220 224 226 228 229 230 231 232 234 235 236 237 238 241 249 251 252 254
 255 256 258 262 264 265 269 



In [8]:
# 4) Logistic with L2 penalty
#    You can grid-search C to find the best regularization strength
param_grid = {"logisticregression__C": [0.01, 0.1, 1, 10, 100]}

pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty="l2",
        solver="saga",         # saga supports both l1 and l2
        class_weight="balanced",
        max_iter=2000,
        random_state=42
    )
)

search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)
search.fit(X_train, y_train)

print("Best C (inverse reg. strength):", search.best_params_["logisticregression__C"])
print("CV ROC AUC:", search.best_score_)

# 5) Final evaluation on held-out test set
best_clf = search.best_estimator_
y_pred_probs = best_clf.predict_proba(X_test)[:, 1]
y_pred       = best_clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_pred_probs))

Fitting 5 folds for each of 5 candidates, totalling 25 fits




Best C (inverse reg. strength): 0.01
CV ROC AUC: 0.5917538896746819
              precision    recall  f1-score   support

           0       0.18      0.48      0.26        25
           1       0.84      0.56      0.67       126

    accuracy                           0.54       151
   macro avg       0.51      0.52      0.46       151
weighted avg       0.73      0.54      0.60       151

Test ROC AUC: 0.5314285714285714


