In [1]:
import numpy as np
import pandas as pd
import os

In [None]:
# Input/output locations. All three files live in the same processed-data folder.
_processed_dir = "../data/processed"

# Subject embeddings written by the SBERT / data-retrieval step.
features_path = _processed_dir + "/subject_features.npz"
# Merged questionnaire answers.
concepts_path = _processed_dir + "/merged_questionnaires.csv"
# Archive produced by the final cell of this notebook.
out_path = _processed_dir + "/cem_input.npz"

In [None]:
# Load subject embeddings.
#
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code. Only point this at archives produced by our own
# pipeline, never at untrusted files.
#
# The context manager closes the underlying .npz (zip) file as soon as the
# arrays have been read. `features` is kept as a plain dict so that any later
# `features["..."]` look-up still works after the file is closed.
with np.load(features_path, allow_pickle=True) as features_npz:
    features = {key: features_npz[key] for key in features_npz.files}

X = features["X"]
y_embed = features["y"]

# Make strings for merge (the questionnaire ids are compared as strings too)
subject_ids = features["subject_ids"].astype(str)

# The alignment step indexes X with a mask built from subject_ids, so the two
# must describe the same rows.
if len(X) != len(subject_ids):
    raise ValueError(
        f"X has {len(X)} rows but subject_ids has {len(subject_ids)} entries"
    )

print("Embeddings:", X.shape, "Labels:", y_embed.shape)

Embeddings: (486, 1152) Labels: (486,)


In [None]:
# Read the questionnaire table and add a `subject_id` merge key: the "Subject"
# column cast to string with a leading "train_" prefix stripped when present.
symptoms_df = pd.read_csv(concepts_path).assign(
    subject_id=lambda frame: frame["Subject"]
    .astype(str)
    .str.replace("^train_", "", regex=True)
)

print("Concept dataset:", symptoms_df.shape)
print("Columns:", list(symptoms_df.columns))



Concept dataset: (486, 24)
Columns: ['Subject', 'Sadness', 'Pessimism', 'Past failure', 'Loss of pleasure', 'Guilty feelings', 'Punishment feelings', 'Self-dislike', 'Self-criticalness', 'Suicidal thoughts or wishes', 'Crying', 'Agitation', 'Loss of interest', 'Indecisiveness', 'Worthlessness', 'Loss of energy', 'Changes in sleeping pattern', 'Irritability', 'Changes in appetite', 'Concentration difficulty', 'Tiredness or fatigue', 'Loss of interest in sex', 'Diagnosis', 'subject_id']


In [6]:
# Split the questionnaire table into ids, labels and the concept matrix.
# Every column that is not an identifier or the label is treated as a concept.
non_concept_cols = ("Subject", "Diagnosis", "subject_id")
concept_cols = [name for name in symptoms_df.columns if name not in non_concept_cols]

subj_all = symptoms_df["subject_id"].values
y_all = symptoms_df["Diagnosis"].astype(int).values
concept_frame = symptoms_df[concept_cols]
C_all = concept_frame.astype(float).values

print("Concepts:", len(concept_cols), " -> ", concept_cols[:5], "...")



Concepts: 21  ->  ['Sadness', 'Pessimism', 'Past failure', 'Loss of pleasure', 'Guilty feelings'] ...


In [7]:
# Align embeddings with questionnaire rows, subject by subject.
#
# Boolean masks alone keep each array in its own file order, so rows of X and
# rows of C / y would only correspond if both sources happened to list the
# subjects in the same order. Instead, for every embedding row we look up the
# position of the same subject in the questionnaire table and gather C / y
# through those positions. The result follows the embedding-file order.
#
# NOTE(review): labels are taken from the questionnaire "Diagnosis" column; the
# `y_embed` array loaded with the embeddings is not used here. Confirm the two
# are expected to agree.
emb_id_array = np.asarray(subject_ids).astype(str)
emb_ids = pd.Index(emb_id_array)
con_ids = pd.Index(np.asarray(subj_all).astype(str))

# A repeated id makes "the row for subject s" ambiguous and would silently
# produce arrays of different lengths, so fail loudly instead.
for source_name, ids in (("embeddings", emb_ids), ("questionnaires", con_ids)):
    if ids.has_duplicates:
        repeated = ids[ids.duplicated()].unique().tolist()
        raise ValueError(
            f"Duplicate subject ids in {source_name}: {repeated[:5]}"
        )

# Position of each embedding subject inside the questionnaire table (-1 = absent).
con_pos = con_ids.get_indexer(emb_ids)
mask_emb = con_pos >= 0
con_rows = con_pos[mask_emb]

X_aligned = X[mask_emb]
C_aligned = C_all[con_rows]
y_aligned = y_all[con_rows]
subject_ids_aligned = emb_id_array[mask_emb]
common_ids = subject_ids_aligned

# Row i of every aligned array must refer to the same subject.
assert len(X_aligned) == len(C_aligned) == len(y_aligned) == len(subject_ids_aligned)

n_dropped_emb = len(emb_ids) - len(con_rows)
n_dropped_con = len(con_ids) - len(con_rows)
if n_dropped_emb or n_dropped_con:
    print(
        f"Dropped {n_dropped_emb} embedding-only and "
        f"{n_dropped_con} questionnaire-only subjects"
    )

print("After alignment:")
print("X:", X_aligned.shape, "C:", C_aligned.shape, "y:", y_aligned.shape)

After alignment:
X: (486, 1152) C: (486, 21) y: (486,)


In [8]:
# Save everything into a single .npz archive.
#
# Make sure the destination folder exists so the save does not fail on a fresh
# checkout.
out_dir = os.path.dirname(out_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

np.savez(
    out_path,
    X=X_aligned.astype(np.float32),
    C=C_aligned.astype(np.float32),
    y=y_aligned.astype(np.int64),
    # Store ids as a fixed-width unicode array. An object-dtype array would be
    # pickled into the archive and force every consumer to load it with
    # allow_pickle=True.
    subject_ids=np.asarray(subject_ids_aligned).astype(str),
    concept_names=np.asarray(concept_cols, dtype=str),
)

print(f"Saved dataset to {out_path}")

Saved dataset to ../data/processed/cem_input.npz
