In [1]:
import numpy as np
import pandas as pd
import os

def prepare_attention_data():
    """Align raw subject embeddings with questionnaire concepts and save them.

    Reads ``data/processed/subject_features_raw.npz`` (keys ``X_raw`` and
    ``subject_ids``) and ``data/processed/merged_questionnaires.csv`` (columns
    ``Subject``, ``Diagnosis`` and one column per concept), keeps the subjects
    present in both, and writes ``data/processed/cem_input_raw.npz`` with the
    keys ``X_raw``, ``C``, ``y``, ``subject_ids`` and ``concept_names``.

    Row ``i`` of every saved array refers to the same subject. Rows follow the
    order of the embeddings file; the CSV may list subjects in any order. If a
    subject appears more than once in the CSV, its first row is used.

    All paths are resolved relative to the parent of the current working
    directory. If an input file is missing, a message is printed and the
    function returns without writing anything.

    Returns:
        None. The result is written to disk.
    """
    print("Starting: Prepare attention data")

    # Paths
    project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
    features_path = os.path.join(project_root, "data/processed/subject_features_raw.npz")
    concepts_path = os.path.join(project_root, "data/processed/merged_questionnaires.csv")
    out_path = os.path.join(project_root, "data/processed/cem_input_raw.npz")

    # Load subject embeddings
    if not os.path.exists(features_path):
        print(f"Error: Input file not found at {features_path}")
        print("Please run 2_create_raw_embeddings.ipynb first.")
        return

    # NOTE(review): allow_pickle=True unpickles object arrays, which can run
    # arbitrary code; only use this on files produced by this project.
    # The context manager closes the underlying zip file once the arrays are read.
    with np.load(features_path, allow_pickle=True) as features:
        X_raw = features["X_raw"]
        subject_ids = features["subject_ids"].astype(str)

    print("Loaded raw embeddings. Num subjects:", X_raw.shape[0])

    # Load concept questionnaire dataset
    if not os.path.exists(concepts_path):
        print(f"Error: Input file not found at {concepts_path}")
        return

    symptoms_df = pd.read_csv(concepts_path)
    symptoms_df["subject_id"] = symptoms_df["Subject"].astype(str).str.replace("^train_", "", regex=True)
    print("Loaded concept dataset. Shape:", symptoms_df.shape)

    # Extract concept matrix + labels
    concept_cols = [c for c in symptoms_df.columns if c not in ["Subject", "Diagnosis", "subject_id"]]
    C_all = symptoms_df[concept_cols].astype(float).values
    y_all = symptoms_df["Diagnosis"].astype(int).values
    concept_subject_ids = symptoms_df["subject_id"].tolist()

    print("Extracted concepts. Num concepts:", len(concept_cols))

    # Align with embeddings.
    # Two independent boolean masks would keep each source in its own row
    # order, pairing embeddings with the wrong concepts/labels whenever the
    # CSV order differs from the embedding order. Explicit row-index pairs
    # guarantee that row i of X, C, y and subject_ids is the same subject.
    concept_row_of = {}
    for row, sid in enumerate(concept_subject_ids):
        concept_row_of.setdefault(sid, row)  # first occurrence wins

    n_duplicates = len(concept_subject_ids) - len(concept_row_of)
    if n_duplicates:
        print(f"Warning: {n_duplicates} duplicate subject rows in concept dataset; keeping first occurrence.")

    matches = [
        (emb_row, concept_row_of[sid])
        for emb_row, sid in enumerate(subject_ids.tolist())
        if sid in concept_row_of
    ]
    emb_idx = np.array([m[0] for m in matches], dtype=np.intp)
    con_idx = np.array([m[1] for m in matches], dtype=np.intp)

    X_aligned = X_raw[emb_idx]
    y_aligned = y_all[con_idx]
    C_aligned = C_all[con_idx]
    # Fixed-width unicode array: can be saved and loaded without pickle.
    subject_ids_aligned = subject_ids[emb_idx]

    print("After alignment:")
    print("X_raw (num_subjects,):", X_aligned.shape, "C:", C_aligned.shape, "y:", y_aligned.shape)

    # Convert X_aligned from object array to proper float32 array
    # This fixes the numpy.object_ dtype issue
    if X_aligned.dtype == object and X_aligned.ndim == 1 and X_aligned.size:
        # A 1-D object array holding one matrix per subject cannot be cast
        # with astype(); stack the per-subject matrices into one dense array.
        X_aligned = np.stack([np.asarray(x, dtype=np.float32) for x in X_aligned])
    X_aligned_float = X_aligned.astype(np.float32)
    print(f"Converted X_raw to dtype: {X_aligned_float.dtype}")

    # Save everything
    np.savez(
        out_path,
        X_raw=X_aligned_float,
        C=C_aligned.astype(np.float32),
        y=y_aligned.astype(np.int64),
        subject_ids=subject_ids_aligned,
        concept_names=np.array(concept_cols),
    )

    print(f"Saved dataset to {out_path}")
    print("Finished: Prepare attention data")

# Script entry point: build the aligned dataset only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    prepare_attention_data()

Starting: Prepare attention data
Loaded raw embeddings. Num subjects: 486
Loaded concept dataset. Shape: (486, 24)
Extracted concepts. Num concepts: 21
After alignment:
X_raw (num_subjects,): (486, 20, 384) C: (486, 21) y: (486,)
Converted X_raw to dtype: float32
Saved dataset to /Users/gualtieromarencoturi/Desktop/thesis/Master-Thesis-CEM-Depression-etc-case-study/data/processed/cem_input_raw.npz
Finished: Prepare attention data
