# Aggregate Embeddings for Regular CEM Model

This notebook aggregates raw embeddings into fixed-size feature vectors for the regular (non-attention) CEM model.

Input:
- `subject_features_raw.npz` - Training raw embeddings
- `subject_features_raw_validation.npz` - Validation raw embeddings
- `subject_features_raw_test.npz` - Test raw embeddings
- `merged_questionnaires.csv` - Training concept labels

Output:
- `cem_input.npz` - Training aggregated data (X, C, y)
- `cem_input_validation.npz` - Validation aggregated data (X, C=zeros, y)
- `cem_input_test.npz` - Test aggregated data (X, C=zeros, y)

Aggregation: mean, std, max across all posts per subject → shape (n_subjects, 384*3)

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Locations: the project root is the parent of the notebook's working directory
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
data_dir = os.path.join(project_root, "data/processed")

# Inputs: raw embeddings for each split, plus the questionnaire concept labels
train_raw, val_raw, test_raw = (
    os.path.join(data_dir, raw_name)
    for raw_name in (
        "subject_features_raw.npz",
        "subject_features_raw_validation.npz",
        "subject_features_raw_test.npz",
    )
)
concepts_path = os.path.join(data_dir, "merged_questionnaires.csv")

# Outputs: aggregated datasets, one archive per split
train_output, val_output, test_output = (
    os.path.join(data_dir, out_name)
    for out_name in (
        "cem_input.npz",
        "cem_input_validation.npz",
        "cem_input_test.npz",
    )
)

print(f"Data directory: {data_dir}")

Data directory: /Users/gualtieromarencoturi/Desktop/thesis/Master-Thesis-CEM-Depression-etc-case-study/data/processed


In [3]:
# The 21 BDI-II questionnaire items, used as the concept names
concept_names = np.array(
    [
        "Sadness",
        "Pessimism",
        "Past failure",
        "Loss of pleasure",
        "Guilty feelings",
        "Punishment feelings",
        "Self-dislike",
        "Self-criticalness",
        "Suicidal thoughts or wishes",
        "Crying",
        "Agitation",
        "Loss of interest",
        "Indecisiveness",
        "Worthlessness",
        "Loss of energy",
        "Changes in sleeping pattern",
        "Irritability",
        "Changes in appetite",
        "Concentration difficulty",
        "Tiredness or fatigue",
        "Loss of interest in sex",
    ]
)

# Number of concepts follows directly from the list above
n_concepts = concept_names.shape[0]
print(f"Number of concepts: {n_concepts}")

Number of concepts: 21


In [4]:
def aggregate_embeddings(X_raw):
    """
    Aggregate per-post embeddings into one fixed-size feature vector per subject.

    For every subject the mean, standard deviation (population, ddof=0) and
    maximum are computed across that subject's posts and concatenated in the
    order [mean | std | max].

    Args:
        X_raw: Sequence or array with one entry per subject. Each entry is
            array-like of shape (num_posts, embedding_dim) with num_posts >= 1;
            num_posts may differ between subjects, embedding_dim may not.

    Returns:
        X_agg: float32 array of shape (n_subjects, embedding_dim * 3)

    Raises:
        ValueError: If X_raw is empty, or if a subject's entry is not a
            non-empty 2-D array sharing the first subject's embedding_dim.
    """
    n_subjects = len(X_raw)
    if n_subjects == 0:
        # Without at least one subject the embedding dimension is unknown.
        raise ValueError("X_raw is empty: cannot infer the embedding dimension")

    embed_dim = None
    X_agg = None

    for i in range(n_subjects):
        posts_emb = np.asarray(X_raw[i], dtype=np.float32)  # (num_posts, embed_dim)

        # Reductions over zero posts are undefined (max has no identity), so
        # report the offending subject instead of a bare numpy error.
        if posts_emb.ndim != 2 or posts_emb.shape[0] == 0:
            raise ValueError(
                f"subject {i}: expected a non-empty (num_posts, embedding_dim) "
                f"array, got shape {posts_emb.shape}"
            )

        if embed_dim is None:
            # The first subject fixes the embedding dimension and output width.
            embed_dim = posts_emb.shape[1]
            X_agg = np.zeros((n_subjects, embed_dim * 3), dtype=np.float32)
        elif posts_emb.shape[1] != embed_dim:
            raise ValueError(
                f"subject {i}: embedding_dim {posts_emb.shape[1]} does not match "
                f"the first subject's embedding_dim {embed_dim}"
            )

        # Statistics across posts (axis=0), each of shape (embed_dim,)
        X_agg[i, :embed_dim] = np.mean(posts_emb, axis=0)
        X_agg[i, embed_dim:2 * embed_dim] = np.std(posts_emb, axis=0)
        X_agg[i, 2 * embed_dim:] = np.max(posts_emb, axis=0)

    return X_agg

print("Aggregation function defined")

Aggregation function defined


## Process Training Data

In [5]:
print("\n" + "="*60)
print("TRAINING DATA")
print("="*60)

# Load raw embeddings. The context manager closes the .npz archive once the
# arrays have been read into memory.
# NOTE(review): allow_pickle=True unpickles arbitrary objects on load; only use
# it with archives produced by this project.
with np.load(train_raw, allow_pickle=True) as train_data:
    X_raw_train = train_data["X_raw"]
    y_train = train_data["y"]
    subject_ids_train = train_data["subject_ids"]

print(f"Loaded {len(y_train)} training subjects")
print(f"X_raw shape: {X_raw_train.shape}")
print(f"First subject raw embeddings: {X_raw_train[0].shape}")

# Aggregate embeddings
X_train = aggregate_embeddings(X_raw_train)
print(f"\nAggregated X shape: {X_train.shape}")

# Load concept labels from questionnaires
symptoms_df = pd.read_csv(concepts_path)
# "train_" is a literal prefix, so no regex matching is needed.
symptoms_df["subject_id"] = symptoms_df["Subject"].str.replace("train_", "", regex=False)

# Every column except the identifiers and the diagnosis is a concept score
concept_cols = [c for c in symptoms_df.columns if c not in ["Subject", "Diagnosis", "subject_id"]]
if len(concept_cols) != len(concept_names):
    raise ValueError(
        f"Expected {len(concept_names)} concept columns in {concepts_path}, "
        f"found {len(concept_cols)}: {concept_cols}"
    )

# Align questionnaire rows with the embedding order. Taking the rows in CSV
# order would silently pair subjects with the wrong concept labels whenever
# the two files are sorted differently or the CSV holds extra subjects.
train_ids = pd.Index(np.asarray(subject_ids_train).astype(str))
concepts_by_subject = None
for id_col in ("subject_id", "Subject"):
    # presumably the archive stores ids without the "train_" prefix; the raw
    # "Subject" column is tried as a fallback in case it does not.
    candidate = symptoms_df.set_index(symptoms_df[id_col].astype(str))[concept_cols]
    if candidate.index.is_unique and train_ids.isin(candidate.index).all():
        concepts_by_subject = candidate
        break
if concepts_by_subject is None:
    raise ValueError(
        "Could not match every training subject id to a unique questionnaire row "
        f"in {concepts_path}; example ids from the embeddings: {list(train_ids[:3])}"
    )

# Extract concept matrix, one row per training subject in embedding order
C_train = concepts_by_subject.loc[train_ids].astype(float).values

if not (len(X_train) == len(C_train) == len(y_train)):
    raise ValueError(
        f"Row count mismatch: X={len(X_train)}, C={len(C_train)}, y={len(y_train)}"
    )

print(f"Concept matrix shape: {C_train.shape}")
print(f"Label distribution: {np.bincount(y_train)}")

# Save
np.savez(
    train_output,
    X=X_train,
    C=C_train.astype(np.float32),
    y=y_train.astype(np.int64),
    subject_ids=subject_ids_train,
    concept_names=concept_names
)

print(f"\nSaved to {train_output}")


TRAINING DATA
Loaded 486 training subjects
X_raw shape: (486, 20, 384)
First subject raw embeddings: (20, 384)

Aggregated X shape: (486, 1152)
Concept matrix shape: (486, 21)
Label distribution: [403  83]

Saved to /Users/gualtieromarencoturi/Desktop/thesis/Master-Thesis-CEM-Depression-etc-case-study/data/processed/cem_input.npz


## Process Validation Data

In [6]:
print("\n" + "="*60)
print("VALIDATION DATA")
print("="*60)

# Fail early with an actionable message when the validation embeddings have
# not been generated yet.
if not os.path.isfile(val_raw):
    raise FileNotFoundError(
        f"Validation raw embeddings not found: {val_raw}. "
        "Generate subject_features_raw_validation.npz before running this cell."
    )

# Load raw embeddings. The context manager closes the .npz archive once the
# arrays have been read into memory.
# NOTE(review): allow_pickle=True unpickles arbitrary objects on load; only use
# it with archives produced by this project.
with np.load(val_raw, allow_pickle=True) as val_data:
    X_raw_val = val_data["X_raw"]
    y_val = val_data["y"]
    subject_ids_val = val_data["subject_ids"]

print(f"Loaded {len(y_val)} validation subjects")
print(f"X_raw shape: {X_raw_val.shape}")

# Aggregate embeddings
X_val = aggregate_embeddings(X_raw_val)
print(f"Aggregated X shape: {X_val.shape}")

if len(X_val) != len(y_val):
    raise ValueError(f"Row count mismatch: X={len(X_val)}, y={len(y_val)}")

# Create zero concept matrix (no ground-truth concepts for val/test)
C_val = np.zeros((len(y_val), n_concepts), dtype=np.float32)
print(f"Concept matrix (zeros) shape: {C_val.shape}")
print(f"Label distribution: {np.bincount(y_val)}")

# Save
np.savez(
    val_output,
    X=X_val,
    C=C_val,
    y=y_val.astype(np.int64),
    subject_ids=subject_ids_val,
    concept_names=concept_names
)

print(f"\nSaved to {val_output}")


VALIDATION DATA


FileNotFoundError: [Errno 2] No such file or directory: '/Users/gualtieromarencoturi/Desktop/thesis/Master-Thesis-CEM-Depression-etc-case-study/data/processed/subject_features_raw_validation.npz'

## Process Test Data

In [None]:
print("\n" + "="*60)
print("TEST DATA")
print("="*60)

# Fail early with an actionable message when the test embeddings have not
# been generated yet.
if not os.path.isfile(test_raw):
    raise FileNotFoundError(
        f"Test raw embeddings not found: {test_raw}. "
        "Generate subject_features_raw_test.npz before running this cell."
    )

# Load raw embeddings. The context manager closes the .npz archive once the
# arrays have been read into memory.
# NOTE(review): allow_pickle=True unpickles arbitrary objects on load; only use
# it with archives produced by this project.
with np.load(test_raw, allow_pickle=True) as test_data:
    X_raw_test = test_data["X_raw"]
    y_test = test_data["y"]
    subject_ids_test = test_data["subject_ids"]

print(f"Loaded {len(y_test)} test subjects")
print(f"X_raw shape: {X_raw_test.shape}")

# Aggregate embeddings
X_test = aggregate_embeddings(X_raw_test)
print(f"Aggregated X shape: {X_test.shape}")

if len(X_test) != len(y_test):
    raise ValueError(f"Row count mismatch: X={len(X_test)}, y={len(y_test)}")

# Create zero concept matrix (no ground-truth concepts for the test split)
C_test = np.zeros((len(y_test), n_concepts), dtype=np.float32)
print(f"Concept matrix (zeros) shape: {C_test.shape}")
print(f"Label distribution: {np.bincount(y_test)}")

# Save
np.savez(
    test_output,
    X=X_test,
    C=C_test,
    y=y_test.astype(np.int64),
    subject_ids=subject_ids_test,
    concept_names=concept_names
)

print(f"\nSaved to {test_output}")

In [None]:
# Recap of the three datasets written by the cells above
rule = "=" * 60
print("\n" + rule, "SUMMARY", rule, sep="\n")

print("\nCreated aggregated datasets for regular CEM model:")

# Training split: aggregated features plus questionnaire-derived concepts
print(f"\nTraining: {train_output}")
print(
    f"  - Subjects: {len(y_train)}",
    f"  - X shape: {X_train.shape}",
    "  - Features: mean + std + max of embeddings",
    "  - Concepts: ground-truth from questionnaires",
    sep="\n",
)

# Validation split: concept matrix is a zero placeholder
print(f"\nValidation: {val_output}")
print(
    f"  - Subjects: {len(y_val)}",
    f"  - X shape: {X_val.shape}",
    "  - Concepts: all zeros (no ground-truth)",
    sep="\n",
)

# Test split: concept matrix is a zero placeholder
print(f"\nTest: {test_output}")
print(
    f"  - Subjects: {len(y_test)}",
    f"  - X shape: {X_test.shape}",
    "  - Concepts: all zeros (no ground-truth)",
    sep="\n",
)

print("\nReady to train regular CEM model with proper splits!")