# Prepare Attention Data for Validation and Test Sets

This notebook prepares the final aligned data for validation and test sets.

Validation and test subjects have no ground-truth concept labels, so the concept matrix `C` for each split is filled with zeros as a placeholder (shape: number of subjects × 21 concepts).

Input:
- `subject_features_raw_validation.npz` - Validation embeddings (arrays `X_raw`, `y`, `subject_ids`)
- `subject_features_raw_test.npz` - Test embeddings (arrays `X_raw`, `y`, `subject_ids`)

Output:
- `cem_input_raw_validation.npz` - Validation aligned data (X_raw, C=zeros, y, subject_ids, concept_names)
- `cem_input_raw_test.npz` - Test aligned data (X_raw, C=zeros, y, subject_ids, concept_names)

In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
# Paths
# Everything is resolved relative to the parent of the current working
# directory, which is treated as the project root.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
data_dir = os.path.join(project_root, "data/processed")

# Inputs: per-split feature archives.
val_features, test_features = (
    os.path.join(data_dir, "subject_features_raw_validation.npz"),
    os.path.join(data_dir, "subject_features_raw_test.npz"),
)

# Outputs: per-split aligned archives written by this notebook.
val_output, test_output = (
    os.path.join(data_dir, "cem_input_raw_validation.npz"),
    os.path.join(data_dir, "cem_input_raw_test.npz"),
)

print(
    f"Validation features: {val_features}",
    f"Test features: {test_features}",
    f"Output directory: {data_dir}",
    sep="\n",
)

In [None]:
# Define concept names (same 21 BDI-II concepts as training)
# One item per line so the order (which fixes the column order of C) is
# easy to audit.
concept_names = np.array([
    "Sadness",
    "Pessimism",
    "Past failure",
    "Loss of pleasure",
    "Guilty feelings",
    "Punishment feelings",
    "Self-dislike",
    "Self-criticalness",
    "Suicidal thoughts or wishes",
    "Crying",
    "Agitation",
    "Loss of interest",
    "Indecisiveness",
    "Worthlessness",
    "Loss of energy",
    "Changes in sleeping pattern",
    "Irritability",
    "Changes in appetite",
    "Concentration difficulty",
    "Tiredness or fatigue",
    "Loss of interest in sex",
])

# The array is 1-D, so its first dimension is the number of concepts.
n_concepts = concept_names.shape[0]
print(f"Number of concepts: {n_concepts}")

In [None]:
def prepare_cem_data(features_path, output_path, dataset_name, concepts=None):
    """
    Prepare CEM input data with zero concept matrix.

    Since val/test subjects don't have ground-truth concept labels,
    we create an all-zero concept matrix.

    Parameters
    ----------
    features_path : str
        Path to an ``.npz`` archive holding the arrays ``X_raw``, ``y`` and
        ``subject_ids``.
    output_path : str
        Destination ``.npz`` path. The archive is written with the keys
        ``X_raw`` (float32), ``C`` (float32 zeros), ``y`` (int64),
        ``subject_ids`` and ``concept_names``.
    dataset_name : str
        Label used only in the progress messages.
    concepts : sequence of str, optional
        Concept names defining the columns of ``C``. When omitted, the
        notebook-level ``concept_names`` array is used.

    Returns
    -------
    None
        If ``features_path`` does not exist, an error message is printed
        and nothing is written.

    Raises
    ------
    ValueError
        If ``X_raw``, ``y`` and ``subject_ids`` do not all have the same
        number of entries.
    """
    print(f"\n{'='*60}")
    print(f"Processing {dataset_name}")
    print(f"{'='*60}")

    # Load features
    if not os.path.exists(features_path):
        print(f"Error: Input file not found at {features_path}")
        return

    # NOTE(security): allow_pickle=True lets a crafted archive execute
    # arbitrary code on load. It is kept so that object-dtype arrays can be
    # read; only load archives produced by this project.
    # The context manager closes the underlying file once the arrays are read.
    with np.load(features_path, allow_pickle=True) as features:
        X_raw = features["X_raw"]
        y = features["y"]
        subject_ids = features["subject_ids"]

    # Cast labels up front: np.bincount rejects float/object arrays, and the
    # saved labels are int64 anyway.
    y = np.asarray(y).astype(np.int64)
    n_subjects = len(y)

    print(f"Loaded {n_subjects} subjects")
    print(f"X_raw shape: {X_raw.shape}")
    print(f"y shape: {y.shape}")

    # Refuse to write a misaligned dataset: every per-subject array must
    # have one entry per label.
    if len(X_raw) != n_subjects or len(subject_ids) != n_subjects:
        raise ValueError(
            f"Misaligned inputs in {features_path}: "
            f"X_raw has {len(X_raw)} rows, y has {n_subjects}, "
            f"subject_ids has {len(subject_ids)}"
        )

    print(f"Label distribution: {np.bincount(y)}")

    # Fall back to the notebook-level concept list when none is supplied.
    names = concept_names if concepts is None else np.asarray(concepts)

    # Create zero concept matrix
    # Shape: (n_subjects, number of concept names)
    C = np.zeros((n_subjects, len(names)), dtype=np.float32)

    print(f"\nCreated zero concept matrix: {C.shape}")
    print(f"  Rationale: Val/test subjects don't have ground-truth concept labels")
    print(f"  The model will learn to predict from embeddings primarily")

    # Convert X_raw to float32 to avoid object dtype issues
    X_raw_float = X_raw.astype(np.float32)
    print(f"\nConverted X_raw to dtype: {X_raw_float.dtype}")

    # Save
    np.savez(
        output_path,
        X_raw=X_raw_float,
        C=C,
        y=y,
        subject_ids=subject_ids,
        concept_names=names
    )

    print(f"\nSaved dataset to {output_path}")
    print(f"Done with {dataset_name}!\n")

print("Prepare function defined")

In [None]:
# Build the validation split: reads the validation feature archive and
# writes the aligned archive with a zero concept matrix.
prepare_cem_data(
    features_path=val_features,
    output_path=val_output,
    dataset_name="VALIDATION",
)

In [None]:
# Build the test split: reads the test feature archive and writes the
# aligned archive with a zero concept matrix.
prepare_cem_data(
    features_path=test_features,
    output_path=test_output,
    dataset_name="TEST",
)

In [None]:
def _summarize_split(title, path):
    """Print a short report for one saved split and return its arrays.

    Parameters
    ----------
    title : str
        Heading printed above the report (e.g. "Validation set").
    path : str
        Path of the ``.npz`` archive to inspect.

    Returns
    -------
    dict or None
        Mapping of array name to array, or None when ``path`` does not
        exist (in which case nothing is printed).
    """
    if not os.path.exists(path):
        return None

    # Copy the arrays out so the archive's file handle can be closed right away.
    # NOTE(security): allow_pickle=True is only safe for archives produced by
    # this project.
    with np.load(path, allow_pickle=True) as archive:
        arrays = {key: archive[key] for key in archive.files}

    print(f"\n{title}:")
    print(f"  - Subjects: {len(arrays['y'])}")
    print(f"  - X_raw shape: {arrays['X_raw'].shape}")
    print(f"  - C (concepts) shape: {arrays['C'].shape}")
    print(f"  - C sum (should be 0): {arrays['C'].sum()}")
    print(f"  - y shape: {arrays['y'].shape}")
    print(f"  - Concept names: {len(arrays['concept_names'])}")
    return arrays


print("\n" + "="*60)
print("SUMMARY")
print("="*60)

# Load and verify
val_data = _summarize_split("Validation set", val_output)
test_data = _summarize_split("Test set", test_output)

# Only claim success when both output archives are actually on disk.
missing_outputs = [
    path
    for path, data in ((val_output, val_data), (test_output, test_data))
    if data is None
]
if missing_outputs:
    print("\nWarning: expected output file(s) not found:")
    print("\n".join(f"  - {path}" for path in missing_outputs))
else:
    print("\nAll aligned data created successfully!")
    print("\nReady for training with proper train/val/test splits!")