# Dataset Preparation Pipeline

**Purpose:** Prepare the dataset once and save it for training different models.

**Runtime:** ~40-50 minutes (mostly SBERT encoding and retrieval)

This notebook:
1. Loads training and test data from XML files
2. Uses SBERT to retrieve the top-50 concept-relevant posts per subject (`k_posts` in the hyperparameters)
3. Pools each subject's post embeddings into a single vector using concept-guided attention weights
4. Saves everything to `data/processed/whole_attention_pipeline/`

**Run this ONCE**, then use `complete_cem_pipeline.ipynb` or `complete_cbm_pipeline.ipynb` for training.

## Section 0: Configuration & Setup

In [1]:
# Imports
import os
import glob
import re
import zipfile
import tempfile
import shutil
import json
import time

import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET

import torch
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split

print("✓ All imports successful")

  from tqdm.autonotebook import tqdm, trange


✓ All imports successful


In [2]:
# Seed every random number generator this notebook relies on, so that a
# fresh "Restart & Run All" reproduces the same results.
SEED = 42
torch.manual_seed(SEED)  # PyTorch default generator
np.random.seed(SEED)     # NumPy global RNG

print(f"✓ Random seed set to {SEED}")

✓ Random seed set to 42


In [3]:
# Detect device (MPS/CUDA/CPU)
# `torch.backends.mps` is not present on every PyTorch build, so it is looked
# up defensively: when the attribute is missing we fall through to the
# CUDA/CPU branches instead of raising AttributeError.
_mps_backend = getattr(torch.backends, "mps", None)

if _mps_backend is not None and _mps_backend.is_available():
    DEVICE = "mps"
    print("✓ Using MacBook GPU (MPS)")
elif torch.cuda.is_available():
    DEVICE = "cuda"
    print("✓ Using CUDA GPU")
else:
    DEVICE = "cpu"
    print("⚠ Using CPU (will be slow)")

✓ Using MacBook GPU (MPS)


In [4]:
# Define paths
# PROJECT_ROOT is the parent of the current working directory, so the notebook
# has to be started from a direct sub-folder of the project for these paths to
# resolve (NOTE(review): presumably a `notebooks/` folder — confirm).
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
DATA_RAW = os.path.join(PROJECT_ROOT, "data/raw")
DATA_PROCESSED = os.path.join(PROJECT_ROOT, "data/processed")

# Training data paths
# Both folders are searched recursively for `train_subject<N>_<chunk>.xml`
# files; POS_DIR files are labelled 1 and NEG_DIR files are labelled 0.
POS_DIR = os.path.join(DATA_RAW, "train/positive_examples_anonymous_chunks")
NEG_DIR = os.path.join(DATA_RAW, "train/negative_examples_anonymous_chunks")

# Test data paths
# TEST_DIR holds the `chunk <i>.zip` archives; TEST_LABELS is a tab-separated
# file with one `subject_id<TAB>label` row per test subject.
TEST_DIR = os.path.join(DATA_RAW, "test")
TEST_LABELS = os.path.join(TEST_DIR, "test_golden_truth.txt")

# Concept labels
# CSV with a `Subject` column and one column per concept name.
CONCEPTS_FILE = os.path.join(DATA_PROCESSED, "merged_questionnaires.csv")

# Output directory
# Created on the spot if it does not exist yet (no error if it already does).
SAVE_DIR = os.path.join(DATA_PROCESSED, "whole_attention_pipeline")
os.makedirs(SAVE_DIR, exist_ok=True)

print("✓ Paths configured")
print(f"  Project root: {PROJECT_ROOT}")
print(f"  Save directory: {SAVE_DIR}")

✓ Paths configured
  Project root: /Users/gualtieromarencoturi/Desktop/thesis/Master-Thesis-CEM-Depression-etc-case-study
  Save directory: /Users/gualtieromarencoturi/Desktop/thesis/Master-Thesis-CEM-Depression-etc-case-study/data/processed/whole_attention_pipeline


In [5]:
# Define 21 BDI-II concept names
# The order of this list fixes the row order of the concept embeddings, and the
# names are matched against the questionnaire CSV column headers, so the
# spelling must stay exactly as written.
CONCEPT_NAMES = [
    "Sadness",
    "Pessimism",
    "Past failure",
    "Loss of pleasure",
    "Guilty feelings",
    "Punishment feelings",
    "Self-dislike",
    "Self-criticalness",
    "Suicidal thoughts or wishes",
    "Crying",
    "Agitation",
    "Loss of interest",
    "Indecisiveness",
    "Worthlessness",
    "Loss of energy",
    "Changes in sleeping pattern",
    "Irritability",
    "Changes in appetite",
    "Concentration difficulty",
    "Tiredness or fatigue",
    "Loss of interest in sex",
]
N_CONCEPTS = len(CONCEPT_NAMES)

print(f"✓ Defined {N_CONCEPTS} BDI-II concepts")

✓ Defined 21 BDI-II concepts


In [6]:
# Hyperparameters
# Everything a reader might want to tune for this preprocessing run.
HYPERPARAMS = dict(
    k_posts=50,                      # Top-k posts per subject
    sbert_model="all-MiniLM-L6-v2",  # Sentence-BERT checkpoint name
    embedding_dim=384,               # Expected size of one sentence embedding
)

print("✓ Hyperparameters configured:")
for param_name, param_value in HYPERPARAMS.items():
    print(f"  {param_name}: {param_value}")

✓ Hyperparameters configured:
  k_posts: 50
  sbert_model: all-MiniLM-L6-v2
  embedding_dim: 384


## Section 1: Load Training Data

Extract 486 training subjects with posts and concept labels

In [7]:
# Helper functions for XML parsing
WHITESPACE_RE = re.compile(r"\s+")

def normalize_text(text):
    """Return ``text`` with NUL characters removed and whitespace collapsed.

    Every run of whitespace becomes a single space and the result is stripped
    at both ends. Falsy input (``None`` or an empty string) yields ``""``.
    """
    if text:
        without_nulls = text.replace("\u0000", "")
        return WHITESPACE_RE.sub(" ", without_nulls).strip()
    return ""

def extract_posts_from_xml(xml_path, min_chars=10):
    """Return the normalised posts contained in one subject XML file.

    Each ``WRITING`` child of the document root contributes one post, built by
    joining its ``TITLE`` and ``TEXT`` with a space and passing the result
    through ``normalize_text``. Posts shorter than ``min_chars`` characters
    after normalisation are discarded.

    If the file cannot be read or parsed, a warning is printed and an empty
    list is returned instead of raising.
    """
    try:
        root = ET.parse(xml_path).getroot()
    except Exception as err:
        print(f"WARNING: Failed to parse {xml_path}: {err}")
        return []

    def _combine(writing):
        # A missing or empty TITLE/TEXT element is treated as an empty string.
        title = writing.findtext("TITLE") or ""
        body = writing.findtext("TEXT") or ""
        return normalize_text(f"{title} {body}".strip())

    candidates = (_combine(writing) for writing in root.findall("WRITING"))
    return [post for post in candidates if len(post) >= min_chars]

print("✓ Helper functions defined")

✓ Helper functions defined


In [8]:
# Parse training XML files
print("Loading training data...")
start_time = time.time()

# File names look like `train_subject<N>_<chunk>.xml`; group 1 is the subject id.
_TRAIN_FILE_RE = re.compile(r"train_(subject\d+)_\d+\.xml")


def _load_train_posts(xml_dir, label):
    """Return one record per post for every training XML file under ``xml_dir``.

    Args:
        xml_dir: Folder searched recursively for ``*.xml`` files.
        label: Class label stored on every record (1 = depression, 0 = control).

    Returns:
        List of ``{"subject_id", "label", "text"}`` dicts. Files whose name
        does not match the training pattern are ignored.
    """
    # glob gives no ordering guarantee; sorting keeps the row order (and
    # everything derived from it downstream) identical across machines.
    xml_files = sorted(glob.glob(os.path.join(xml_dir, "**", "*.xml"), recursive=True))
    records = []
    for xml_file in xml_files:
        match = _TRAIN_FILE_RE.match(os.path.basename(xml_file))
        if not match:
            continue
        subject_id = match.group(1)
        records.extend(
            {"subject_id": subject_id, "label": label, "text": post}
            for post in extract_posts_from_xml(xml_file)
        )
    return records


# Process positive examples
print("  Processing positive examples...")
positive_records = _load_train_posts(POS_DIR, label=1)  # Positive (depression)
print(f"  Loaded {len(positive_records)} posts from positive subjects")

# Process negative examples
print("  Processing negative examples...")
negative_records = _load_train_posts(NEG_DIR, label=0)  # Negative (control)
print(f"  Loaded {len(negative_records)} posts from negative subjects")

train_data = positive_records + negative_records
if not train_data:
    # Without this guard an empty frame has no columns and the summary below
    # fails with an unhelpful KeyError on 'subject_id'.
    raise RuntimeError(
        f"No training posts found under {POS_DIR} or {NEG_DIR}; check the data paths"
    )

train_posts_df = pd.DataFrame(train_data)

print(f"\n✓ Loaded training data in {time.time()-start_time:.1f}s")
print(f"  Total posts: {len(train_posts_df):,}")
print(f"  Unique subjects: {train_posts_df['subject_id'].nunique()}")
print(f"  Label distribution:")
print(train_posts_df.groupby('label')['subject_id'].nunique())

Loading training data...
  Processing positive examples...
  Loaded 29868 posts from positive subjects
  Processing negative examples...

✓ Loaded training data in 2.8s
  Total posts: 286,740
  Unique subjects: 486
  Label distribution:
label
0    403
1     83
Name: subject_id, dtype: int64


In [9]:
# Load concept labels from questionnaires
print("Loading concept labels...")

concepts_df = pd.read_csv(CONCEPTS_FILE)
# "train_" is plain text, so do a literal replacement rather than a regex one.
concepts_df["subject_id"] = concepts_df["Subject"].str.replace("train_", "", regex=False)

# Fail early if the CSV lacks an expected concept column. Otherwise the column
# would be dropped silently and the training concept matrix would end up
# narrower than N_CONCEPTS.
missing_concepts = [name for name in CONCEPT_NAMES if name not in concepts_df.columns]
if missing_concepts:
    raise KeyError(f"{CONCEPTS_FILE} is missing concept columns: {missing_concepts}")

# Binarize concept values: any score above 0 counts as "concept present".
# NOTE: concept_cols follows the CSV column order, not the CONCEPT_NAMES order.
concept_cols = [col for col in concepts_df.columns if col in CONCEPT_NAMES]
concepts_df[concept_cols] = (concepts_df[concept_cols] > 0).astype(int)

print(f"✓ Loaded concept labels for {len(concepts_df)} subjects")

Loading concept labels...
✓ Loaded concept labels for 486 subjects


## Section 2: Load Test Data

Extract 401 test subjects and split into validation (200) and test (201)

In [10]:
# Extract test ZIP files to temporary directory
print("Extracting test data...")
temp_dir = tempfile.mkdtemp(prefix="test_chunks_")
print(f"  Temp directory: {temp_dir}")

# Archives are named "chunk 1.zip" ... "chunk 10.zip" inside TEST_DIR.
N_TEST_CHUNKS = 10
missing_chunks = []

for i in range(1, N_TEST_CHUNKS + 1):
    zip_path = os.path.join(TEST_DIR, f"chunk {i}.zip")
    if not os.path.exists(zip_path):
        # Remember the gap instead of skipping silently: a missing archive
        # means fewer XML files per test subject further down.
        missing_chunks.append(zip_path)
        continue
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(os.path.join(temp_dir, f"chunk_{i}"))
    if i % 3 == 0:
        print(f"  Extracted chunk {i}/{N_TEST_CHUNKS}")

if missing_chunks:
    print(f"⚠ {len(missing_chunks)} chunk archive(s) not found and skipped:")
    for missing_path in missing_chunks:
        print(f"    {missing_path}")

print("✓ Test data extracted")

Extracting test data...
  Temp directory: /var/folders/gb/m6c_r5xx6_14p7mlfjwk29900000gn/T/test_chunks_ym1pt74r
  Extracted chunk 3/10
  Extracted chunk 6/10
  Extracted chunk 9/10
✓ Test data extracted


In [11]:
# Load test labels
# The file has no header row: column 0 is the subject id, column 1 the label.
# Subject ids are stripped of surrounding whitespace so they can be compared
# with the ids parsed from the XML file names.
test_labels_df = pd.read_csv(
    TEST_LABELS,
    sep='\t',
    header=None,
    names=['subject_id', 'label'],
).assign(subject_id=lambda labels: labels['subject_id'].str.strip())

test_label_counts = test_labels_df['label'].value_counts()

print(f"✓ Loaded test labels for {len(test_labels_df)} subjects")
print("  Label distribution:")
print(test_label_counts)

✓ Loaded test labels for 401 subjects
  Label distribution:
label
0    349
1     52
Name: count, dtype: int64


In [12]:
# Parse test XML files
print("Loading test posts...")

# glob gives no ordering guarantee; sorting keeps the row order reproducible.
test_xml_files = sorted(glob.glob(os.path.join(temp_dir, "**", "*.xml"), recursive=True))
print(f"  Found {len(test_xml_files)} XML files")

# Build the subject -> label mapping once instead of filtering the label table
# for every file. drop_duplicates keeps the first row per subject, which is the
# row the previous per-file lookup used.
label_by_subject = (
    test_labels_df.drop_duplicates('subject_id')
    .set_index('subject_id')['label']
    .to_dict()
)

# File names look like `test_subject<N>_<chunk>.xml`; group 1 is the subject id.
_test_file_re = re.compile(r"(test_subject\d+)_\d+\.xml")

test_data = []
unlabelled_subjects = set()
for xml_file in test_xml_files:
    match = _test_file_re.match(os.path.basename(xml_file))
    if not match:
        continue
    subject_id = match.group(1)
    if subject_id not in label_by_subject:
        # Posts of subjects without a gold label cannot be used; keep track so
        # the drop is visible rather than silent.
        unlabelled_subjects.add(subject_id)
        continue
    label = label_by_subject[subject_id]
    for post in extract_posts_from_xml(xml_file):
        test_data.append({
            "subject_id": subject_id,
            "label": label,
            "text": post
        })

if unlabelled_subjects:
    print(f"⚠ Skipped {len(unlabelled_subjects)} subject(s) with no entry in {TEST_LABELS}")

if not test_data:
    # Without this guard an empty frame has no columns and the summary below
    # fails with an unhelpful KeyError on 'subject_id'.
    raise RuntimeError(f"No test posts found under {temp_dir}; check the extracted archives")

test_posts_df = pd.DataFrame(test_data)

print(f"✓ Loaded test posts")
print(f"  Total posts: {len(test_posts_df):,}")
print(f"  Unique subjects: {test_posts_df['subject_id'].nunique()}")

Loading test posts...
  Found 4010 XML files
✓ Loaded test posts
  Total posts: 229,746
  Unique subjects: 401


In [13]:
# Split test data into validation and test sets (stratified 50/50)
print("Splitting test data into validation and test...")

# One row per subject (label of its first post), so the split is made over
# subjects rather than over individual posts.
test_subjects = test_posts_df.groupby('subject_id')['label'].first().reset_index()

val_subjects, test_subjects_final = train_test_split(
    test_subjects['subject_id'],
    stratify=test_subjects['label'],
    test_size=0.5,
    random_state=SEED,
)

# Route every post to the split its subject was assigned to.
belongs_to_val = test_posts_df['subject_id'].isin(val_subjects)
belongs_to_test = test_posts_df['subject_id'].isin(test_subjects_final)
val_posts_df = test_posts_df[belongs_to_val].copy()
test_posts_df_final = test_posts_df[belongs_to_test].copy()

print("✓ Split complete")
print(f"  Validation: {val_posts_df['subject_id'].nunique()} subjects")
print(f"  Test: {test_posts_df_final['subject_id'].nunique()} subjects")

Splitting test data into validation and test...
✓ Split complete
  Validation: 200 subjects
  Test: 201 subjects


## Section 3: SBERT Setup & Concept Embeddings

In [14]:
# Load SBERT model
print(f"Loading SBERT model: {HYPERPARAMS['sbert_model']}")
# Hand the device to the constructor instead of calling `.to()` afterwards, so
# the model's own notion of its device matches where the weights live.
# NOTE(review): some older sentence-transformers releases appear to keep a
# separate target device for `encode()` that a later `.to()` does not update —
# confirm against the installed version.
sbert_model = SentenceTransformer(HYPERPARAMS['sbert_model'], device=DEVICE)

# Catch a mismatch between the configured and the actual embedding size here,
# rather than in a downstream training notebook.
sbert_dim = sbert_model.get_sentence_embedding_dimension()
if sbert_dim != HYPERPARAMS['embedding_dim']:
    raise ValueError(
        f"HYPERPARAMS['embedding_dim']={HYPERPARAMS['embedding_dim']} does not match "
        f"the {sbert_dim}-dimensional output of {HYPERPARAMS['sbert_model']}"
    )

print(f"✓ SBERT model loaded on {DEVICE}")
print(f"  Embedding dimension: {sbert_dim}")

Loading SBERT model: all-MiniLM-L6-v2
✓ SBERT model loaded on mps
  Embedding dimension: 384


In [15]:
# Create concept embeddings
print(f"Creating embeddings for {N_CONCEPTS} concepts...")

# One embedding row per entry of CONCEPT_NAMES, returned as a tensor so it can
# be passed straight to the cosine-similarity computations later on.
concept_encode_options = {"convert_to_tensor": True, "show_progress_bar": False}
concept_embeddings = sbert_model.encode(CONCEPT_NAMES, **concept_encode_options)

print("✓ Concept embeddings created")
print(f"  Shape: {concept_embeddings.shape}")

Creating embeddings for 21 concepts...
✓ Concept embeddings created
  Shape: torch.Size([21, 384])


## Section 4: Post Retrieval (Top-k per Subject)

**This is the slow part**

Select the k most concept-relevant posts for each subject

In [16]:
def retrieve_top_k_posts(subject_id, posts_df, concept_embs, sbert, k=50):
    """Pick the ``k`` posts of one subject that are closest to any concept.

    A post's score is its highest cosine similarity to any row of
    ``concept_embs``.

    Args:
        subject_id: Value to match in ``posts_df['subject_id']``.
        posts_df: DataFrame with at least ``subject_id`` and ``text`` columns.
        concept_embs: Concept embedding tensor compared against the posts.
        sbert: Encoder exposing ``encode(list_of_str, convert_to_tensor=True, ...)``.
        k: Number of posts to return.

    Returns:
        A list of post texts:
        * ``[]`` when the subject has no posts;
        * all posts, unchanged, when the subject has exactly ``k`` posts;
        * all posts followed by randomly re-sampled duplicates (drawn with the
          NumPy global RNG) up to length ``k`` when there are fewer than ``k``;
        * otherwise the ``k`` highest-scoring posts, best first.
    """
    is_subject = posts_df['subject_id'] == subject_id
    subject_posts = posts_df.loc[is_subject, 'text'].tolist()
    n_posts = len(subject_posts)

    if n_posts == 0:
        return []

    if n_posts == k:
        return subject_posts

    if n_posts < k:
        # Pad to a fixed length by repeating randomly chosen posts.
        # NOTE(review): the duplicates are pooled later like any other post,
        # which gives the re-sampled posts extra weight — confirm this is intended.
        duplicates = np.random.choice(subject_posts, size=k - n_posts, replace=True)
        return subject_posts + list(duplicates)

    # More posts than needed: rank them by their best concept similarity.
    embeddings = sbert.encode(
        subject_posts,
        convert_to_tensor=True,
        show_progress_bar=False
    )
    similarity = util.cos_sim(embeddings, concept_embs)
    best_per_post = similarity.max(dim=1).values.cpu().numpy()

    # Partitioning on every position 0..k-1 leaves the first k indices in
    # ascending order of the negated score, i.e. descending similarity.
    ranked = np.argpartition(-best_per_post, range(min(k, n_posts)))[:k]

    return [subject_posts[i] for i in ranked]

print("✓ Post retrieval function defined")

✓ Post retrieval function defined


In [18]:
# Retrieve top-k posts for all subjects
print(f"Retrieving top-{HYPERPARAMS['k_posts']} posts for all subjects...")
print("⏰ This will take some time")
start_time = time.time()


def _select_posts_for_split(posts_df, progress_every=None):
    """Run ``retrieve_top_k_posts`` for every subject of one split.

    Args:
        posts_df: Post table of the split (``subject_id`` and ``text`` columns).
        progress_every: If set, print a progress line every that many subjects.

    Returns:
        ``(subject_ids, selected)`` where ``subject_ids`` is the array of unique
        subject ids in order of first appearance and ``selected`` maps each id
        to its list of retrieved posts.
    """
    subject_ids = posts_df['subject_id'].unique()
    selected = {}
    for idx, subject_id in enumerate(subject_ids):
        selected[subject_id] = retrieve_top_k_posts(
            subject_id, posts_df, concept_embeddings, sbert_model, k=HYPERPARAMS['k_posts']
        )
        if progress_every and (idx + 1) % progress_every == 0:
            print(f"    Processed {idx + 1}/{len(subject_ids)} subjects")
    return subject_ids, selected


# Training subjects
print("  Processing training subjects...")
train_subjects, train_selected = _select_posts_for_split(train_posts_df, progress_every=100)

# Validation subjects
print("  Processing validation subjects...")
val_subjects, val_selected = _select_posts_for_split(val_posts_df)

# Test subjects
print("  Processing test subjects...")
test_subjects, test_selected = _select_posts_for_split(test_posts_df_final)

# Measure once so both figures in the message describe the same instant.
elapsed = time.time() - start_time
print(f"\n✓ Post retrieval complete in {elapsed:.1f}s ({elapsed/60:.1f} min)")

Retrieving top-50 posts for all subjects...
⏰ This will take some time
  Processing training subjects...
    Processed 100/486 subjects
    Processed 200/486 subjects
    Processed 300/486 subjects
    Processed 400/486 subjects
  Processing validation subjects...
  Processing test subjects...

✓ Post retrieval complete in 2130.5s (35.5 min)


## Section 5: Embedding Aggregation (Attention Pooling)


In [19]:
def encode_and_attention_pool(selected_posts_dict, sbert, concept_embs, temperature=1.0):
    """
    Encode selected posts and aggregate them using concept-guided attention.

    For every subject, each post is scored by its maximum cosine similarity to
    any concept embedding. The scores are turned into attention weights with a
    softmax (so weights grow exponentially, not linearly, with the score) and
    the subject vector is the weighted sum of the post embeddings.

    Args:
        selected_posts_dict: Mapping ``subject_id -> list of post texts``.
        sbert: Encoder exposing ``encode(list_of_str, convert_to_tensor=True, ...)``.
        concept_embs: Concept embedding tensor; its second dimension is taken
            as the embedding size when there are no subjects.
        temperature: Softmax temperature applied to the scores. ``1.0`` (the
            default) reproduces the plain softmax; smaller values concentrate
            the weight on the best-matching posts.

    Returns:
        ``(X, subject_ids)``: ``X`` is a NumPy array with one pooled embedding
        row per subject and ``subject_ids`` lists the ids in the same row order.

    Raises:
        ValueError: If ``temperature`` is not positive, or if any subject has
            an empty post list (there is nothing to pool).
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    subject_ids = list(selected_posts_dict.keys())

    # Nothing to pool: return an empty matrix with the right width instead of
    # letting np.vstack fail on an empty list.
    if not subject_ids:
        return np.empty((0, concept_embs.shape[1]), dtype=np.float32), subject_ids

    # Validate up front so a bad entry is reported before any slow encoding.
    empty_subjects = [sid for sid in subject_ids if len(selected_posts_dict[sid]) == 0]
    if empty_subjects:
        raise ValueError(f"Subjects with no selected posts to pool: {empty_subjects}")

    pooled_embeddings = []

    for subject_id in subject_ids:
        posts = selected_posts_dict[subject_id]

        # Encode posts
        post_embs = sbert.encode(
            posts,
            convert_to_tensor=True,
            show_progress_bar=False
        )  # shape: [k, d]

        # Compute similarity to concepts
        cos_scores = util.cos_sim(post_embs, concept_embs)  # [k, n_concepts]
        post_scores = cos_scores.max(dim=1).values          # [k]

        # Attention weights (softmax over this subject's posts)
        attn_weights = torch.softmax(post_scores / temperature, dim=0)  # [k]

        # Weighted sum
        pooled = torch.sum(attn_weights.unsqueeze(1) * post_embs, dim=0)
        pooled_embeddings.append(pooled.cpu().numpy())

    return np.vstack(pooled_embeddings), subject_ids


In [20]:
# Encode the selected posts and attention-pool them for all splits
print("Encoding and averaging embeddings...")

start_time = time.time()

# Every split is pooled with the same encoder and the same concept embeddings.
pooling_args = (sbert_model, concept_embeddings)

print("  Training set...")
X_train, train_subject_ids = encode_and_attention_pool(train_selected, *pooling_args)
print(f"    X_train shape: {X_train.shape}")

print("  Validation set...")
X_val, val_subject_ids = encode_and_attention_pool(val_selected, *pooling_args)
print(f"    X_val shape: {X_val.shape}")

print("  Test set...")
X_test, test_subject_ids = encode_and_attention_pool(test_selected, *pooling_args)
print(f"    X_test shape: {X_test.shape}")

print(f"\n✓ Encoding complete in {time.time()-start_time:.1f}s ({(time.time()-start_time)/60:.1f} min)")

Encoding and averaging embeddings...
  Training set...
    X_train shape: (486, 384)
  Validation set...
    X_val shape: (200, 384)
  Test set...
    X_test shape: (201, 384)

✓ Encoding complete in 339.0s (5.7 min)


## Section 6: Build Concept Matrices and Labels

In [21]:
# Build concept matrices and label vectors
print("Building concept matrices and labels...")


def _subject_labels(subject_ids, posts_df):
    """Return the float32 label of each subject, in the order of ``subject_ids``.

    The label of a subject is the label of its first post in ``posts_df``.
    Raises ``KeyError`` if a subject id has no posts in ``posts_df``.
    """
    # One groupby instead of one full boolean-mask scan per subject.
    first_label = posts_df.groupby('subject_id')['label'].first()
    return first_label.loc[list(subject_ids)].to_numpy(dtype=np.float32)


# Training: get concepts from questionnaires
# drop_duplicates keeps the first questionnaire row per subject.
concept_table = (
    concepts_df.drop_duplicates('subject_id')
    .set_index('subject_id')[concept_cols]
)
if concept_table.shape[1] != N_CONCEPTS:
    # The validation/test matrices below are N_CONCEPTS wide; a narrower
    # training matrix would be inconsistent with them.
    raise ValueError(
        f"Expected {N_CONCEPTS} concept columns in the questionnaire data, "
        f"found {concept_table.shape[1]}"
    )

has_questionnaire = pd.Index(train_subject_ids).isin(concept_table.index)
n_without_questionnaire = int((~has_questionnaire).sum())
if n_without_questionnaire:
    # These subjects still get an all-zero concept row, but no longer silently.
    print(f"⚠ {n_without_questionnaire} training subject(s) have no questionnaire row; "
          f"using all-zero concepts for them")

C_train = (
    concept_table.reindex(train_subject_ids)
    .fillna(0)
    .to_numpy(dtype=np.float32)
)
y_train = _subject_labels(train_subject_ids, train_posts_df)

# Validation: zeros for concepts (no ground truth)
C_val = np.zeros((len(val_subject_ids), N_CONCEPTS), dtype=np.float32)
y_val = _subject_labels(val_subject_ids, val_posts_df)

# Test: zeros for concepts
C_test = np.zeros((len(test_subject_ids), N_CONCEPTS), dtype=np.float32)
y_test = _subject_labels(test_subject_ids, test_posts_df_final)

print("✓ Matrices built")
print(f"  Train: X={X_train.shape}, C={C_train.shape}, y={y_train.shape}")
print(f"  Val:   X={X_val.shape}, C={C_val.shape}, y={y_val.shape}")
print(f"  Test:  X={X_test.shape}, C={C_test.shape}, y={y_test.shape}")
print(f"\n  Training label distribution: {np.bincount(y_train.astype(int))}")
print(f"  Validation label distribution: {np.bincount(y_val.astype(int))}")
print(f"  Test label distribution: {np.bincount(y_test.astype(int))}")

Building concept matrices and labels...
✓ Matrices built
  Train: X=(486, 384), C=(486, 21), y=(486,)
  Val:   X=(200, 384), C=(200, 21), y=(200,)
  Test:  X=(201, 384), C=(201, 21), y=(201,)

  Training label distribution: [403  83]
  Validation label distribution: [174  26]
  Test label distribution: [175  26]


## Section 7: Compute Class Weights

In [22]:
# Compute class weights for imbalanced dataset
# pos_weight is the number of negative training subjects per positive one.
# NOTE(review): presumably consumed as the positive-class loss weight by the
# training notebooks — confirm there.
n_positive = int(np.sum(y_train == 1))
n_negative = int(np.sum(y_train == 0))
pos_weight = n_negative / n_positive

print("Class imbalance:")
print(f"  Negative samples: {n_negative}")
print(f"  Positive samples: {n_positive}")
print(f"  Ratio: 1:{pos_weight:.2f}")
print(f"  Computed pos_weight: {pos_weight:.4f}")

Class imbalance:
  Negative samples: 403
  Positive samples: 83
  Ratio: 1:4.86
  Computed pos_weight: 4.8554


## Section 8: Save All Datasets

Save everything for fast loading by training pipelines

In [23]:
# Save processed datasets to disk
print("Saving datasets...")

# Save numpy arrays: one compressed archive per split, each holding the
# features (X), the concept matrix (C), the labels (y) and the subject ids in
# matching row order.
split_archives = {
    "train_data.npz": (X_train, C_train, y_train, train_subject_ids),
    "val_data.npz": (X_val, C_val, y_val, val_subject_ids),
    "test_data.npz": (X_test, C_test, y_test, test_subject_ids),
}
for archive_name, (split_X, split_C, split_y, split_ids) in split_archives.items():
    np.savez_compressed(
        os.path.join(SAVE_DIR, archive_name),
        X=split_X,
        C=split_C,
        y=split_y,
        subject_ids=np.array(split_ids)
    )

# Save class weights info
class_info = dict(
    n_positive=n_positive,
    n_negative=n_negative,
    pos_weight=float(pos_weight),
)

class_weights_path = os.path.join(SAVE_DIR, "class_weights.json")
with open(class_weights_path, 'w') as f:
    json.dump(class_info, f, indent=4)

print(f"✓ Datasets saved to {SAVE_DIR}")
print(f"  train_data.npz: {X_train.shape[0]} samples")
print(f"  val_data.npz:   {X_val.shape[0]} samples")
print(f"  test_data.npz:  {X_test.shape[0]} samples")
print("  class_weights.json")

Saving datasets...
✓ Datasets saved to /Users/gualtieromarencoturi/Desktop/thesis/Master-Thesis-CEM-Depression-etc-case-study/data/processed/whole_attention_pipeline
  train_data.npz: 486 samples
  val_data.npz:   200 samples
  test_data.npz:  201 samples
  class_weights.json


## Section 9: Cleanup

In [24]:
# Clean up temporary directory
# Best effort: a failed removal is reported but does not stop the notebook.
try:
    shutil.rmtree(temp_dir)
except Exception as cleanup_error:
    print(f"⚠ Failed to clean up temporary directory: {cleanup_error}")
else:
    print(f"✓ Cleaned up temporary directory: {temp_dir}")

✓ Cleaned up temporary directory: /var/folders/gb/m6c_r5xx6_14p7mlfjwk29900000gn/T/test_chunks_ym1pt74r


In [25]:
# Final summary: list what was written and which notebooks consume it.
banner = "=" * 70

print("\n" + banner)
print("          DATASET PREPARATION COMPLETE")
print(banner)

print("\nSaved files:")
for saved_name in ("train_data.npz", "val_data.npz", "test_data.npz", "class_weights.json"):
    print(f"  {SAVE_DIR}/{saved_name}")

print("\nYou can now use:")
for notebook_name in ("complete_cem_pipeline.ipynb", "complete_cbm_pipeline.ipynb"):
    print(f"  - {notebook_name}")

print("\nto train models without re-running this long preprocessing!")
print(banner)


          DATASET PREPARATION COMPLETE

Saved files:
  /Users/gualtieromarencoturi/Desktop/thesis/Master-Thesis-CEM-Depression-etc-case-study/data/processed/whole_attention_pipeline/train_data.npz
  /Users/gualtieromarencoturi/Desktop/thesis/Master-Thesis-CEM-Depression-etc-case-study/data/processed/whole_attention_pipeline/val_data.npz
  /Users/gualtieromarencoturi/Desktop/thesis/Master-Thesis-CEM-Depression-etc-case-study/data/processed/whole_attention_pipeline/test_data.npz
  /Users/gualtieromarencoturi/Desktop/thesis/Master-Thesis-CEM-Depression-etc-case-study/data/processed/whole_attention_pipeline/class_weights.json

You can now use:
  - complete_cem_pipeline.ipynb
  - complete_cbm_pipeline.ipynb

to train models without re-running this long preprocessing!
