# Build Test Manifest

This notebook processes the test data from `/data/raw/test/` and creates validation and test sets.

Steps:
1. Extract ZIP files containing test subject XMLs
2. Load test_golden_truth.txt for labels
3. Build manifest of test subjects
4. Extract all posts from XMLs
5. Retrieve top-20 posts per subject using concept-embedding similarity
6. Split 50/50 into validation and test sets (stratified)
7. Save two CSV files

In [1]:
import os
import glob
import re
import zipfile
import tempfile
import shutil
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Paths
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
test_dir = os.path.join(project_root, "data/raw/test")
golden_truth_path = os.path.join(test_dir, "test_golden_truth.txt")
output_dir = os.path.join(project_root, "data/processed")

print(f"Test directory: {test_dir}")
print(f"Golden truth: {golden_truth_path}")
print(f"Output directory: {output_dir}")

Test directory: /Users/gualtieromarencoturi/Desktop/thesis/Master-Thesis-CEM-Depression-etc-case-study/data/raw/test
Golden truth: /Users/gualtieromarencoturi/Desktop/thesis/Master-Thesis-CEM-Depression-etc-case-study/data/raw/test/test_golden_truth.txt
Output directory: /Users/gualtieromarencoturi/Desktop/thesis/Master-Thesis-CEM-Depression-etc-case-study/data/processed


In [3]:
# Extract ZIP files to temporary directory
temp_dir = tempfile.mkdtemp(prefix="test_chunks_")
print(f"Extracting test chunks to: {temp_dir}")

for i in range(1, 11):
    zip_path = os.path.join(test_dir, f"chunk {i}.zip")
    if os.path.exists(zip_path):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(os.path.join(temp_dir, f"chunk_{i}"))
        print(f"Extracted chunk {i}")
    else:
        print(f"Warning: {zip_path} not found")

print(f"\nExtraction complete. Temporary directory: {temp_dir}")

Extracting test chunks to: /var/folders/gb/m6c_r5xx6_14p7mlfjwk29900000gn/T/test_chunks_wgvdks_5
Extracted chunk 1
Extracted chunk 2
Extracted chunk 3
Extracted chunk 4
Extracted chunk 5
Extracted chunk 6
Extracted chunk 7
Extracted chunk 8
Extracted chunk 9
Extracted chunk 10

Extraction complete. Temporary directory: /var/folders/gb/m6c_r5xx6_14p7mlfjwk29900000gn/T/test_chunks_wgvdks_5


In [4]:
# Load golden truth labels (TAB-separated)
labels_df = pd.read_csv(golden_truth_path, sep='\t', header=None, names=['subject_id', 'label'])

# Strip whitespace from subject_id column (file has leading spaces)
labels_df['subject_id'] = labels_df['subject_id'].str.strip()

print(f"Loaded {len(labels_df)} test subjects")
print(f"\nLabel distribution:")
print(labels_df['label'].value_counts())
print(f"\nFirst few subjects:")
print(labels_df.head())

Loaded 401 test subjects

Label distribution:
label
0    349
1     52
Name: count, dtype: int64

First few subjects:
         subject_id  label
0  test_subject9942      1
1  test_subject3986      1
2  test_subject6794      1
3  test_subject8969      1
4  test_subject3988      1


In [5]:
# Helper functions (reused from training notebook)
WHITESPACE_RE = re.compile(r"\s+")

def _normalize_text(t: str) -> str:
    t = t or ""
    t = t.replace("\u0000", "")
    t = WHITESPACE_RE.sub(" ", t).strip()
    return t

def extract_texts_from_xml(path, min_chars=10):
    """
    Given one subject chunk XML, extract posts.
    Each <WRITING> becomes a post: TITLE + TEXT (concatenated).
    Returns a list of strings.
    """
    try:
        tree = ET.parse(path)
        root = tree.getroot()
    except Exception as e:
        print(f"[XML-Parse-Error] {path}: {e}")
        return []

    posts = []
    for writing in root.findall("WRITING"):
        title = writing.findtext("TITLE") or ""
        text  = writing.findtext("TEXT") or ""

        combined = _normalize_text(f"{title} {text}".strip())
        if len(combined) >= min_chars:
            posts.append(combined)

    return posts

print("Helper functions loaded")

Helper functions loaded


In [6]:
# Build manifest: find all XML files for each test subject
manifest = {}

# Find all XML files in temporary directory
pattern = os.path.join(temp_dir, "**", "*.xml")
all_files = glob.glob(pattern, recursive=True)

print(f"Found {len(all_files)} XML files")

for filepath in all_files:
    filename = os.path.basename(filepath)
    # Extract subject ID from filename (e.g., "test_subject1005_1.xml")
    match = re.match(r"(test_subject\d+)_\d+\.xml", filename)
    if match:
        subject_id = match.group(1)
        if subject_id not in manifest:
            manifest[subject_id] = []
        manifest[subject_id].append(filepath)

print(f"Found {len(manifest)} unique subjects in XML files")
print(f"Labels file has {len(labels_df)} subjects")

# Build DataFrame
manifest_data = []
subjects_without_labels = []

for subject_id, files in manifest.items():
    # Get label from labels_df
    label_row = labels_df[labels_df['subject_id'] == subject_id]
    if len(label_row) > 0:
        label = label_row.iloc[0]['label']
        manifest_data.append({
            "subject_id": subject_id,
            "chunks": sorted(files),
            "label": label
        })
    else:
        subjects_without_labels.append(subject_id)

manifest_df = pd.DataFrame(manifest_data)

print(f"\nBuilt manifest for {len(manifest_df)} subjects")
if subjects_without_labels:
    print(f"Warning: {len(subjects_without_labels)} subjects in XML files but not in labels file:")
    print(f"  First few: {subjects_without_labels[:5]}")

print(f"\nLabel distribution in manifest:")
print(manifest_df['label'].value_counts())
print(f"\nFirst few rows:")
print(manifest_df.head())

Found 4010 XML files
Found 401 unique subjects in XML files
Labels file has 401 subjects

Built manifest for 401 subjects

Label distribution in manifest:
label
0    349
1     52
Name: count, dtype: int64

First few rows:
         subject_id                                             chunks  label
0  test_subject3081  [/var/folders/gb/m6c_r5xx6_14p7mlfjwk29900000g...      0
1  test_subject2751  [/var/folders/gb/m6c_r5xx6_14p7mlfjwk29900000g...      0
2  test_subject6974  [/var/folders/gb/m6c_r5xx6_14p7mlfjwk29900000g...      0
3   test_subject954  [/var/folders/gb/m6c_r5xx6_14p7mlfjwk29900000g...      0
4  test_subject4471  [/var/folders/gb/m6c_r5xx6_14p7mlfjwk29900000g...      1


In [7]:
# Extract all posts from XMLs
def explode_manifest_to_posts(df_manifest):
    """
    Expand manifest DataFrame into a DataFrame of posts.
    Each row = one post with subject_id, label, text.
    """
    rows = []

    for _, row in df_manifest.iterrows():
        subject_id = row["subject_id"]
        label = row["label"]
        chunk_paths = row["chunks"]

        for file_path in chunk_paths:
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()
            except Exception as e:
                print(f"[XML-Parse-Error] {file_path}: {e}")
                continue

            for writing in root.findall("WRITING"):
                title = (writing.findtext("TITLE") or "").strip()
                text = (writing.findtext("TEXT") or "").strip()

                full_text = f"{title}\n{text}" if title else text
                full_text = full_text.strip()

                if full_text:
                    rows.append({
                        "subject_id": subject_id,
                        "label": label,
                        "text": full_text
                    })

    return pd.DataFrame(rows)

posts_df = explode_manifest_to_posts(manifest_df)
print(f"Extracted {len(posts_df)} posts from {posts_df['subject_id'].nunique()} subjects")
print(f"\nFirst few posts:")
print(posts_df.head())

Extracted 236326 posts from 401 subjects

First few posts:
         subject_id  label                                               text
0  test_subject3081      0                                          Damn it..
1  test_subject3081      0                    Easily my favorite song by Tool
2  test_subject3081      0  Heard a rumor that they are in the studio and ...
3  test_subject3081      0                      You can never be too prepared
4  test_subject3081      0                      House of Pies in Houston, Tx?


In [8]:
# Load SBERT model
import torch

MODEL_NAME = "all-MiniLM-L6-v2"
sbert_model = SentenceTransformer(MODEL_NAME)

# Check if GPU is available and move model to GPU
if torch.backends.mps.is_available():
    device = "mps"
    print("✓ Using MacBook GPU (MPS)")
elif torch.cuda.is_available():
    device = "cuda"
    print("✓ Using CUDA GPU")
else:
    device = "cpu"
    print("⚠ Using CPU (this will be slow)")

# Move model to device
sbert_model = sbert_model.to(device)
print(f"SBERT model device: {sbert_model.device}")
print(f"Loaded SBERT model: {MODEL_NAME}")

✓ Using MacBook GPU (MPS)
SBERT model device: mps:0
Loaded SBERT model: all-MiniLM-L6-v2


In [9]:
# Create concept embeddings (same 21 BDI-II concepts as training)
concept_names = [
    "Sadness", "Pessimism", "Past failure", "Loss of pleasure",
    "Guilty feelings", "Punishment feelings", "Self-dislike", "Self-criticalness",
    "Suicidal thoughts or wishes", "Crying", "Agitation", "Loss of interest",
    "Indecisiveness", "Worthlessness", "Loss of energy", "Changes in sleeping pattern",
    "Irritability", "Changes in appetite", "Concentration difficulty",
    "Tiredness or fatigue", "Loss of interest in sex"
]

print(f"Creating embeddings for {len(concept_names)} concepts...")
concept_embeddings = sbert_model.encode(concept_names, convert_to_tensor=True)
print(f"Concept embeddings shape: {concept_embeddings.shape}")

Creating embeddings for 21 concepts...
Concept embeddings shape: torch.Size([21, 384])


In [10]:
def retrieve_posts_for_subject_concept_sim(subject_id, posts_df, concept_embeddings, k=20):
    """
    Retrieve top-k posts for a subject using concept-embedding similarity.
    
    This function:
    1. Embeds all posts for the subject
    2. Computes cosine similarity between each post and ALL 21 concept embeddings
    3. For each post, takes the max similarity across all concepts as the relevance score
    4. Returns top-k posts with highest concept-relevance scores
    """
    # Get subject's posts
    subj_posts = posts_df[posts_df["subject_id"] == subject_id]["text"].tolist()
    
    if len(subj_posts) == 0:
        return []
    
    # If subject has fewer posts than k, return all posts with padding
    if len(subj_posts) <= k:
        if len(subj_posts) < k:
            extra_needed = k - len(subj_posts)
            padding = list(np.random.choice(subj_posts, size=extra_needed, replace=True))
            return subj_posts + padding
        else:
            return subj_posts
    
    # Embed all subject's posts
    post_embeddings = sbert_model.encode(subj_posts, convert_to_tensor=True)
    
    # Compute cosine similarity: [num_posts, num_concepts]
    cos_scores = util.cos_sim(post_embeddings, concept_embeddings)
    
    # For each post, take the maximum similarity across all concepts
    max_sim_scores = cos_scores.max(dim=1).values.cpu().numpy()
    
    # Select top-k posts by relevance score
    top_k_indices = np.argpartition(-max_sim_scores, range(min(k, len(subj_posts))))[:k]
    
    selected_posts = [subj_posts[i] for i in top_k_indices]
    
    return selected_posts

print("Retrieval function defined")

Retrieval function defined


In [11]:
# Retrieve top-20 posts for each subject
retrieved_data = []

print("Retrieving posts using concept-embedding similarity...")
print(f"Processing {len(manifest_df)} test subjects...")

import time
start_time = time.time()

for idx, row in manifest_df.iterrows():
    subject_id = row["subject_id"]
    label = row["label"]
    
    # Progress indicator every 50 subjects
    if (idx + 1) % 50 == 0:
        elapsed = time.time() - start_time
        avg_time = elapsed / (idx + 1)
        remaining = avg_time * (len(manifest_df) - idx - 1)
        print(f"  Processed {idx + 1}/{len(manifest_df)} subjects "
              f"({elapsed/60:.1f}min elapsed, ~{remaining/60:.1f}min remaining)")
    
    posts = retrieve_posts_for_subject_concept_sim(
        subject_id, 
        posts_df, 
        concept_embeddings, 
        k=20
    )
    
    for p in posts:
        retrieved_data.append({
            "subject_id": subject_id,
            "label": label,
            "text": p
        })

total_time = time.time() - start_time
print(f"\nCompleted in {total_time/60:.1f} minutes")

retrieved_df = pd.DataFrame(retrieved_data)
print(f"\nResults:")
print(f"  Total retrieved posts: {len(retrieved_df)}")
print(f"  Unique subjects: {retrieved_df['subject_id'].nunique()}")
print(f"  Posts per subject: {len(retrieved_df) / retrieved_df['subject_id'].nunique():.1f}")
print(f"\nFirst few rows:")
print(retrieved_df.head())

Retrieving posts using concept-embedding similarity...
Processing 401 test subjects...
  Processed 50/401 subjects (1.3min elapsed, ~9.2min remaining)
  Processed 100/401 subjects (2.8min elapsed, ~8.6min remaining)
  Processed 150/401 subjects (4.5min elapsed, ~7.5min remaining)
  Processed 200/401 subjects (6.2min elapsed, ~6.2min remaining)
  Processed 250/401 subjects (8.3min elapsed, ~5.0min remaining)
  Processed 300/401 subjects (10.8min elapsed, ~3.7min remaining)
  Processed 350/401 subjects (13.1min elapsed, ~1.9min remaining)
  Processed 400/401 subjects (15.1min elapsed, ~0.0min remaining)

Completed in 15.2 minutes

Results:
  Total retrieved posts: 8020
  Unique subjects: 401
  Posts per subject: 20.0

First few rows:
         subject_id  label                                               text
0  test_subject3081      0             You can't blame him for being hopeful.
1  test_subject3081      0                 Worst band ever. Rivals Nickelback
2  test_subject3081     

In [12]:
# Split 50/50 into validation and test (stratified by label)
# First, get unique subjects with their labels
subject_labels = retrieved_df.groupby('subject_id')['label'].first().reset_index()

# Stratified split
val_subjects, test_subjects = train_test_split(
    subject_labels['subject_id'],
    test_size=0.5,
    stratify=subject_labels['label'],
    random_state=42
)

# Create validation and test dataframes
val_df = retrieved_df[retrieved_df['subject_id'].isin(val_subjects)].copy()
test_df = retrieved_df[retrieved_df['subject_id'].isin(test_subjects)].copy()

print(f"Validation set: {val_df['subject_id'].nunique()} subjects, {len(val_df)} posts")
print(f"Test set: {test_df['subject_id'].nunique()} subjects, {len(test_df)} posts")

print(f"\nValidation label distribution:")
print(val_df.groupby('label')['subject_id'].nunique())

print(f"\nTest label distribution:")
print(test_df.groupby('label')['subject_id'].nunique())

Validation set: 200 subjects, 4000 posts
Test set: 201 subjects, 4020 posts

Validation label distribution:
label
0    174
1     26
Name: subject_id, dtype: int64

Test label distribution:
label
0    175
1     26
Name: subject_id, dtype: int64


In [13]:
# Save validation and test sets
val_path = os.path.join(output_dir, "retrieved_test_validation.csv")
test_path = os.path.join(output_dir, "retrieved_test_test.csv")

val_df.to_csv(val_path, index=False)
test_df.to_csv(test_path, index=False)

print(f"Saved validation set to: {val_path}")
print(f"Saved test set to: {test_path}")

Saved validation set to: /Users/gualtieromarencoturi/Desktop/thesis/Master-Thesis-CEM-Depression-etc-case-study/data/processed/retrieved_test_validation.csv
Saved test set to: /Users/gualtieromarencoturi/Desktop/thesis/Master-Thesis-CEM-Depression-etc-case-study/data/processed/retrieved_test_test.csv


In [14]:
# Clean up temporary directory
shutil.rmtree(temp_dir)
print(f"Cleaned up temporary directory: {temp_dir}")
print("\nDone!")

Cleaned up temporary directory: /var/folders/gb/m6c_r5xx6_14p7mlfjwk29900000gn/T/test_chunks_wgvdks_5

Done!
