# Questionnaire Loader Debug Notebook

This notebook allows manual inspection and debugging of the QuestionnaireLoader.

## What it loads:
- Demographics: Age, Gender, Height, Weight, etc.
- Mental Health: OASIS, PCL-5, PHQ9, GAD7, Depression, Anxiety
- Personality: Big 5 traits (Extraversion, Agreeableness, etc.)
- Lifestyle: Exercise, Caffeine, Sleep (PSQI), Screen time
- Socioeconomic: Education, Work status, Salary

In [None]:
import os
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd
import numpy as np

# Load environment variables (python-dotenv)
load_dotenv()

# Display settings: max_columns and width are set to None, max_rows to 100.
_DISPLAY_OPTIONS = (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', None),
)
for _option_name, _option_value in _DISPLAY_OPTIONS:
    pd.set_option(_option_name, _option_value)

## 1. Configuration

Set up paths. Both paths are read from environment variables and can be overridden in the cell below; questionnaire data is typically a standalone CSV file.

In [None]:
# Questionnaire CSV path (typically the IntegratedQ export file) and sessions
# CSV path, both read from the environment. Each is None when its variable is
# not defined.
QUESTIONNAIRE_CSV = os.environ.get("QUESTIONNAIRE_CSV")
SESSIONS_CSV = os.environ.get("SESSIONS_CSV")

# Optionally override paths here:
# QUESTIONNAIRE_CSV = "/path/to/Qcenter_-_IntegratedQ.csv"
# SESSIONS_CSV = "/path/to/sessions.csv"

# Echo the effective configuration, one value per line.
print(
    "Configuration:",
    f"  QUESTIONNAIRE_CSV: {QUESTIONNAIRE_CSV}",
    f"  SESSIONS_CSV: {SESSIONS_CSV}",
    sep="\n",
)

## 2. Verify Paths Exist

In [None]:
def check_path(path, name):
    """Report whether ``path`` is configured and present on disk.

    Prints one status line for ``name``: ``NOT SET``, ``EXISTS`` or
    ``MISSING`` (the latter two followed by the path).

    Args:
        path: Filesystem path (string or path-like), or ``None``.
        name: Label printed in front of the status.

    Returns:
        ``True`` if ``path`` is set and exists, ``False`` otherwise.
    """
    # An empty or blank string (e.g. ``QUESTIONNAIRE_CSV=`` in a .env file)
    # counts as "not set". Without this check ``Path("")`` is the current
    # directory, which exists, so the variable would be reported as EXISTS.
    if path is None or not str(path).strip():
        print(f"  {name}: NOT SET")
        return False
    p = Path(path)
    exists = p.exists()
    print(f"  {name}: {'EXISTS' if exists else 'MISSING'} - {p}")
    return exists

# Check both configured paths (questionnaire first, then sessions) and keep
# the boolean result of each check.
print("Path verification:")
questionnaire_exists, sessions_exists = (
    check_path(configured_path, label)
    for configured_path, label in (
        (QUESTIONNAIRE_CSV, "QUESTIONNAIRE_CSV"),
        (SESSIONS_CSV, "SESSIONS_CSV"),
    )
)

## 3. Inspect Raw CSV Structure

In [None]:
# Read the questionnaire CSV as-is so its structure can be inspected.
# raw_df is None when the path is not configured or the file is missing.
if not (QUESTIONNAIRE_CSV and Path(QUESTIONNAIRE_CSV).exists()):
    raw_df = None
    print("Questionnaire CSV not found")
else:
    raw_df = pd.read_csv(QUESTIONNAIRE_CSV)
    n_raw_rows = len(raw_df)
    n_raw_cols = len(raw_df.columns)
    print(f"Raw questionnaire CSV loaded: {n_raw_rows} rows, {n_raw_cols} columns")
    print(f"\nColumn names ({n_raw_cols} total):")
    # Number the columns from 1, right-aligned in a 3-character field.
    for position, column_name in enumerate(raw_df.columns, start=1):
        print(f"  {position:3}. {column_name}")

In [None]:
# Preview the first five raw rows (skipped when the raw CSV was not loaded).
if raw_df is not None:
    print("First 5 rows:")
    first_rows = raw_df.head(5)
    display(first_rows)

In [None]:
# List the dtype of every raw column (skipped when the raw CSV was not loaded).
if raw_df is not None:
    column_dtypes = raw_df.dtypes
    print("Data types:", column_dtypes.to_string(), sep="\n")

## 4. Initialize QuestionnaireLoader

In [None]:
from neuroalign_preprocessing.loaders import QuestionnaireLoader

# Build the loader only when the questionnaire CSV is configured and present;
# otherwise loader is set to None, which the later cells test before running.
if not (QUESTIONNAIRE_CSV and Path(QUESTIONNAIRE_CSV).exists()):
    loader = None
    print("Cannot initialize loader - CSV not found")
else:
    loader = QuestionnaireLoader(QUESTIONNAIRE_CSV)
    print("QuestionnaireLoader initialized")
    print(f"  Path: {loader.questionnaire_path}")

## 5. Load and Clean Data

In [None]:
# Load the questionnaire through the loader with cleaning and subject-code
# standardization enabled, then print the first ten subject codes so their
# format can be checked by eye.
if loader:
    load_options = {"clean": True, "standardize_subject_codes": True}
    df = loader.load(**load_options)

    print(f"Loaded and cleaned: {len(df)} rows")
    print("\nSubject Code format check (first 10):")
    leading_codes = df['Subject Code'].head(10)
    print(leading_codes.tolist())

## 6. Explore Feature Categories

In [None]:
# Show defined feature categories
# For every feature list defined on the loader, print each feature marked
# [x] when it is a column of the loaded DataFrame `df` and [ ] when it is not.
if loader:
    # Loader attributes holding the feature lists, in display order.
    category_names = (
        "DEMOGRAPHIC_FEATURES",
        "MENTAL_HEALTH_FEATURES",
        "PERSONALITY_FEATURES",
        "LIFESTYLE_FEATURES",
        "SOCIOECONOMIC_FEATURES",
    )
    # With no DataFrame, every feature is reported as absent.
    present_columns = df.columns if df is not None else ()

    print("Feature categories defined in loader:")
    for category_name in category_names:
        category_features = getattr(loader, category_name)
        print(f"\n{category_name} ({len(category_features)}):")
        for feature in category_features:
            marker = '[x]' if feature in present_columns else '[ ]'
            print(f"  {marker} {feature}")

In [None]:
# Print what loader.get_feature_groups() returns: one header per group with
# its feature count, followed by the feature names.
if loader and loader.data is not None:
    feature_groups = loader.get_feature_groups()
    print("Available feature groups (from get_feature_groups):")
    for group_name, group_features in feature_groups.items():
        n_in_group = len(group_features)
        print(f"\n{group_name} ({n_in_group} features):")
        for feature_name in group_features:
            print(f"    {feature_name}")

In [None]:
# Get numeric vs categorical features
# Lists at most `preview_limit` names from each list and then states how many
# were left out.
if loader and loader.data is not None:
    numeric_features = loader.get_numeric_features()
    categorical_features = loader.get_categorical_features()

    preview_limit = 20  # maximum number of names printed per list

    # The second heading starts with a newline to separate the two listings.
    for heading, feature_names in (
        ("Numeric features", numeric_features),
        ("\nCategorical features", categorical_features),
    ):
        print(f"{heading} ({len(feature_names)}):")
        for feature_name in feature_names[:preview_limit]:
            print(f"  {feature_name}")
        n_hidden = len(feature_names) - preview_limit
        if n_hidden > 0:
            print(f"  ... and {n_hidden} more")

## 7. Data Quality Checks

In [None]:
if loader and loader.data is not None:
    df = loader.data

    print("Data Quality Checks:")
    print("=" * 60)

    # Count missing cells per column, sort worst-first, and add the same
    # figure as a percentage of all rows (two decimals).
    print("\n1. Missing values (top 20 columns):")
    null_counts = df.isna().sum()
    missing = null_counts.sort_values(ascending=False)
    missing_pct = (missing / len(df) * 100).round(2)
    missing_df = pd.DataFrame({'missing': missing, 'pct': missing_pct})
    worst_columns = missing_df.head(20)
    display(worst_columns)

In [None]:
# Report subject codes that occur on more than one row.
if loader and loader.data is not None:
    df = loader.data

    print("\n2. Duplicate subject codes:")
    code_counts = df['Subject Code'].value_counts()
    dup_subjects = code_counts[code_counts > 1]

    if len(dup_subjects) == 0:
        print("  OK: No duplicate subject codes")
    else:
        print(f"  WARNING: {len(dup_subjects)} subjects have multiple entries:")
        # Show at most the first ten offending codes with their row counts.
        display(dup_subjects.head(10))

In [None]:
# Check value ranges for key metrics
# For each scale below, print the observed min/max next to the expected range
# and warn about values outside it or entries that are not numbers.
if loader and loader.data is not None:
    df = loader.data

    print("\n3. Value ranges for mental health scales:")

    # Expected ranges
    expected_ranges = {
        'PHQ9': (0, 27),       # Depression scale
        'GAD7': (0, 21),       # Anxiety scale
        'OASIS': (0, 20),      # Overall Anxiety
        'PCL-5': (0, 80),      # PTSD checklist
        'PSQI': (0, 21),       # Sleep quality
        'Age': (18, 100),      # Reasonable age range
    }

    for col, (exp_min, exp_max) in expected_ranges.items():
        if col not in df.columns:
            print(f"  {col}: NOT FOUND in data")
            continue

        # Coerce to numbers first: a column holding text (object dtype) would
        # otherwise raise TypeError in the range comparisons below. Entries
        # that cannot be parsed become NaN and are counted separately.
        values = pd.to_numeric(df[col], errors='coerce')
        n_non_numeric = int((values.isna() & df[col].notna()).sum())

        actual_min = values.min()
        actual_max = values.max()
        # Comparisons with NaN are False, so missing values are never counted
        # as out of range.
        n_out_of_range = int(((values < exp_min) | (values > exp_max)).sum())

        problems = []
        if n_out_of_range:
            problems.append(f"{n_out_of_range} out of range")
        if n_non_numeric:
            problems.append(f"{n_non_numeric} non-numeric")
        status = f"WARNING ({', '.join(problems)})" if problems else "OK"

        print(f"  {col}: [{actual_min}, {actual_max}] expected [{exp_min}, {exp_max}] - {status}")

In [None]:
# Print the value distribution (count and share of all rows) for a fixed set
# of categorical columns; columns absent from the data are skipped.
if loader and loader.data is not None:
    df = loader.data

    print("\n4. Categorical value distributions:")

    categorical_cols = ['Gender', 'DominantHand', 'Marital Status', 'Education']
    n_rows = len(df)

    for col in (name for name in categorical_cols if name in df.columns):
        print(f"\n  {col}:")
        # dropna=False keeps missing values as their own entry in the counts.
        val_counts = df[col].value_counts(dropna=False)
        for val, count in val_counts.items():
            print(f"    {val}: {count} ({count / n_rows * 100:.1f}%)")

## 8. Summary Statistics for Mental Health Scales

In [None]:
if loader and loader.data is not None:
    df = loader.data

    # Keep only the loader's mental-health features that exist as columns.
    mental_health_cols = [
        feature for feature in loader.MENTAL_HEALTH_FEATURES
        if feature in df.columns
    ]

    if mental_health_cols:
        print("Mental Health Scale Statistics:")
        scale_stats = df[mental_health_cols].describe()
        display(scale_stats.round(2))

In [None]:
# Correlation matrix for mental health scales
# Draws a heatmap of pairwise correlations between the loader's mental-health
# features that are present as numeric columns.
if loader and loader.data is not None:
    import matplotlib.pyplot as plt

    df = loader.data
    mental_health_cols = [c for c in loader.MENTAL_HEALTH_FEATURES if c in df.columns]

    # Restrict to numeric columns before correlating: a text-typed column makes
    # corr() raise on current pandas, and on older pandas it is silently
    # dropped, which left the tick labels misaligned with the matrix.
    numeric_scales = df[mental_health_cols].select_dtypes(include='number')

    if numeric_scales.shape[1] >= 2:
        corr = numeric_scales.corr()
        # Take labels from the matrix itself so ticks always match its cells.
        labels = list(corr.columns)

        fig, ax = plt.subplots(figsize=(10, 8))
        # Pin the colour scale to the full correlation range [-1, 1].
        im = ax.imshow(corr, cmap='coolwarm', vmin=-1, vmax=1)

        ax.set_xticks(range(len(labels)))
        ax.set_yticks(range(len(labels)))
        ax.set_xticklabels(labels, rotation=45, ha='right')
        ax.set_yticklabels(labels)

        plt.colorbar(im, ax=ax, label='Correlation')
        plt.title('Mental Health Scale Correlations')
        plt.tight_layout()
        plt.show()

## 9. Participant Summary

In [None]:
# Get summary for a specific participant
# Prints loader.summarize_participant() for the first subject in the data,
# iterating the result as a mapping of category -> mapping of key -> value.
if loader and loader.data is not None:
    df = loader.data

    if len(df) == 0:
        # .iloc[0] raises IndexError on an empty frame; say so instead.
        print("No participants in questionnaire data - nothing to summarize")
    else:
        # Pick first participant
        test_subject = df['Subject Code'].iloc[0]
        print(f"Summary for participant: {test_subject}")

        summary = loader.summarize_participant(test_subject)

        for category, data in summary.items():
            print(f"\n{category.upper()}:")
            for key, value in data.items():
                print(f"  {key}: {value}")

## 10. Cohort Statistics

In [None]:
# Compute cohort statistics over every subject code in the loaded data and
# show them as a table rounded to two decimals.
if loader and loader.data is not None:
    df = loader.data

    all_subjects = df['Subject Code'].tolist()
    n_participants = len(all_subjects)
    stats = loader.get_cohort_statistics(all_subjects)

    print(f"Cohort statistics for {n_participants} participants:")

    # Build a DataFrame from the result and transpose it for display.
    stats_df = pd.DataFrame(stats).transpose()
    display(stats_df.round(2))

## 11. Merge with Sessions CSV

In [None]:
# Read the sessions CSV and merge it with the questionnaire data through the
# loader. Runs only when the loader has data and the sessions file exists.
if loader and loader.data is not None and SESSIONS_CSV and Path(SESSIONS_CSV).exists():
    # Identifier columns are read as strings rather than inferred dtypes.
    id_dtypes = {'subject_code': str, 'session_id': str}
    sessions_df = pd.read_csv(SESSIONS_CSV, dtype=id_dtypes)
    print(f"Sessions CSV: {len(sessions_df)} rows")

    merged = loader.merge_with_sessions(
        sessions_df,
        on='Subject Code',
        subject_col='subject_code',
    )
    print(f"Merged DataFrame: {len(merged)} rows")

    # Rows with a non-null 'Subject Code' count as matched, the rest as
    # unmatched.
    has_subject_code = merged['Subject Code'].notna()
    n_matched = has_subject_code.sum()
    n_unmatched = (~has_subject_code).sum()
    print(f"\nMatched: {n_matched} sessions")
    print(f"Unmatched: {n_unmatched} sessions")

    display(merged.head())

In [None]:
# List merged rows whose 'Subject Code' is null, with sample codes from both
# sides so a formatting mismatch is easy to spot. Runs only when `merged`
# exists in the notebook namespace and is not None.
if globals().get('merged') is not None:
    lacks_subject_code = merged['Subject Code'].isna()
    unmatched = merged[lacks_subject_code]
    if len(unmatched):
        print(f"Unmatched sessions ({len(unmatched)} total):")
        display(unmatched[['subject_code', 'session_id']].head(20))

        # Check if it's a subject code format issue
        session_samples = unmatched['subject_code'].head(5).tolist()
        print("\nSample session subject codes:")
        print(session_samples)

        questionnaire_samples = loader.data['Subject Code'].head(5).tolist()
        print("\nSample questionnaire subject codes:")
        print(questionnaire_samples)

## 12. Visualizations

In [None]:
# Histogram of the 'Age' column (30 bins); skipped when the column is absent.
if loader and loader.data is not None:
    import matplotlib.pyplot as plt

    df = loader.data

    if 'Age' in df.columns:
        fig, ax = plt.subplots(figsize=(10, 4))
        df['Age'].hist(bins=30, ax=ax)
        ax.set(xlabel='Age', ylabel='Count', title='Age Distribution')
        fig.tight_layout()
        plt.show()

In [None]:
# One histogram (20 bins) per available mental-health score, side by side.
if loader and loader.data is not None:
    import matplotlib.pyplot as plt

    df = loader.data
    mh_cols = [c for c in ['PHQ9', 'GAD7', 'OASIS', 'PCL-5'] if c in df.columns]

    if mh_cols:
        n_panels = len(mh_cols)
        # squeeze=False always returns a 2-D grid of axes, so a single panel
        # needs no special handling; row 0 holds every panel.
        fig, axes_grid = plt.subplots(
            1, n_panels, figsize=(4 * n_panels, 4), squeeze=False
        )
        axes = axes_grid[0]

        for ax, col in zip(axes, mh_cols):
            df[col].hist(bins=20, ax=ax)
            ax.set(xlabel=col, ylabel='Count', title=f'{col} Distribution')

        fig.tight_layout()
        plt.show()

## 13. Debug Specific Issues

Use this section to debug specific issues you encounter.

In [None]:
# Debug a specific subject
# Looks the code up exactly first; if nothing matches, falls back to a literal
# substring search over the 'Subject Code' column.
debug_subject = ""  # Fill in subject code

if debug_subject and loader and loader.data is not None:
    df = loader.data

    # Try to find the subject
    exact_match = df[df['Subject Code'] == debug_subject]

    if len(exact_match) > 0:
        print(f"Found exact match for: {debug_subject}")
        display(exact_match.T)  # Transpose for easier reading
    else:
        # Try partial match.
        # regex=False treats debug_subject as literal text. str.contains
        # interprets its pattern as a regular expression by default, so a code
        # containing '.', '+', '(' etc. would match the wrong rows or raise
        # re.error. na=False counts missing codes as non-matching.
        is_partial = df['Subject Code'].str.contains(
            debug_subject, na=False, regex=False
        )
        partial_matches = df[is_partial]
        if len(partial_matches) > 0:
            print(f"No exact match, but found {len(partial_matches)} partial matches:")
            print(partial_matches['Subject Code'].tolist())
        else:
            print(f"Subject {debug_subject} not found in questionnaire data")

In [None]:
# Check a specific column for issues
# Prints dtype and null counts for the column, then min/max/mean for numeric
# data or the ten most frequent values otherwise. When the column does not
# exist, suggests columns whose name contains the requested text.
debug_column = ""  # Fill in column name

if debug_column and loader and loader.data is not None:
    df = loader.data

    if debug_column in df.columns:
        column = df[debug_column]
        print(f"Column: {debug_column}")
        print(f"  Dtype: {column.dtype}")
        print(f"  Non-null: {column.notna().sum()}")
        print(f"  Null: {column.isna().sum()}")

        # Recognise every numeric dtype (int32, float32, nullable Int64, ...),
        # not only 'float64'/'int64'. Booleans are excluded so they keep being
        # summarised by value counts.
        is_numeric = (
            pd.api.types.is_numeric_dtype(column)
            and not pd.api.types.is_bool_dtype(column)
        )
        if is_numeric:
            print(f"  Min: {column.min()}")
            print(f"  Max: {column.max()}")
            print(f"  Mean: {column.mean():.2f}")
        else:
            print(f"  Unique values: {column.nunique()}")
            print("  Value counts:")
            print(column.value_counts().head(10).to_string())
    else:
        print(f"Column {debug_column} not found")
        # Find similar column names (case-insensitive substring match).
        # str(c) keeps this working when a column label is not a string.
        needle = debug_column.lower()
        similar = [c for c in df.columns if needle in str(c).lower()]
        if similar:
            print(f"Similar columns: {similar}")