# 03 · Quality report
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ORG/Fallrisk-gait/blob/main/datasets/fallrisk/notebooks/03_qc.ipynb)

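Compare the synthetic table against the seed cohort using `sdmetrics.reports.single_table.QualityReport`. When sdmetrics or pandas is not available (e.g. offline execution), fall back to a minimal report that scores each numeric column by its mean/std deltas and each categorical column by its distribution delta. Either way, the result is serialized to `datasets/fallrisk/reports/quality_report.json`. The report cells read `seed_rows` and `synth_rows`, which must already be loaded (as row mappings keyed by column name) before they run.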

In [None]:
import csv
import json
from collections import Counter
from pathlib import Path

def locate_repo_root(max_depth: int = 6) -> Path:
    """Find the repository root by walking upward from the working directory.

    A directory qualifies when it contains both a ``datasets`` and a ``data``
    entry. At most ``max_depth`` directories are inspected, starting with the
    current working directory itself.

    Returns:
        The first qualifying directory, or the current working directory when
        none is found within the limit or the filesystem root is reached.
    """
    markers = ('datasets', 'data')
    candidate = Path.cwd()
    for _level in range(max_depth):
        if all((candidate / marker).exists() for marker in markers):
            return candidate
        parent = candidate.parent
        if parent == candidate:
            # Reached the filesystem root; there is nothing further to inspect.
            break
        candidate = parent
    return Path.cwd()

# Resolve the project layout relative to the detected repository root.
ROOT = locate_repo_root()
DATA_DIR = ROOT.joinpath('data')
OUTPUT_DIR = ROOT.joinpath('datasets', 'fallrisk')
REPORT_DIR = OUTPUT_DIR.joinpath('reports')
# Create the report directory (and any missing parents) up front.
REPORT_DIR.mkdir(exist_ok=True, parents=True)


In [None]:
# Columns the fallback report compares via mean/std. Order is preserved
# exactly: measurements first, then the label and policy summary columns,
# then the policy B per-feature cutoff columns.
numeric_cols = [
    'age_years',
    'gait_speed_mps',
    'stride_length_m',
    'cadence_spm',
    'stride_time_var',
    'double_support_pct',
    'symmetry_index',
    'turn_time_s',
    'sit_to_stand_s',
    'stand_to_sit_s',
    'tug_seconds',
    'label_high_fall_risk',
    'policy_b_trigger_count',
    'policy_b_score',
    'policy_a_threshold_moderate',
    'policy_a_threshold_high',
    # policy_b_<feature>_cutoff_<percentile>: each feature contributes two
    # cutoff columns, using the percentile pair of its group.
    *(
        f'policy_b_{feature}_cutoff_{pct}'
        for pcts, features in (
            (
                ('p25', 'p10'),
                ('gait_speed_mps', 'stride_length_m', 'cadence_spm'),
            ),
            (
                ('p75', 'p90'),
                (
                    'stride_time_var',
                    'double_support_pct',
                    'symmetry_index',
                    'turn_time_s',
                    'sit_to_stand_s',
                    'stand_to_sit_s',
                ),
            ),
        )
        for feature in features
        for pct in pcts
    ),
]

# Columns the fallback report compares via their value distributions.
categorical_cols = (
    ['sex']
    # risk labels
    + ['policy_a_risk', 'policy_b_risk', 'fall_risk']
    # policy trigger columns
    + ['policy_a_trigger', 'policy_b_trigger']
    # policy B feature-hit columns
    + ['policy_b_high_feature_hits', 'policy_b_moderate_feature_hits']
)



In [None]:
# Probe for sdmetrics. Any failure while importing it (not only a missing
# package) simply marks it unavailable so later cells can use the fallback.
try:
    from sdmetrics.reports.single_table import QualityReport as SDVQualityReport  # type: ignore
except Exception:
    SDMETRICS_AVAILABLE = False
else:
    SDMETRICS_AVAILABLE = True
    print('sdmetrics detected. If dependencies are missing, fallback logic will still be used.')


In [None]:
def _mean_and_std(rows, column, label):
    """Return the population mean and standard deviation of one column.

    ``label`` only names the table ("seed" / "synthetic") in error messages.
    """
    try:
        values = [float(row[column]) for row in rows]
    except ValueError as exc:
        raise ValueError(
            f'Column {column!r} in the {label} rows contains a non-numeric value.'
        ) from exc
    mean = sum(values) / len(values)
    variance = sum((value - mean) ** 2 for value in values) / len(values)
    return mean, variance ** 0.5


def compute_numeric_report(seed=None, synth=None, columns=None):
    """Score how closely each numeric column's mean and std match the seed data.

    Every value is cast with ``float`` and the population mean and standard
    deviation of both tables are computed per column. The column score is
    ``max(0, 1 - 0.5 * (mean_delta + std_delta))``, where each delta is the
    absolute difference relative to the seed statistic.

    Args:
        seed: Seed rows as mappings keyed by column name. Defaults to the
            notebook-level ``seed_rows``.
        synth: Synthetic rows in the same format. Defaults to the
            notebook-level ``synth_rows``.
        columns: Column names to score. Defaults to ``numeric_cols``.

    Returns:
        A dict mapping each column name to a dict with the keys
        ``seed_mean``, ``synth_mean``, ``seed_std``, ``synth_std`` and
        ``score``.

    Raises:
        ValueError: If either table has no rows, or a cell cannot be parsed
            as a float.
        KeyError: If a row lacks one of the requested columns.
    """
    seed = seed_rows if seed is None else seed
    synth = synth_rows if synth is None else synth
    columns = numeric_cols if columns is None else columns

    # Means are undefined for an empty table; fail with a clear message
    # instead of a bare ZeroDivisionError.
    if len(seed) == 0 or len(synth) == 0:
        raise ValueError('Seed and synthetic tables must both contain at least one row.')

    report = {}
    for col in columns:
        seed_mean, seed_std = _mean_and_std(seed, col, 'seed')
        synth_mean, synth_std = _mean_and_std(synth, col, 'synthetic')
        # The small epsilon keeps the relative delta finite for a zero seed mean.
        mean_delta = abs(seed_mean - synth_mean) / (abs(seed_mean) + 1e-6)
        # A constant seed column (std == 0) contributes no std penalty.
        std_delta = abs(seed_std - synth_std) / (seed_std + 1e-6) if seed_std else 0.0
        report[col] = {
            'seed_mean': seed_mean,
            'synth_mean': synth_mean,
            'seed_std': seed_std,
            'synth_std': synth_std,
            'score': max(0.0, 1.0 - 0.5 * (mean_delta + std_delta)),
        }
    return report

def compute_categorical_report(seed=None, synth=None, columns=None):
    """Score how closely each categorical column's distribution matches the seed.

    For every column the value frequencies of both tables are compared; the
    score is ``max(0, 1 - 0.5 * sum(|p_seed - p_synth|))`` over all observed
    values, i.e. one minus the total variation distance.

    Args:
        seed: Seed rows as mappings keyed by column name. Defaults to the
            notebook-level ``seed_rows``.
        synth: Synthetic rows in the same format. Defaults to the
            notebook-level ``synth_rows``.
        columns: Column names to score. Defaults to ``categorical_cols``.

    Returns:
        A dict mapping each column name to a dict with the keys
        ``seed_distribution`` and ``synth_distribution`` (value -> count) and
        ``score``.

    Raises:
        ValueError: If either table has no rows.
        KeyError: If a row lacks one of the requested columns.
    """
    seed = seed_rows if seed is None else seed
    synth = synth_rows if synth is None else synth
    columns = categorical_cols if columns is None else columns

    seed_total = len(seed)
    synth_total = len(synth)
    # Proportions are undefined for an empty table; fail with a clear message
    # instead of a bare ZeroDivisionError.
    if seed_total == 0 or synth_total == 0:
        raise ValueError('Seed and synthetic tables must both contain at least one row.')

    report = {}
    for col in columns:
        seed_counts = Counter(row[col] for row in seed)
        synth_counts = Counter(row[col] for row in synth)
        # Visit values in first-seen order (seed first, then synthetic-only)
        # so the float sum does not depend on set/hash ordering.
        values = dict.fromkeys([*seed_counts, *synth_counts])
        delta = sum(
            abs(seed_counts[value] / seed_total - synth_counts[value] / synth_total)
            for value in values
        )
        report[col] = {
            # Plain dicts keep the payload free of Counter-specific behaviour.
            'seed_distribution': dict(seed_counts),
            'synth_distribution': dict(synth_counts),
            'score': max(0.0, 1.0 - 0.5 * delta),
        }
    return report


In [None]:
# Build the quality report. sdmetrics is used when it and pandas import
# cleanly and the report can actually be generated; in every other case the
# lightweight per-column implementation produces the score.
quality_report = None
fallback_payload = None
overall_score = None

if SDMETRICS_AVAILABLE:
    print('Using sdmetrics QualityReport for diagnostics.')
    try:
        import pandas as pd  # type: ignore
    except Exception:
        pd = None
    if pd is None:
        print('pandas not available; falling back to lightweight implementation.')
        SDMETRICS_AVAILABLE = False
    else:
        # QualityReport.generate() takes the table metadata as a required
        # argument. Describe exactly the columns listed in numeric_cols and
        # categorical_cols, and restrict both frames to those columns so the
        # data and the metadata agree.
        report_cols = list(numeric_cols) + list(categorical_cols)
        sdv_metadata = {
            'columns': {
                **{col: {'sdtype': 'numerical'} for col in numeric_cols},
                **{col: {'sdtype': 'categorical'} for col in categorical_cols},
            }
        }
        try:
            seed_df = pd.DataFrame(seed_rows, columns=report_cols)
            synth_df = pd.DataFrame(synth_rows, columns=report_cols)
            # The fallback casts these columns with float(), so the rows may
            # carry text values; convert them before handing them to sdmetrics.
            seed_df[numeric_cols] = seed_df[numeric_cols].apply(pd.to_numeric)
            synth_df[numeric_cols] = synth_df[numeric_cols].apply(pd.to_numeric)
            quality_report = SDVQualityReport()
            quality_report.generate(
                real_data=seed_df,
                synthetic_data=synth_df,
                metadata=sdv_metadata,
            )
        except Exception as exc:
            # Best-effort by design: a failing sdmetrics run must not stop the
            # notebook, so report the reason and use the fallback instead.
            print(f'sdmetrics report failed ({exc!r}); falling back to lightweight implementation.')
            quality_report = None
            SDMETRICS_AVAILABLE = False
        else:
            try:
                overall_score = quality_report.get_score()
            except Exception:
                overall_score = None

if not SDMETRICS_AVAILABLE:
    numeric_report = compute_numeric_report()
    categorical_report = compute_categorical_report()
    numeric_scores = [info['score'] for info in numeric_report.values()]
    categorical_scores = [info['score'] for info in categorical_report.values()]
    # Unweighted mean over every scored column, numeric and categorical alike.
    overall_score = sum(numeric_scores + categorical_scores) / (len(numeric_scores) + len(categorical_scores))
    fallback_payload = {'numeric': numeric_report, 'categorical': categorical_report, 'overall_score': overall_score}
    print('Fallback quality score used.')
print(f'Overall quality score: {overall_score:.3f}' if overall_score is not None else 'Overall score unavailable from sdmetrics.')


In [None]:
# Destination file for the serialized quality report.
report_path = REPORT_DIR.joinpath('quality_report.json')

def _to_serializable(obj):
    if isinstance(obj, (str, int, float, bool)) or obj is None:
        return obj
    if isinstance(obj, dict):
        return {str(k): _to_serializable(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple, set)):
        return [_to_serializable(v) for v in obj]
    if hasattr(obj, "tolist"):
        try:
            return obj.tolist()
        except Exception:
            pass
    return str(obj)

def _report_payload_from(report):
    """Return the first non-empty result exposed by ``report``, or None.

    The accessors are probed in order and called without arguments; one that
    is missing, fails, or returns nothing is skipped. Results offering
    ``to_dict`` (e.g. a pandas DataFrame) are converted to plain records so
    they serialize as data rather than as a string dump.
    """
    for attr in ("to_dict", "get_details", "get_results", "get_properties"):
        accessor = getattr(report, attr, None)
        if accessor is None:
            continue
        try:
            candidate = accessor()
        except Exception:
            # e.g. the accessor needs arguments this probe does not supply.
            continue
        if candidate is None:
            continue
        if not isinstance(candidate, dict) and hasattr(candidate, "to_dict"):
            try:
                candidate = candidate.to_dict(orient="records")
            except Exception:
                continue
        try:
            has_content = bool(candidate)
        except ValueError:
            # Array-like containers refuse truth-testing; judge by size instead.
            has_content = getattr(candidate, "size", 1) > 0
        if has_content:
            return candidate
    return None


# Pick what to serialize: the fallback report when it was computed, otherwise
# whatever the sdmetrics report exposes, otherwise just the overall score.
if fallback_payload is not None:
    payload = fallback_payload
elif quality_report is not None:
    payload = _report_payload_from(quality_report)
    if payload is None:
        payload = {"overall_score": overall_score}
    elif not isinstance(payload, dict):
        # Keep the file a JSON object that always carries the overall score.
        payload = {"overall_score": overall_score, "details": payload}
else:
    payload = {"overall_score": overall_score}

with report_path.open("w", encoding="utf-8") as f:
    # _to_serializable handles values json cannot encode natively.
    json.dump(payload, f, indent=2, default=_to_serializable)

print(f'Report serialized to {report_path.resolve()}')
if fallback_payload is not None:
    print('Numeric sample:', list(fallback_payload['numeric'].items())[:2])
