# 03 · Quality report
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Fallrisk-gait/Fallrisk-gait/blob/main/datasets/fallrisk/notebooks/03_qc.ipynb)

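Compare the synthetic table against the seed cohort using `sdmetrics.reports.single_table.QualityReport`. When sdmetrics (or pandas, which the sdmetrics path needs) is not installed (e.g. offline execution), fall back to a minimal report that scores each numeric column by its mean/std deltas and each categorical column by its distribution delta. In both cases the result is serialized to `reports/quality_report.json` and `reports/quality_report.pkl`.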

In [None]:
from pathlib import Path
import csv
import json
import math
import random

def locate_repo_root(max_depth: int = 6) -> Path:
    """Find the repository root by walking up from the working directory.

    A directory counts as the root when it contains both a ``datasets`` and a
    ``data`` entry.  At most ``max_depth`` directories are inspected, starting
    with the current one, and the walk never goes past the filesystem root.

    Returns:
        The first matching directory, or the current working directory when
        none of the inspected directories matches.
    """
    start = Path.cwd()
    # The current directory followed by its ancestors, nearest first.
    lineage = (start, *start.parents)
    for folder in lineage[:max(max_depth, 0)]:
        if all((folder / marker).exists() for marker in ('datasets', 'data')):
            return folder
    return Path.cwd()


# Resolve every project location once, relative to the detected repository root.
ROOT = locate_repo_root()
OUTPUT_DIR = ROOT.joinpath('datasets', 'fallrisk')
DATA_DIR = ROOT.joinpath('data')
REPORT_DIR = OUTPUT_DIR.joinpath('reports')
SCHEMA_PATH = OUTPUT_DIR.joinpath('schema.json')

# Make sure the report directory exists before anything is written into it.
REPORT_DIR.mkdir(parents=True, exist_ok=True)

# Column schema of the fall-risk table; it drives the sdmetrics metadata and
# the dtype coercion performed before the sdmetrics report is generated.
SCHEMA = json.loads(SCHEMA_PATH.read_text())

def build_sdmetrics_metadata(schema: dict) -> dict:
    """Translate the project schema into an sdmetrics-style metadata dict.

    Every entry of ``schema['fields']`` becomes a column whose ``sdtype`` is
    looked up from the field's ``type``; a missing or unrecognised type is
    treated as ``'categorical'``.  A truthy ``schema['primary_key']`` is
    copied into the result; otherwise the key is left out.

    Returns:
        A dict with ``'columns'``, an empty ``'constraints'`` list and,
        optionally, ``'primary_key'``.
    """
    sdtype_by_field_type = {
        'float': 'numerical',
        'integer': 'numerical',
        'categorical': 'categorical',
        'binary': 'boolean',
        'string': 'id',
    }
    columns = {
        name: {'sdtype': sdtype_by_field_type.get(spec.get('type'), 'categorical')}
        for name, spec in schema.get('fields', {}).items()
    }
    metadata = {'columns': columns, 'constraints': []}
    key = schema.get('primary_key')
    return {**metadata, 'primary_key': key} if key else metadata


# Metadata derived from the schema; it is passed to QualityReport.generate()
# in the scoring cell and stored alongside the report in the pickle output.
SDMETRICS_METADATA = build_sdmetrics_metadata(SCHEMA)


In [None]:
seed_path = DATA_DIR / 'seed_fallrisk.csv'
synthetic_path = OUTPUT_DIR / 'fallrisk_tabular_v1.csv'

# Load both tables as lists of {column: string} dicts.  The csv module asks
# for files opened with newline='' so that line breaks embedded in quoted
# fields are parsed by the reader instead of being translated by the text
# layer first.
with seed_path.open(newline='') as f:
    seed_rows = list(csv.DictReader(f))
with synthetic_path.open(newline='') as f:
    synth_rows = list(csv.DictReader(f))

# The first seed row has no 'bmi' column, which this notebook treats as the
# sign of a raw seed file: derive the tabular feature set from it so that the
# seed can be compared column-by-column with the synthetic table.
if seed_rows and 'bmi' not in seed_rows[0]:
    print('Raw seed rows detected; deriving tabular features for quality comparison.')

    def clamp(value: float, lower: float, upper: float) -> float:
        """Return ``value`` limited to the closed interval [lower, upper]."""
        return max(lower, min(upper, value))

    def logistic(x: float) -> float:
        """Return the logistic sigmoid ``1 / (1 + exp(-x))``.

        NOTE(review): not called anywhere in the visible part of this file.
        """
        return 1.0 / (1.0 + math.exp(-x))

    def derive_tabular_seed(rows):
        """Build one tabular feature record per raw seed row.

        Args:
            rows: Iterable of dict-like raw rows (here: the ``csv.DictReader``
                output in ``seed_rows``).  The columns read are
                ``participant_id``, ``age_years``, ``sex``, ``gait_speed_mps``,
                ``stride_length_m``, ``double_support_pct``, ``symmetry_index``,
                ``tug_seconds`` and ``fall_risk``; each has a default that is
                used only when the key is absent from the row.

        Returns:
            A list of dicts holding the derived feature columns (without the
            ``tug_seconds`` / ``label_*`` columns, which ``apply_label_policy``
            adds afterwards).

        Most features are drawn from the module-level ``random`` generator, so
        the output depends on the generator state and on the order in which the
        draws below are made; do not reorder them if results must stay
        reproducible under a fixed seed.
        """
        derived = []
        for base in rows:
            # Raw measurements; float() converts the CSV strings.
            age = float(base.get('age_years', 72.0))
            sex = base.get('sex', 'Female')
            # Any value other than the two expected labels is mapped to 'Female'.
            if sex not in {'Male', 'Female'}:
                sex = 'Female'
            gait = clamp(float(base.get('gait_speed_mps', 1.0)), 0.45, 1.55)
            # Stride is read in metres and stored in centimetres.
            stride_cm = clamp(float(base.get('stride_length_m', 1.2)) * 100.0, 85.0, 150.0)
            double_support = float(base.get('double_support_pct', 28.0))
            symmetry = float(base.get('symmetry_index', 0.12))
            tug_seed = float(base.get('tug_seconds', 11.0))
            risk_level = base.get('fall_risk', 'moderate')

            # Past falls (0-3): sampled with weights chosen by the raw risk
            # level; anything other than 'high'/'moderate' uses the last set.
            fall_options = [0, 1, 2, 3]
            if risk_level == 'high':
                fall_weights = [0.12, 0.44, 0.28, 0.16]
            elif risk_level == 'moderate':
                fall_weights = [0.58, 0.28, 0.11, 0.03]
            else:
                fall_weights = [0.85, 0.13, 0.02, 0.0]
            falls = random.choices(fall_options, weights=fall_weights, k=1)[0]
            # A slow TUG or long double support adds one fall with probability 0.35.
            if tug_seed > 13.0 or double_support > 34.0:
                if falls < 3 and random.random() < 0.35:
                    falls += 1
            falls = int(clamp(falls, 0, 3))

            # Assistive device: the slower the gait / longer the double support,
            # the more likely a device (and a walker rather than a cane).
            if gait < 0.75 or double_support > 36.0:
                assistive = 'Walker' if random.random() < 0.55 else 'Cane'
            elif gait < 0.9 or double_support > 32.0:
                assistive = 'Cane' if random.random() < 0.65 else 'None'
            elif gait < 1.0 or double_support > 28.0:
                assistive = 'Cane' if random.random() < 0.28 else 'None'
            else:
                assistive = 'None'

            # Each remaining feature is a Gaussian draw around a linear mean
            # built from the values above, then clamped to a fixed range.
            bmi_mu = 27.2 + 0.09 * (age - 72) - 1.5 * (gait - 1.0) + (0.6 if sex == 'Female' else 0.0)
            bmi = clamp(random.gauss(bmi_mu, 1.25), 18.0, 39.5)

            bp_mu = 128.0 + 0.9 * (age - 70) - 5.0 * (gait - 1.0) + 0.06 * (double_support - 28)
            systolic = clamp(random.gauss(bp_mu, 9.5), 95.0, 190.0)

            sway_mu = 1.6 + 0.05 * (double_support - 28) + 2.6 * symmetry + 0.8 * max(0.0, 0.9 - gait)
            sway = clamp(random.gauss(sway_mu, 0.25), 0.4, 6.5)

            med_mu = 3.0 + 0.12 * (age - 70) + 0.4 * max(0.0, double_support - 30) / 5 + 0.35 * falls
            if assistive != 'None':
                med_mu += 0.6
            medication = int(round(clamp(random.gauss(med_mu, 1.1), 0, 11)))

            # Depends on the medication count drawn just above.
            chronic_mu = 1.5 + 0.07 * (age - 70) + 0.4 * (medication - 3) + 0.3 * falls
            chronic = int(round(clamp(random.gauss(chronic_mu, 0.8), 0, 7)))

            dual_mu = 9.0 + 0.65 * (tug_seed - 9.5) + 0.09 * (double_support - 28) + 0.35 * falls
            if assistive != 'None':
                dual_mu += 2.4
            dual_task = clamp(random.gauss(dual_mu, 2.4), 0.0, 45.0)

            fear_mu = 9.0 + 1.05 * (tug_seed - 9.5) + 1.5 * falls
            if assistive != 'None':
                fear_mu += 2.0
            fear = int(round(clamp(random.gauss(fear_mu, 2.5), 0, 28)))

            strength_mu = 69.0 + 12.0 * (gait - 1.0) - 0.45 * (age - 70) - 6.0 * max(0.0, symmetry - 0.12)
            strength = clamp(random.gauss(strength_mu, 3.5), 40.0, 95.0)

            reaction_mu = 620.0 - 65.0 * (gait - 1.0) + 20.0 * (tug_seed - 10.0) + 14.0 * falls + 3.0 * max(0.0, double_support - 28)
            reaction = clamp(random.gauss(reaction_mu, 28.0), 360.0, 920.0)

            # Note: the stored age is clamped to [55, 95]; the means above used
            # the unclamped value.
            record = {
                'participant_id': base.get('participant_id', 'SEED_00000'),
                'age_years': round(clamp(age, 55.0, 95.0), 1),
                'sex': sex,
                'bmi': round(bmi, 1),
                'systolic_bp': round(systolic, 1),
                'gait_speed_m_s': round(gait, 3),
                'stride_length_cm': round(stride_cm, 1),
                'postural_sway_cm': round(sway, 3),
                'medication_count': medication,
                'chronic_conditions': chronic,
                'past_falls_6mo': falls,
                'assistive_device': assistive,
                'dual_task_cost_percent': round(dual_task, 2),
                'fear_of_falling_score': fear,
                'muscle_strength_score': round(strength, 1),
                'reaction_time_ms': round(reaction, 1),
            }
            derived.append(record)
        return derived

    def apply_label_policy(record: dict) -> dict:
        """Add ``tug_seconds`` and the two risk labels to a derived record.

        ``tug_seconds`` is a linear combination of the record's features plus
        one Gaussian noise draw (sd 1.1), limited to [6.2, 37.0].  The record is
        'high' risk when any of the listed thresholds is met, otherwise
        'moderate' or 'low' depending on a second set of thresholds.

        The record is modified in place and the same dict is returned.
        """
        tug = (8.5 + 0.065 * (record['age_years'] - 65) + 0.085 * (record['bmi'] - 26)
               - 2.2 * (record['gait_speed_m_s'] - 1.0) + 0.2 * max(0, record['postural_sway_cm'] - 2.4)
               + 0.065 * record['dual_task_cost_percent'] + 0.042 * record['fear_of_falling_score']
               + 0.5 * record['past_falls_6mo'] + (2.1 if record['assistive_device'] != 'None' else 0))
        tug += random.gauss(0, 1.1)
        tug = max(6.2, min(37.0, tug))
        # Only the stored value is rounded; the thresholds below use the
        # unrounded `tug`.
        record['tug_seconds'] = round(tug, 3)
        high = int(tug >= 13.5 or record['gait_speed_m_s'] < 0.8 or record['past_falls_6mo'] >= 1
                   or record['dual_task_cost_percent'] >= 22 or record['assistive_device'] != 'None')
        record['label_high_fall_risk'] = high
        if high:
            record['label_risk_level'] = 'high'
        else:
            moderate = (tug >= 11.2 or record['fear_of_falling_score'] >= 16
                        or record['medication_count'] >= 6 or record['chronic_conditions'] >= 3)
            record['label_risk_level'] = 'moderate' if moderate else 'low'
        return record

    # Fixed seed so the derived seed features are identical on every run.
    random.seed(99)
    engineered_rows = derive_tabular_seed(seed_rows)
    # Replace the raw rows with the labelled, derived records.
    seed_rows = [apply_label_policy(row) for row in engineered_rows]
    # These columns are already built as int above; the casts keep the types
    # explicit.
    for col in ('medication_count', 'chronic_conditions', 'past_falls_6mo', 'fear_of_falling_score', 'label_high_fall_risk'):
        for row in seed_rows:
            row[col] = int(row[col])
    for row in seed_rows:
        row['participant_id'] = str(row['participant_id'])
    # NOTE(review): after this branch the seed values are Python ints/floats,
    # whereas synth_rows still holds the strings read by csv.DictReader, so
    # downstream comparisons must not rely on both sides sharing a type.
    print(f'Derived {len(seed_rows)} seed feature rows.')

print(f'Loaded {len(seed_rows)} seed rows and {len(synth_rows)} synthetic rows')

# Columns compared through their mean / standard deviation in the fallback report.
numeric_cols = [
    'age_years',
    'bmi',
    'systolic_bp',
    'gait_speed_m_s',
    'stride_length_cm',
    'postural_sway_cm',
    'medication_count',
    'chronic_conditions',
    'past_falls_6mo',
    'dual_task_cost_percent',
    'fear_of_falling_score',
    'muscle_strength_score',
    'reaction_time_ms',
    'tug_seconds',
]

# Columns compared through their category frequencies in the fallback report.
categorical_cols = [
    'sex',
    'assistive_device',
    'label_high_fall_risk',
    'label_risk_level',
]


In [None]:
# Probe for sdmetrics.  Any failure while importing it (not only a missing
# package) switches the notebook to the lightweight fallback report.
try:
    from sdmetrics.reports.single_table import QualityReport as SDVQualityReport  # type: ignore
except Exception:
    SDMETRICS_AVAILABLE = False
else:
    SDMETRICS_AVAILABLE = True
    print('sdmetrics detected. If dependencies are missing, fallback logic will still be used.')


In [None]:
def compute_numeric_report(seed=None, synth=None, columns=None):
    """Score how closely each numeric column of the synthetic table tracks the seed.

    For every column the population mean and standard deviation of both tables
    are computed; the score is ``1 - 0.5 * (mean_delta + std_delta)`` floored
    at 0, where both deltas are relative to the seed statistics.

    Args:
        seed: Seed rows, each a mapping of column name to a value accepted by
            ``float()``.  Defaults to the notebook-level ``seed_rows``.
        synth: Synthetic rows in the same format.  Defaults to the
            notebook-level ``synth_rows``.
        columns: Names of the columns to score.  Defaults to the
            notebook-level ``numeric_cols``.

    Returns:
        ``{column: {'seed_mean', 'synth_mean', 'seed_std', 'synth_std',
        'score'}}``.

    Raises:
        ValueError: If either table is empty (the statistics are undefined),
            or if a value cannot be converted by ``float()``.
        KeyError: If a row lacks one of the requested columns.
    """
    seed = seed_rows if seed is None else seed
    synth = synth_rows if synth is None else synth
    columns = numeric_cols if columns is None else columns
    if not seed or not synth:
        # Fail with a clear message instead of a bare ZeroDivisionError below.
        raise ValueError('compute_numeric_report needs at least one seed row and one synthetic row')

    def _mean_and_std(values):
        # Population (not sample) standard deviation: divide by len(values).
        mean = sum(values) / len(values)
        variance = sum((v - mean) ** 2 for v in values) / len(values)
        return mean, variance ** 0.5

    report = {}
    for col in columns:
        seed_mean, seed_std = _mean_and_std([float(r[col]) for r in seed])
        synth_mean, synth_std = _mean_and_std([float(r[col]) for r in synth])
        # The small epsilon keeps the relative deltas finite for a zero mean/std.
        mean_delta = abs(seed_mean - synth_mean) / (abs(seed_mean) + 1e-6)
        # A constant seed column carries no spread to compare against.
        std_delta = abs(seed_std - synth_std) / (seed_std + 1e-6) if seed_std else 0.0
        score = max(0.0, 1.0 - 0.5 * (mean_delta + std_delta))
        report[col] = {
            'seed_mean': seed_mean,
            'synth_mean': synth_mean,
            'seed_std': seed_std,
            'synth_std': synth_std,
            'score': score,
        }
    return report

def compute_categorical_report(seed=None, synth=None, columns=None):
    """Score how closely each categorical column's frequencies match the seed.

    The score of a column is ``1 - 0.5 * sum(|p_seed - p_synth|)`` over all
    categories seen in either table (one minus the total variation distance),
    floored at 0.

    Category values are compared by their ``str()`` form.  The synthetic table
    is read from CSV, so its values are strings, while a derived seed table
    holds e.g. ``label_high_fall_risk`` as ``int``; without normalisation
    ``1`` and ``'1'`` would count as different categories and identical
    distributions would score 0.

    Args:
        seed: Seed rows, each a mapping of column name to value.  Defaults to
            the notebook-level ``seed_rows``.
        synth: Synthetic rows in the same format.  Defaults to the
            notebook-level ``synth_rows``.
        columns: Names of the columns to score.  Defaults to the
            notebook-level ``categorical_cols``.

    Returns:
        ``{column: {'seed_distribution', 'synth_distribution', 'score'}}`` where
        the distributions map the string form of each category to its count.

    Raises:
        ValueError: If either table is empty (proportions are undefined).
        KeyError: If a row lacks one of the requested columns.
    """
    seed = seed_rows if seed is None else seed
    synth = synth_rows if synth is None else synth
    columns = categorical_cols if columns is None else columns
    if not seed or not synth:
        # Fail with a clear message instead of a bare ZeroDivisionError below.
        raise ValueError('compute_categorical_report needs at least one seed row and one synthetic row')

    def _frequencies(rows, col):
        counts = {}
        for row in rows:
            key = str(row[col])  # normalise so 1 and '1' are the same category
            counts[key] = counts.get(key, 0) + 1
        return counts

    report = {}
    for col in columns:
        seed_counts = _frequencies(seed, col)
        synth_counts = _frequencies(synth, col)
        # Sorted for a deterministic summation order.
        delta = sum(
            abs(seed_counts.get(key, 0) / len(seed) - synth_counts.get(key, 0) / len(synth))
            for key in sorted(set(seed_counts) | set(synth_counts))
        )
        report[col] = {
            'seed_distribution': seed_counts,
            'synth_distribution': synth_counts,
            'score': max(0.0, 1.0 - 0.5 * delta),
        }
    return report


In [None]:
# Results consumed by the serialization cell.  `quality_report` is set only
# when sdmetrics actually generated a report; `fallback_payload` is set only
# when the lightweight implementation was used.
quality_report = None
fallback_payload = None

if SDMETRICS_AVAILABLE:
    print('Using sdmetrics QualityReport for diagnostics.')
    try:
        import pandas as pd  # type: ignore
    except Exception:
        pd = None
    if pd is None:
        # The sdmetrics path below feeds it pandas DataFrames, so it cannot
        # run without pandas.  `quality_report` stays None so that no empty,
        # un-generated report object is pickled later.
        print('pandas not available; falling back to lightweight implementation.')
        SDMETRICS_AVAILABLE = False
    else:
        quality_report = SDVQualityReport()
        seed_df = pd.DataFrame(seed_rows)
        synth_df = pd.DataFrame(synth_rows)
        # Give both frames the dtypes declared in the schema; CSV-read values
        # arrive as strings.
        for column, field in SCHEMA.get('fields', {}).items():
            field_type = field.get('type')
            if field_type in {'float', 'integer', 'binary'}:
                # Unparseable values become NaN instead of raising.
                seed_df[column] = pd.to_numeric(seed_df[column], errors='coerce')
                synth_df[column] = pd.to_numeric(synth_df[column], errors='coerce')
                if field_type == 'binary':
                    # Nullable integer dtype, so coerced NaNs stay representable.
                    seed_df[column] = seed_df[column].astype('Int64')
                    synth_df[column] = synth_df[column].astype('Int64')
            elif field_type == 'categorical':
                seed_df[column] = seed_df[column].astype('category')
                synth_df[column] = synth_df[column].astype('category')
            else:
                seed_df[column] = seed_df[column].astype(str)
                synth_df[column] = synth_df[column].astype(str)
        quality_report.generate(
            real_data=seed_df,
            synthetic_data=synth_df,
            metadata=SDMETRICS_METADATA,
        )
        try:
            overall_score = quality_report.get_score()
        except Exception:
            overall_score = None

if not SDMETRICS_AVAILABLE:
    # Lightweight report: average of all per-column scores.
    numeric_report = compute_numeric_report()
    categorical_report = compute_categorical_report()
    numeric_scores = [info['score'] for info in numeric_report.values()]
    categorical_scores = [info['score'] for info in categorical_report.values()]
    overall_score = sum(numeric_scores + categorical_scores) / (len(numeric_scores) + len(categorical_scores))
    fallback_payload = {'numeric': numeric_report, 'categorical': categorical_report, 'overall_score': overall_score}
    print('Fallback quality score used.')
print(f'Overall quality score: {overall_score:.3f}' if overall_score is not None else 'Overall score unavailable from sdmetrics.')


In [None]:
# Output locations: the JSON report and its pickle counterpart, both in REPORT_DIR.
report_path = REPORT_DIR / 'quality_report.json'
pkl_path = REPORT_DIR / 'quality_report.pkl'


def _to_serializable(obj):
    if isinstance(obj, (str, int, float, bool)) or obj is None:
        return obj
    if isinstance(obj, dict):
        return {str(k): _to_serializable(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple, set)):
        return [_to_serializable(v) for v in obj]
    if hasattr(obj, "tolist"):
        try:
            return obj.tolist()
        except Exception:
            pass
    return str(obj)

# Choose the JSON payload.  The fallback payload wins whenever it exists: it is
# only produced when sdmetrics did not generate the report.
if fallback_payload is not None:
    payload = fallback_payload
elif quality_report is not None:
    payload = None
    # Best-effort probing of accessor names; a missing or failing accessor is
    # skipped and the next one is tried.
    for attr in ("to_dict", "get_details", "get_results"):
        accessor = getattr(quality_report, attr, None)
        if accessor is None:
            continue
        try:
            candidate = accessor()
        except Exception:
            continue
        if candidate is None:
            continue
        try:
            is_empty = not candidate
        except ValueError:
            # Array-like results (e.g. pandas / numpy containers) refuse truth
            # testing; treat them as usable rather than crashing here.
            is_empty = False
        if not is_empty:
            payload = candidate
            break
    if payload is None:
        payload = {"overall_score": overall_score}
else:
    payload = {"overall_score": overall_score}

with report_path.open("w") as f:
    json.dump(payload, f, indent=2, default=_to_serializable)

# Pickle the sdmetrics report only when it is what was actually used.  If the
# fallback produced the payload, any report object still bound at this point
# was never generated, so the pickle mirrors the JSON payload instead.
if quality_report is not None and fallback_payload is None:
    to_pickle = {
        'quality_report': quality_report,
        'metadata': SDMETRICS_METADATA,
        'overall_score': overall_score,
    }
else:
    to_pickle = payload

import pickle
with pkl_path.open('wb') as f:
    pickle.dump(to_pickle, f)

print(f'Report serialized to {report_path.resolve()}')
print(f'Pickle serialized to {pkl_path.resolve()}')
if fallback_payload is not None:
    print('Numeric sample:', list(fallback_payload['numeric'].items())[:2])
