# 02 · Gaussian copula synthesis
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Fallrisk-gait/Fallrisk-gait/blob/main/datasets/fallrisk/notebooks/02_synthesize.ipynb)

Fit an SDV `GaussianCopulaSynthesizer` (or a compatible fallback when the library is unavailable), sample 50k synthetic rows, recompute derived fields and labels, and persist both `fallrisk_tabular_v1.csv` and the 1k preview subset.

In [1]:
%pip -q install -U pandas numpy scikit-learn matplotlib sdv>=1.7,<2 sdmetrics>=0.13,<0.14
from pathlib import Path
import csv
import math
import random
from typing import List

def locate_repo_root(max_depth: int = 6) -> Path:
    """Find the repository root by walking upwards from the working directory.

    A directory counts as the root when it contains both a ``datasets`` and a
    ``data`` entry. At most ``max_depth`` directories are inspected, starting
    with the current working directory itself.

    Returns:
        The first matching directory, or the current working directory when
        no match is found within ``max_depth`` levels or the filesystem root
        is reached first.
    """
    probe = Path.cwd()
    for _level in range(max_depth):
        has_markers = all((probe / name).exists() for name in ('datasets', 'data'))
        if has_markers:
            return probe
        parent = probe.parent
        if parent == probe:
            # Reached the filesystem root; nothing further up to inspect.
            break
        probe = parent
    return Path.cwd()

# Resolve the repository-relative input and output locations once.
ROOT = locate_repo_root()
DATA_DIR = ROOT / 'data'
OUTPUT_DIR = ROOT / 'datasets' / 'fallrisk'
# Create the output directory up front; no error if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Seed the stdlib `random` generator, which drives every random draw made
# directly by this notebook's own code.
random.seed(99)


In [2]:
# Load the raw seed table; every value is kept as the string read from disk.
seed_path = DATA_DIR / 'seed_fallrisk.csv'
# newline='' lets the csv module do its own line-ending handling, as its
# documentation requires for files passed to csv readers.
with seed_path.open(newline='') as f:
    raw_seed_rows = list(csv.DictReader(f))
print(f'Loaded {len(raw_seed_rows)} raw seed rows from {seed_path.resolve()}')

# Numeric columns handed to the lightweight fallback synthesizer.
# `past_falls_6mo`, `sex` and `assistive_device` are not listed here; the
# fallback path re-derives them after sampling.
continuous_cols = [
    'age_years','bmi','systolic_bp','gait_speed_m_s','stride_length_cm','postural_sway_cm',
    'medication_count','chronic_conditions','dual_task_cost_percent','fear_of_falling_score',
    'muscle_strength_score','reaction_time_ms'
]

# Probe for the optional pandas/SDV stack.
#
# The annotation is a string and sits outside the `try` on purpose: a live
# `Exception | None` expression raises TypeError on interpreters older than
# 3.10, and inside the `try` that error would be misreported as "SDV missing".
SDV_IMPORT_ERROR: "Exception | None" = None
try:
    import pandas as pd  # type: ignore
    from sdv.metadata import SingleTableMetadata  # type: ignore
    from sdv.single_table import GaussianCopulaSynthesizer as SDVGaussianCopulaSynthesizer  # type: ignore
except Exception as exc:  # pragma: no cover - best effort import logging
    # Any import-time failure (missing package, broken binary wheel, ...)
    # switches the notebook to the dependency-free fallback synthesizer.
    SDV_AVAILABLE = False
    SDV_IMPORT_ERROR = exc
    pd = None  # type: ignore
    SingleTableMetadata = None  # type: ignore
    SDVGaussianCopulaSynthesizer = None  # type: ignore
else:
    SDV_AVAILABLE = True

def clamp(value: float, lower: float, upper: float) -> float:
    """Limit ``value`` to the closed interval ``[lower, upper]``.

    The upper bound is applied first and the lower bound second, so the lower
    bound wins if the bounds are inverted.
    """
    capped = value if value < upper else upper
    return capped if capped > lower else lower

def derive_tabular_seed(rows: List[dict]) -> List[dict]:
    """Expand raw gait seed rows into the engineered tabular feature set.

    Each input row must provide ``participant_id``, ``age_years``,
    ``gait_speed_mps``, ``stride_length_m``, ``double_support_pct``,
    ``symmetry_index`` and ``tug_seconds``; ``sex`` and ``fall_risk`` are
    optional and default to ``'Female'`` and ``'moderate'``.

    The remaining features are drawn from the module-level ``random``
    generator, conditioned on the raw inputs. The output therefore depends on
    the generator's state and on the order of the draws below.

    Returns:
        One new dict per input row, holding the engineered feature columns
        (no TUG or label fields).
    """
    derived: List[dict] = []
    for base in rows:
        # Parse the raw inputs; gait speed and stride length are clamped, and
        # stride is converted from metres to centimetres.
        age = float(base['age_years'])
        sex = base.get('sex', 'Female')
        if sex not in {'Male', 'Female'}:
            sex = 'Female'
        gait = clamp(float(base['gait_speed_mps']), 0.45, 1.55)
        stride_cm = clamp(float(base['stride_length_m']) * 100.0, 85.0, 150.0)
        double_support = float(base['double_support_pct'])
        symmetry = float(base['symmetry_index'])
        tug_seed = float(base['tug_seconds'])
        risk_level = base.get('fall_risk', 'moderate')

        # Past falls: weighted draw over 0..3, with weights chosen by the
        # seed risk level (anything other than high/moderate uses the last set).
        fall_options = [0, 1, 2, 3]
        if risk_level == 'high':
            fall_weights = [0.12, 0.44, 0.28, 0.16]
        elif risk_level == 'moderate':
            fall_weights = [0.58, 0.28, 0.11, 0.03]
        else:
            fall_weights = [0.85, 0.13, 0.02, 0.0]
        falls = random.choices(fall_options, weights=fall_weights, k=1)[0]
        # A slow seed TUG or high double support adds one fall with
        # probability 0.35; the extra draw only happens when falls < 3.
        if tug_seed > 13.0 or double_support > 34.0:
            if falls < 3 and random.random() < 0.35:
                falls += 1
        falls = int(clamp(falls, 0, 3))

        # Assistive device: tiered by gait speed / double support, with a
        # random draw in every tier except the last.
        if gait < 0.75 or double_support > 36.0:
            assistive = 'Walker' if random.random() < 0.55 else 'Cane'
        elif gait < 0.9 or double_support > 32.0:
            assistive = 'Cane' if random.random() < 0.65 else 'None'
        elif gait < 1.0 or double_support > 28.0:
            assistive = 'Cane' if random.random() < 0.28 else 'None'
        else:
            assistive = 'None'

        # Each feature below is a Gaussian draw around a linear mean of the
        # inputs, clamped to a fixed range.
        bmi_mu = 27.2 + 0.09 * (age - 72) - 1.5 * (gait - 1.0) + (0.6 if sex == 'Female' else 0.0)
        bmi = clamp(random.gauss(bmi_mu, 1.25), 18.0, 39.5)

        bp_mu = 128.0 + 0.9 * (age - 70) - 5.0 * (gait - 1.0) + 0.06 * (double_support - 28)
        systolic = clamp(random.gauss(bp_mu, 9.5), 95.0, 190.0)

        sway_mu = 1.6 + 0.05 * (double_support - 28) + 2.6 * symmetry + 0.8 * max(0.0, 0.9 - gait)
        sway = clamp(random.gauss(sway_mu, 0.25), 0.4, 6.5)

        med_mu = 3.0 + 0.12 * (age - 70) + 0.4 * max(0.0, double_support - 30) / 5 + 0.35 * falls
        if assistive != 'None':
            med_mu += 0.6
        medication = int(round(clamp(random.gauss(med_mu, 1.1), 0, 11)))

        # Chronic conditions depend on the medication count drawn just above.
        chronic_mu = 1.5 + 0.07 * (age - 70) + 0.4 * (medication - 3) + 0.3 * falls
        chronic = int(round(clamp(random.gauss(chronic_mu, 0.8), 0, 7)))

        dual_mu = 9.0 + 0.65 * (tug_seed - 9.5) + 0.09 * (double_support - 28) + 0.35 * falls
        if assistive != 'None':
            dual_mu += 2.4
        dual_task = clamp(random.gauss(dual_mu, 2.4), 0.0, 45.0)

        fear_mu = 9.0 + 1.05 * (tug_seed - 9.5) + 1.5 * falls
        if assistive != 'None':
            fear_mu += 2.0
        fear = int(round(clamp(random.gauss(fear_mu, 2.5), 0, 28)))

        strength_mu = 69.0 + 12.0 * (gait - 1.0) - 0.45 * (age - 70) - 6.0 * max(0.0, symmetry - 0.12)
        strength = clamp(random.gauss(strength_mu, 3.5), 40.0, 95.0)

        reaction_mu = 620.0 - 65.0 * (gait - 1.0) + 20.0 * (tug_seed - 10.0) + 14.0 * falls + 3.0 * max(0.0, double_support - 28)
        reaction = clamp(random.gauss(reaction_mu, 28.0), 360.0, 920.0)

        # Assemble the engineered row; age is clamped to 55-95 only here, so
        # the means above use the unclamped age.
        record = {
            'participant_id': base['participant_id'],
            'age_years': round(clamp(age, 55.0, 95.0), 1),
            'sex': sex,
            'bmi': round(bmi, 1),
            'systolic_bp': round(systolic, 1),
            'gait_speed_m_s': round(gait, 3),
            'stride_length_cm': round(stride_cm, 1),
            'postural_sway_cm': round(sway, 3),
            'medication_count': medication,
            'chronic_conditions': chronic,
            'past_falls_6mo': falls,
            'assistive_device': assistive,
            'dual_task_cost_percent': round(dual_task, 2),
            'fear_of_falling_score': fear,
            'muscle_strength_score': round(strength, 1),
            'reaction_time_ms': round(reaction, 1),
        }
        derived.append(record)
    return derived

# Build the engineered feature rows from the raw seed table (one per raw row).
seed_feature_rows = derive_tabular_seed(raw_seed_rows)
print(f'Derived engineered seed feature rows: {len(seed_feature_rows)}')


Loaded 2000 raw seed rows from /workspace/Fallrisk-gait/data/seed_fallrisk.csv


Derived engineered seed feature rows: 2000


In [3]:
def logistic(x: float) -> float:
    """Return the logistic sigmoid ``1 / (1 + exp(-x))``.

    Very negative inputs, for which ``exp(-x)`` overflows a float, return the
    limiting value ``0.0`` instead of raising ``OverflowError``. All other
    inputs give exactly the value of the plain formula.
    """
    try:
        return 1.0 / (1.0 + math.exp(-x))
    except OverflowError:
        # exp(-x) is beyond float range, so the quotient is 0 to double precision.
        return 0.0

def apply_label_policy(record: dict) -> dict:
    """Attach a simulated TUG time and the derived risk labels to ``record``.

    The record is modified in place and also returned. Three keys are written:
    ``tug_seconds``, ``label_high_fall_risk`` (0 or 1) and ``label_risk_level``
    (``'high'``, ``'moderate'`` or ``'low'``). One Gaussian noise term is drawn
    from the module-level ``random`` generator per call.
    """
    # Accumulate term by term, in a fixed order, so floating-point rounding
    # stays stable.
    tug = 8.5
    tug += 0.065 * (record['age_years'] - 65)
    tug += 0.085 * (record['bmi'] - 26)
    tug -= 2.2 * (record['gait_speed_m_s'] - 1.0)
    tug += 0.2 * max(0, record['postural_sway_cm'] - 2.4)
    tug += 0.065 * record['dual_task_cost_percent']
    tug += 0.042 * record['fear_of_falling_score']
    tug += 0.5 * record['past_falls_6mo']
    uses_device = record['assistive_device'] != 'None'
    tug += 2.1 if uses_device else 0
    tug += random.gauss(0, 1.1)

    # Keep the simulated time inside [6.2, 37.0] seconds.
    tug = tug if tug < 37.0 else 37.0
    tug = tug if tug > 6.2 else 6.2
    record['tug_seconds'] = round(tug, 3)

    # Any single trigger is enough for the high-risk label.
    is_high = int(
        tug >= 13.5
        or record['gait_speed_m_s'] < 0.8
        or record['past_falls_6mo'] >= 1
        or record['dual_task_cost_percent'] >= 22
        or uses_device
    )
    record['label_high_fall_risk'] = is_high
    if is_high:
        record['label_risk_level'] = 'high'
        return record

    if (
        tug >= 11.2
        or record['fear_of_falling_score'] >= 16
        or record['medication_count'] >= 6
        or record['chronic_conditions'] >= 3
    ):
        record['label_risk_level'] = 'moderate'
    else:
        record['label_risk_level'] = 'low'
    return record

# Label a copy of every engineered seed row so seed_feature_rows stays untouched.
seed_rows = [apply_label_policy(dict(row)) for row in seed_feature_rows]

# Normalise the count/label columns to plain Python ints, column by column.
_INTEGER_SEED_COLUMNS = (
    'medication_count',
    'chronic_conditions',
    'past_falls_6mo',
    'fear_of_falling_score',
    'label_high_fall_risk',
)
for col in _INTEGER_SEED_COLUMNS:
    for row in seed_rows:
        row[col] = int(row[col])
print('Applied label policy to engineered seed rows.')


Applied label policy to engineered seed rows.


In [4]:
# Report which synthesis path will be attempted first.
if not SDV_AVAILABLE:
    print('SDV unavailable; using lightweight GaussianCopulaSynthesizer fallback.')
    if SDV_IMPORT_ERROR is not None:
        print(f'  Import error: {SDV_IMPORT_ERROR}')
else:
    print('SDV is available. Metadata-driven synthesizer will be attempted first.')

class GaussianCopulaSynthesizer:
    """Dependency-free fallback sampler for numeric columns.

    ``fit`` estimates per-column means and a (population) covariance matrix.
    ``sample`` draws correlated values as ``mean + L @ z``, where ``L`` is the
    Cholesky factor of that covariance and ``z`` is a vector of standard
    normal draws from the module-level ``random`` generator. No marginal
    transformation is applied, so sampled values are jointly Gaussian.
    """

    def __init__(self, columns):
        """Store the column names to model, in the order given."""
        # Copy so a caller's later mutation (or a one-shot iterable) cannot
        # desynchronise the column list from the matrices below.
        self.columns = list(columns)
        self.means = {col: 0.0 for col in self.columns}
        self.cov = [[0.0 for _ in self.columns] for _ in self.columns]
        # Set by fit(); None marks the model as not fitted yet.
        self._chol = None

    def fit(self, data):
        """Estimate means and covariance from ``data``.

        Args:
            data: Iterable of mappings; each must hold a float-convertible
                value for every configured column.

        Raises:
            ValueError: If ``data`` contains no rows.
        """
        rows = list(data)
        n = len(rows)
        if n == 0:
            raise ValueError('Cannot fit GaussianCopulaSynthesizer on an empty dataset')

        # Parse every column once and centre it, rather than re-parsing each
        # value inside the pairwise covariance loop.
        centered = []
        for col in self.columns:
            vals = [float(row[col]) for row in rows]
            mean = sum(vals) / n
            self.means[col] = mean
            centered.append([v - mean for v in vals])

        # The covariance matrix is symmetric: compute the lower triangle and
        # mirror it.
        for i, dev_i in enumerate(centered):
            for j in range(i + 1):
                dev_j = centered[j]
                total = 0.0
                for a, b in zip(dev_i, dev_j):
                    total += a * b
                value = total / n
                if i == j:
                    # Small ridge on the diagonal keeps the matrix factorisable
                    # when a column is constant.
                    value += 1e-3
                self.cov[i][j] = value
                self.cov[j][i] = value
        self._chol = self._cholesky(self.cov)

    @staticmethod
    def _cholesky(matrix):
        """Return the lower-triangular Cholesky factor of ``matrix``.

        Diagonal pivots below ``1e-6`` are floored to ``1e-6`` so that a
        slightly non-positive-definite input still yields a usable factor.
        """
        n = len(matrix)
        L = [[0.0] * n for _ in range(n)]
        for i in range(n):
            for j in range(i + 1):
                s = sum(L[i][k] * L[j][k] for k in range(j))
                if i == j:
                    val = matrix[i][i] - s
                    if val < 1e-6:
                        val = 1e-6
                    L[i][j] = math.sqrt(val)
                else:
                    L[i][j] = (matrix[i][j] - s) / L[j][j] if L[j][j] else 0.0
        return L

    def sample(self, num_rows):
        """Draw ``num_rows`` rows as dicts mapping column name to float.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self._chol is None:
            raise RuntimeError('GaussianCopulaSynthesizer.sample() called before fit()')
        samples = []
        for _ in range(num_rows):
            z = [random.gauss(0, 1) for _ in self.columns]
            values = {}
            for idx, col in enumerate(self.columns):
                mean = self.means[col]
                correlated = mean + sum(self._chol[idx][k] * z[k] for k in range(len(z)))
                values[col] = correlated
            samples.append(values)
        return samples


SDV is available. Metadata-driven synthesizer will be attempted first.


In [5]:
# Filled by the SDV pipeline when it succeeds; if it is still empty afterwards,
# the lightweight fallback synthesizer populates it instead.
synthetic_records: list[dict] = []

def safe_float(value, default):
    """Convert ``value`` to ``float``, substituting ``default`` when unusable.

    ``default`` is returned when the conversion raises or yields NaN.
    Infinities are passed through unchanged.
    """
    try:
        parsed = float(value)
    except Exception:
        # Treat an unconvertible value like NaN so both cases share one exit.
        parsed = math.nan
    return default if math.isnan(parsed) else parsed

# Preferred path: fit SDV's Gaussian copula on the labelled seed rows and
# sample from it. Any exception raised inside the `try` is reported and leaves
# `synthetic_records` empty, which triggers the fallback path further down.
if SDV_AVAILABLE and pd is not None:
    try:
        # Build the training frame and pin the dtypes of the count, label
        # and id columns.
        seed_df = pd.DataFrame(seed_rows)
        int_columns = ['medication_count', 'chronic_conditions', 'past_falls_6mo', 'fear_of_falling_score']
        for col in int_columns:
            seed_df[col] = seed_df[col].astype(int)
        seed_df['label_high_fall_risk'] = seed_df['label_high_fall_risk'].astype(bool)
        seed_df['participant_id'] = seed_df['participant_id'].astype(str)

        # Auto-detect column types, then override the ones known up front.
        metadata = SingleTableMetadata()
        metadata.detect_from_dataframe(seed_df)
        metadata.set_primary_key('participant_id')
        for column in ['sex', 'assistive_device', 'label_risk_level']:
            metadata.update_column(column, sdtype='categorical')
        metadata.update_column('label_high_fall_risk', sdtype='boolean')

        print('SDV metadata detection complete:')
        for name, info in metadata.columns.items():
            print(f"  - {name}: {info['sdtype']}")

        sdv_synthesizer = SDVGaussianCopulaSynthesizer(metadata)
        sdv_synthesizer.fit(seed_df)
        sampled_df = sdv_synthesizer.sample(num_rows=50000)
        print('SDV GaussianCopula sampling finished.')

        allowed_sex = {'Female', 'Male'}
        allowed_device = {'None', 'Cane', 'Walker'}

        # Sanitize every sampled row: unknown categories fall back to a
        # default, numeric values are clamped to fixed ranges, and missing or
        # NaN values are replaced by per-column defaults via safe_float.
        sanitized = []
        for idx, sample in enumerate(sampled_df.to_dict(orient='records'), start=1):
            sex_value = sample.get('sex', 'Female')
            if sex_value not in allowed_sex:
                sex_value = 'Female'
            device_value = sample.get('assistive_device', 'None')
            if device_value not in allowed_device:
                device_value = 'None'

            # The sampled participant_id, TUG time and labels are not copied:
            # ids are regenerated here, and TUG/labels are recomputed by
            # apply_label_policy below.
            rec = {
                'participant_id': f'SYN_{idx:05d}',
                'age_years': round(clamp(safe_float(sample.get('age_years', 72.0), 72.0), 55.0, 95.0), 1),
                'sex': sex_value,
                'bmi': round(clamp(safe_float(sample.get('bmi', 27.2), 27.2), 17.0, 45.0), 1),
                'systolic_bp': round(clamp(safe_float(sample.get('systolic_bp', 132.0), 132.0), 95.0, 195.0), 1),
                'gait_speed_m_s': round(clamp(safe_float(sample.get('gait_speed_m_s', 1.0), 1.0), 0.35, 1.8), 3),
                'stride_length_cm': round(clamp(safe_float(sample.get('stride_length_cm', 120.0), 120.0), 85.0, 150.0), 1),
                'postural_sway_cm': round(clamp(safe_float(sample.get('postural_sway_cm', 2.1), 2.1), 0.4, 7.0), 3),
                'medication_count': int(clamp(round(safe_float(sample.get('medication_count', 4), 4.0)), 0, 12)),
                'chronic_conditions': int(clamp(round(safe_float(sample.get('chronic_conditions', 2), 2.0)), 0, 7)),
                'past_falls_6mo': int(clamp(round(safe_float(sample.get('past_falls_6mo', 0), 0.0)), 0, 3)),
                'assistive_device': device_value,
                'dual_task_cost_percent': round(clamp(safe_float(sample.get('dual_task_cost_percent', 12.0), 12.0), 0.0, 55.0), 2),
                'fear_of_falling_score': int(clamp(round(safe_float(sample.get('fear_of_falling_score', 12), 12.0)), 0, 28)),
                'muscle_strength_score': round(clamp(safe_float(sample.get('muscle_strength_score', 66.0), 66.0), 35.0, 95.0), 1),
                'reaction_time_ms': round(clamp(safe_float(sample.get('reaction_time_ms', 640.0), 640.0), 320.0, 950.0), 1),
            }
            sanitized.append(apply_label_policy(rec))
        # Publish only after every row has been processed, so a mid-run
        # failure never leaves a partial result behind.
        synthetic_records = sanitized
    except Exception as exc:
        print(f'SDV synthesizer pipeline failed: {exc}')
        SDV_AVAILABLE = False

# Fallback path: runs when SDV was skipped or failed, or produced no rows.
# The numeric columns are sampled jointly; falls, assistive device and sex are
# then drawn per row from logistic models. The order of the random draws below
# determines the output for a given seed.
if not synthetic_records:
    synthesizer = GaussianCopulaSynthesizer(continuous_cols)
    synthesizer.fit(seed_rows)

    synthetic_records = []
    for idx, sampled in enumerate(synthesizer.sample(50000), start=1):
        # Clamp every sampled value to its allowed range; the count-like
        # columns are also rounded to integers.
        age = clamp(sampled['age_years'], 55.0, 95.0)
        bmi = clamp(sampled['bmi'], 17.0, 45.0)
        systolic = clamp(sampled['systolic_bp'], 95.0, 195.0)
        gait = clamp(sampled['gait_speed_m_s'], 0.35, 1.8)
        stride = clamp(sampled['stride_length_cm'], 85.0, 150.0)
        sway = clamp(sampled['postural_sway_cm'], 0.4, 7.0)
        med = int(round(clamp(sampled['medication_count'], 0.0, 12.0)))
        chronic = int(round(clamp(sampled['chronic_conditions'], 0.0, 7.0)))
        dual_task = clamp(sampled['dual_task_cost_percent'], 0.0, 55.0)
        fear = int(round(clamp(sampled['fear_of_falling_score'], 0.0, 28.0)))
        muscle = clamp(sampled['muscle_strength_score'], 35.0, 95.0)
        reaction = clamp(sampled['reaction_time_ms'], 320.0, 950.0)
        # Past falls: with probability fall_prob the row has at least one
        # fall, split 60% / 25% / 15% between 1, 2 and 3 falls.
        fall_prob = logistic(-0.95 + 0.08 * (age - 70) - 1.5 * (gait - 0.9) + 0.05 * (dual_task - 12) + 0.12 * max(chronic - 2, 0))
        rand_fall = random.random()
        if rand_fall < fall_prob * 0.6:
            falls = 1
        elif rand_fall < fall_prob * 0.85:
            falls = 2
        elif rand_fall < fall_prob:
            falls = 3
        else:
            falls = 0
        # Assistive device: one draw decides whether a device is used, and a
        # second draw (made only in that case) picks walker versus cane.
        device_score = logistic(-0.8 + 0.07 * (age - 70) - 2.4 * (gait - 0.9) + 0.03 * (dual_task - 12) + 0.6 * falls)
        if random.random() < device_score:
            assistive = 'Walker' if random.random() < 0.38 + 0.12 * falls else 'Cane'
        else:
            assistive = 'None'
        # Sex is drawn from a logistic model that depends only on age.
        sex_prob = logistic(-0.35 + 0.01 * (age - 70))
        sex = 'Female' if random.random() < sex_prob else 'Male'
        record = {
            'participant_id': f'SYN_{idx:05d}',
            'age_years': round(age, 1),
            'sex': sex,
            'bmi': round(bmi, 1),
            'systolic_bp': round(systolic, 1),
            'gait_speed_m_s': round(gait, 3),
            'stride_length_cm': round(stride, 1),
            'postural_sway_cm': round(sway, 3),
            'medication_count': med,
            'chronic_conditions': chronic,
            'past_falls_6mo': falls,
            'assistive_device': assistive,
            'dual_task_cost_percent': round(dual_task, 2),
            'fear_of_falling_score': fear,
            'muscle_strength_score': round(muscle, 1),
            'reaction_time_ms': round(reaction, 1),
        }
        # TUG time and risk labels are computed from the finished feature row.
        synthetic_records.append(apply_label_policy(record))

# Column order of both output files.
fieldnames = [
    'participant_id',
    'age_years',
    'sex',
    'bmi',
    'systolic_bp',
    'gait_speed_m_s',
    'stride_length_cm',
    'postural_sway_cm',
    'medication_count',
    'chronic_conditions',
    'past_falls_6mo',
    'assistive_device',
    'dual_task_cost_percent',
    'fear_of_falling_score',
    'muscle_strength_score',
    'reaction_time_ms',
    'tug_seconds',
    'label_high_fall_risk',
    'label_risk_level',
]

full_path = OUTPUT_DIR / 'fallrisk_tabular_v1.csv'
sample_path = OUTPUT_DIR / 'sample_1k.csv'

# Write the full table first, then the preview made of its first 1000 rows.
for target_path, target_rows in (
    (full_path, synthetic_records),
    (sample_path, synthetic_records[:1000]),
):
    with target_path.open('w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(target_rows)

print(f'Synthesized {len(synthetic_records)} rows -> {full_path.resolve()}')
print(f'Sample preview saved to {sample_path.resolve()}')


SDV metadata detection complete:
  - participant_id: id
  - age_years: numerical
  - sex: categorical
  - bmi: numerical
  - systolic_bp: numerical
  - gait_speed_m_s: numerical
  - stride_length_cm: numerical
  - postural_sway_cm: numerical
  - medication_count: categorical
  - chronic_conditions: categorical
  - past_falls_6mo: categorical
  - assistive_device: categorical
  - dual_task_cost_percent: numerical
  - fear_of_falling_score: numerical
  - muscle_strength_score: numerical
  - reaction_time_ms: numerical
  - tug_seconds: numerical
  - label_high_fall_risk: boolean
  - label_risk_level: categorical




SDV GaussianCopula sampling finished.


Synthesized 50000 rows -> /workspace/Fallrisk-gait/datasets/fallrisk/fallrisk_tabular_v1.csv
Sample preview saved to /workspace/Fallrisk-gait/datasets/fallrisk/sample_1k.csv


In [6]:
# Tally the label distribution of the synthesized table.
risk_levels = [r['label_risk_level'] for r in synthetic_records]
high_count = sum([r['label_high_fall_risk'] for r in synthetic_records])
moderate_count = risk_levels.count('moderate')
low_count = risk_levels.count('low')
print(f'Risk level counts -> high: {high_count}, moderate: {moderate_count}, low: {low_count}')


Risk level counts -> high: 24965, moderate: 12376, low: 12659


In [7]:
import json

# Load the schema that the generated table is validated against.
schema_path = OUTPUT_DIR / 'schema.json'
with schema_path.open() as f:
    schema = json.load(f)

# Re-read the CSV from disk so validation covers what was actually written.
# newline='' lets the csv module do its own line-ending handling, as its
# documentation requires for files passed to csv readers.
with full_path.open(newline='') as f:
    reader = csv.DictReader(f)
    rows = list(reader)

# The header must match the schema's field list exactly, including order.
expected_fields = list(schema['fields'].keys())
if reader.fieldnames != expected_fields:
    raise ValueError(f'Field order mismatch: expected {expected_fields}, found {reader.fieldnames}')

# The row count is only enforced when the schema declares one.
expected_row_count = schema.get('rows')
if expected_row_count is not None and len(rows) != expected_row_count:
    raise ValueError(f'Row count mismatch: expected {expected_row_count}, found {len(rows)}')

def _check_range(num: float, rng):
    if rng is None:
        return
    lower, upper = rng
    if lower is not None and num < lower - 1e-6:
        raise ValueError(f'Value {num} below minimum {lower}')
    if upper is not None and num > upper + 1e-6:
        raise ValueError(f'Value {num} above maximum {upper}')

# Validate every cell of the reloaded CSV against its schema field spec.
# All failures are raised as ValueError and name the offending field and row.
for idx, row in enumerate(rows, start=1):
    for field, spec in schema['fields'].items():
        value = row[field]
        ftype = spec.get('type')

        if ftype == 'categorical':
            allowed = spec.get('values')
            if allowed and value not in allowed:
                raise ValueError(f'Unexpected category {value} for {field} at row {idx}')
            continue
        if ftype == 'string':
            if value is None:
                raise ValueError(f'Missing string value for {field} at row {idx}')
            continue

        # Everything else is numeric: 'integer' and 'binary' are rounded to
        # int, while 'float', 'number' and unrecognised types stay floats.
        try:
            num = float(value)
            if ftype in {'integer', 'binary'}:
                num = int(round(num))
            if ftype != 'binary':
                _check_range(num, spec.get('range'))
        except (TypeError, ValueError, OverflowError) as exc:
            # Parse and range errors carry no location of their own, so add
            # the field and row to match the other validation messages.
            raise ValueError(f'Invalid value {value!r} for {field} at row {idx}: {exc}') from exc

        if ftype == 'binary':
            allowed = {int(v) for v in spec.get('values', [0, 1])}
            if num not in allowed:
                raise ValueError(f'Unexpected binary value {num} for {field} at row {idx}')

print(f"Schema validation passed for {len(rows)} rows against {schema_path.name}.")


Schema validation passed for 50000 rows against schema.json.
