# 02 · Gaussian copula synthesis
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ORG/Fallrisk-gait/blob/main/datasets/fallrisk/notebooks/02_synthesize.ipynb)

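Fit a Gaussian-copula-style synthesizer to the seed data, sample 50k synthetic rows, recompute derived fields and labels, and persist both `fallrisk_tabular_v1.csv` and the 1k preview subset (`sample_1k.csv`). Note: the notebook checks whether SDV's `GaussianCopulaSynthesizer` can be imported, but it always fits the lightweight pure-Python fallback defined below, even when SDV is available.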

In [None]:
from pathlib import Path
import csv
import math
import random

def locate_repo_root(max_depth: int = 6) -> Path:
    """Find the repository root by walking upward from the working directory.

    A directory qualifies as the root when it contains both a ``datasets``
    and a ``data`` entry. At most ``max_depth`` directories are inspected,
    beginning with the current working directory itself.

    Returns the first qualifying directory, or the current working directory
    when none is found within ``max_depth`` levels or the filesystem root is
    reached first.
    """
    candidate = Path.cwd()
    for _level in range(max_depth):
        has_markers = (candidate / 'datasets').exists() and (candidate / 'data').exists()
        if has_markers:
            return candidate
        parent = candidate.parent
        if parent == candidate:
            # A path that is its own parent is the filesystem root.
            break
        candidate = parent
    return Path.cwd()

# Resolve the project directories relative to the detected repository root.
ROOT = locate_repo_root()
DATA_DIR = ROOT.joinpath('data')
OUTPUT_DIR = ROOT.joinpath('datasets', 'fallrisk')
# Create the output directory (and any missing parents) before anything is written.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
# Seed Python's global RNG so every later random draw is repeatable.
random.seed(99)


In [None]:
# Load the seed table as a list of dicts; csv.DictReader yields every value as a string.
seed_path = DATA_DIR / 'seed_fallrisk.csv'
# newline='' hands newline handling to the csv module, as its documentation
# requires, so newlines embedded in quoted fields are parsed correctly.
with seed_path.open(newline='') as f:
    seed_rows = list(csv.DictReader(f))
print(f'Loaded {len(seed_rows)} seed rows from {seed_path.resolve()}')

# Numeric seed columns that the synthesizer models jointly.
continuous_cols = [
    'age_years',
    'bmi',
    'systolic_bp',
    'gait_speed_m_s',
    'stride_length_cm',
    'postural_sway_cm',
    'medication_count',
    'chronic_conditions',
    'dual_task_cost_percent',
    'fear_of_falling_score',
    'muscle_strength_score',
    'reaction_time_ms',
]

# Probe for the optional SDV dependency and record the outcome in SDV_AVAILABLE.
# The broad `except Exception` is kept on purpose: any failure while importing
# SDV is treated the same as the package being absent.
try:
    from sdv.single_table import GaussianCopulaSynthesizer as SDVGaussianCopulaSynthesizer  # type: ignore
except Exception:
    SDV_AVAILABLE = False
else:
    SDV_AVAILABLE = True


In [None]:
def logistic(x: float) -> float:
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``.

    For very negative ``x`` the intermediate ``exp(-x)`` exceeds the float
    range and ``math.exp`` raises ``OverflowError``; the sigmoid's limit there
    is 0, so 0.0 is returned instead of propagating the error. All other
    inputs are evaluated with the original expression, so their results are
    unchanged.
    """
    try:
        return 1.0 / (1.0 + math.exp(-x))
    except OverflowError:
        # exp(-x) overflowed, i.e. x is hugely negative: the sigmoid is ~0.
        return 0.0

def apply_label_policy(record: dict) -> dict:
    """Attach a noisy ``tug_seconds`` value and fall-risk labels to ``record``.

    The record is modified in place and also returned. Three keys are added,
    in this order: ``tug_seconds`` (rounded to 3 decimals),
    ``label_high_fall_risk`` (0 or 1) and ``label_risk_level``
    (``'high'``, ``'moderate'`` or ``'low'``).

    Exactly one ``random.gauss`` draw from the global RNG is consumed per call.
    """
    # Individual contributions to the TUG estimate, read in a fixed order.
    age_term = 0.065 * (record['age_years'] - 65)
    bmi_term = 0.085 * (record['bmi'] - 26)
    gait_speed = record['gait_speed_m_s']
    gait_term = 2.2 * (gait_speed - 1.0)
    sway_term = 0.2 * max(0, record['postural_sway_cm'] - 2.4)
    dual_task_cost = record['dual_task_cost_percent']
    dual_term = 0.065 * dual_task_cost
    fear_score = record['fear_of_falling_score']
    fear_term = 0.042 * fear_score
    past_falls = record['past_falls_6mo']
    falls_term = 0.5 * past_falls
    uses_device = record['assistive_device'] != 'None'
    device_term = 2.1 if uses_device else 0

    # Summation order is kept fixed so floating-point results stay reproducible.
    baseline = (8.5 + age_term + bmi_term - gait_term + sway_term
                + dual_term + fear_term + falls_term + device_term)
    # Add Gaussian noise, then confine the value to the [6.2, 37.0] range.
    tug = max(6.2, min(37.0, baseline + random.gauss(0, 1.1)))
    record['tug_seconds'] = round(tug, 3)

    # Any single trigger is enough for the high-risk flag.
    is_high = int(tug >= 13.5 or gait_speed < 0.8 or past_falls >= 1
                  or dual_task_cost >= 22 or uses_device)
    record['label_high_fall_risk'] = is_high

    if is_high:
        level = 'high'
    elif (tug >= 11.2 or fear_score >= 16
          or record['medication_count'] >= 6 or record['chronic_conditions'] >= 3):
        level = 'moderate'
    else:
        level = 'low'
    record['label_risk_level'] = level
    return record


In [None]:
# Informational only: this check does not select a synthesizer. The message
# states that the fallback class is used even when SDV imported successfully.
if SDV_AVAILABLE:
    print('SDV is available. The fallback synthesizer will still be used to avoid optional pandas dependency in this minimal environment.')

class GaussianCopulaSynthesizer:
    """Minimal pure-Python fallback synthesizer for numeric columns.

    The model is a single multivariate normal: ``fit`` estimates per-column
    means and the population covariance matrix (divided by ``n``), and
    ``sample`` draws correlated rows through the covariance's Cholesky
    factor. Marginal distributions are not transformed, so this is a plain
    Gaussian approximation rather than a full copula model.

    Sampling draws from the module-level ``random`` generator, so results are
    reproducible only when that generator is seeded by the caller.
    """

    def __init__(self, columns):
        """Store the column names to model and initialise empty statistics."""
        self.columns = columns
        self.means = {col: 0.0 for col in columns}
        self.cov = [[0.0 for _ in columns] for _ in columns]
        # Lower-triangular Cholesky factor; None until fit() has been called.
        self._chol = None

    def fit(self, data):
        """Estimate means, covariance and the Cholesky factor from ``data``.

        Args:
            data: Sequence of row mappings; ``row[col]`` must be convertible
                with ``float`` for every modelled column.

        Raises:
            ValueError: If ``data`` is empty (there is nothing to estimate).
        """
        n = len(data)
        if n == 0:
            raise ValueError('cannot fit GaussianCopulaSynthesizer on an empty dataset')

        # Parse each column exactly once and keep its mean-centred values, so
        # the covariance loop below does no repeated string-to-float work.
        centered = []
        for col in self.columns:
            vals = [float(row[col]) for row in data]
            mean = sum(vals) / n
            self.means[col] = mean
            centered.append([v - mean for v in vals])

        # The covariance matrix is symmetric: compute the lower triangle and
        # mirror it. Accumulation stays a plain running sum in row order so
        # the values match a straightforward per-pair computation exactly.
        for i, dev_i in enumerate(centered):
            for j in range(i + 1):
                dev_j = centered[j]
                total = 0.0
                for a, b in zip(dev_i, dev_j):
                    total += a * b
                value = total / n
                if i == j:
                    # Small ridge on the diagonal keeps the factorisation stable.
                    value += 1e-3
                self.cov[i][j] = value
                self.cov[j][i] = value
        self._chol = self._cholesky(self.cov)

    @staticmethod
    def _cholesky(matrix):
        """Return the lower-triangular Cholesky factor of ``matrix``.

        Diagonal pivots below 1e-6 are floored at 1e-6 so that a nearly
        singular matrix still yields a usable factor instead of a math error.
        """
        n = len(matrix)
        L = [[0.0] * n for _ in range(n)]
        for i in range(n):
            for j in range(i + 1):
                s = sum(L[i][k] * L[j][k] for k in range(j))
                if i == j:
                    val = matrix[i][i] - s
                    if val < 1e-6:
                        val = 1e-6
                    L[i][j] = math.sqrt(val)
                else:
                    L[i][j] = (matrix[i][j] - s) / L[j][j] if L[j][j] else 0.0
        return L

    def sample(self, num_rows):
        """Draw ``num_rows`` correlated rows as ``{column: float}`` dicts.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self._chol is None:
            raise RuntimeError('GaussianCopulaSynthesizer.fit() must be called before sample()')
        chol = self._chol
        dim = len(self.columns)
        samples = []
        for _ in range(num_rows):
            # One independent standard-normal draw per column, then mix them
            # through the Cholesky factor to impose the fitted covariance.
            z = [random.gauss(0, 1) for _ in self.columns]
            values = {}
            for idx, col in enumerate(self.columns):
                values[col] = self.means[col] + sum(chol[idx][k] * z[k] for k in range(dim))
            samples.append(values)
        return samples


In [None]:
# Build and fit the synthesizer on the seed rows. The name used here is the
# class defined in this notebook; the SDV class, when importable, is bound to
# the separate alias SDVGaussianCopulaSynthesizer.
synthesizer = GaussianCopulaSynthesizer(continuous_cols)
synthesizer.fit(seed_rows)

def clamp(value, lower, upper):
    """Confine ``value`` to the closed interval ``[lower, upper]``."""
    # Cap at the upper bound first, then raise to the lower bound if needed.
    capped = value if value < upper else upper
    return capped if capped > lower else lower

# (lower, upper) bounds applied to each sampled column before it is used.
_COLUMN_BOUNDS = {
    'age_years': (55.0, 95.0),
    'bmi': (17.0, 45.0),
    'systolic_bp': (95.0, 195.0),
    'gait_speed_m_s': (0.35, 1.8),
    'stride_length_cm': (60.0, 150.0),
    'postural_sway_cm': (0.4, 7.0),
    'medication_count': (0.0, 12.0),
    'chronic_conditions': (0.0, 7.0),
    'dual_task_cost_percent': (0.0, 55.0),
    'fear_of_falling_score': (0.0, 28.0),
    'muscle_strength_score': (20.0, 95.0),
    'reaction_time_ms': (300.0, 950.0),
}
# Columns that are rounded to whole numbers after clamping.
_INTEGER_COLUMNS = ('medication_count', 'chronic_conditions', 'fear_of_falling_score')


def _synthesize_record(idx, sampled):
    """Convert one raw synthesizer draw into an unlabelled participant record.

    Global-RNG draws happen in a fixed order: fall count, assistive-device
    gate, Walker/Cane choice (only when the gate passes), then sex.
    """
    vals = {name: clamp(sampled[name], lo, hi) for name, (lo, hi) in _COLUMN_BOUNDS.items()}
    for name in _INTEGER_COLUMNS:
        vals[name] = int(round(vals[name]))

    age = vals['age_years']
    gait = vals['gait_speed_m_s']
    dual_task = vals['dual_task_cost_percent']
    chronic = vals['chronic_conditions']

    # Past falls: one uniform draw split into 1 / 2 / 3 falls within fall_prob.
    fall_prob = logistic(-0.95 + 0.08 * (age - 70) - 1.5 * (gait - 0.9) + 0.05 * (dual_task - 12) + 0.12 * max(chronic - 2, 0))
    rand_fall = random.random()
    falls = 0
    for fraction, count in ((0.6, 1), (0.85, 2), (1.0, 3)):
        if rand_fall < fall_prob * fraction:
            falls = count
            break

    # Assistive device: a gate draw first, then a Walker-versus-Cane draw.
    device_score = logistic(-0.8 + 0.07 * (age - 70) - 2.4 * (gait - 0.9) + 0.03 * (dual_task - 12) + 0.6 * falls)
    assistive = 'None'
    if random.random() < device_score:
        if random.random() < 0.38 + 0.12 * falls:
            assistive = 'Walker'
        else:
            assistive = 'Cane'

    sex_prob = logistic(-0.35 + 0.01 * (age - 70))
    sex = 'Female' if random.random() < sex_prob else 'Male'

    # Key order here defines the column order of the exported CSV files.
    return {
        'participant_id': f'SYN_{idx:05d}',
        'age_years': round(age, 1),
        'sex': sex,
        'bmi': round(vals['bmi'], 1),
        'systolic_bp': round(vals['systolic_bp'], 1),
        'gait_speed_m_s': round(gait, 3),
        'stride_length_cm': round(vals['stride_length_cm'], 1),
        'postural_sway_cm': round(vals['postural_sway_cm'], 3),
        'medication_count': vals['medication_count'],
        'chronic_conditions': chronic,
        'past_falls_6mo': falls,
        'assistive_device': assistive,
        'dual_task_cost_percent': round(dual_task, 2),
        'fear_of_falling_score': vals['fear_of_falling_score'],
        'muscle_strength_score': round(vals['muscle_strength_score'], 1),
        'reaction_time_ms': round(vals['reaction_time_ms'], 1)
    }


# All 50,000 raw draws are sampled up front; each is then turned into a record
# and labelled, one record at a time, so the RNG sequence stays fixed.
synthetic_records = [
    apply_label_policy(_synthesize_record(idx, sampled))
    for idx, sampled in enumerate(synthesizer.sample(50000), start=1)
]

# Column order follows the key order of the first record.
fieldnames = list(synthetic_records[0])
full_path = OUTPUT_DIR / 'fallrisk_tabular_v1.csv'
sample_path = OUTPUT_DIR / 'sample_1k.csv'

# Write the full table first, then the preview made of the first 1000 rows.
for target, rows in ((full_path, synthetic_records), (sample_path, synthetic_records[:1000])):
    with target.open('w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

print(f'Synthesized {len(synthetic_records)} rows -> {full_path.resolve()}')
print(f'Sample preview saved to {sample_path.resolve()}')


In [None]:
# Tally the labels in a single pass: the high count sums the 0/1 flag, the
# other two count rows by their risk-level string.
high_count = moderate_count = low_count = 0
for row in synthetic_records:
    high_count += row['label_high_fall_risk']
    level = row['label_risk_level']
    if level == 'moderate':
        moderate_count += 1
    elif level == 'low':
        low_count += 1
print(f'Risk level counts -> high: {high_count}, moderate: {moderate_count}, low: {low_count}')
