Imports.

In [None]:
# cell 1

import pandas as pd
import numpy as np
from scipy.spatial.distance import mahalanobis
from scipy.optimize import linear_sum_assignment
from scipy.stats import wilcoxon
import matplotlib.pyplot as plt

# Fix the state of NumPy's global (legacy) random generator so that every
# random draw made later in the notebook is repeatable from run to run.
np.random.seed(seed=42)


Data Generation.

In [None]:
# cell 2

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

n_patients = 400
np.random.seed(42)  # re-seed here so this cell gives the same data when run on its own

# 1. Entry Time: one uniform draw per patient from the interval [0, 24)
entry_time = np.random.uniform(0, 24, size=n_patients)

# 2. Treatment Assignment & Time
treatment_probability = 0.4
treated = np.zeros(n_patients, dtype=int)
treatment_time = np.full(n_patients, np.nan)  # stays NaN for untreated patients

# One Bernoulli draw per patient; a treatment time is drawn only on success,
# so the two kinds of draw are interleaved in patient order.
for idx, entered_at in enumerate(entry_time):
    if np.random.binomial(1, treatment_probability) != 1:
        continue
    treated[idx] = 1
    # Treatment happens between 3 and 24 time units after entry.
    earliest = entered_at + 3
    treatment_time[idx] = np.random.uniform(earliest, earliest + 21)

# 3. Baseline Symptoms: integer scores; pain and urgency fall in 0-9,
# frequency falls in 0-20 (randint's upper bound is exclusive).
baseline_pain = np.random.randint(0, 10, size=n_patients)
baseline_urgency = np.random.randint(0, 10, size=n_patients)
baseline_frequency = np.random.randint(0, 21, size=n_patients)

# 4. Symptom Simulation
def simulate_symptom_change(baseline, max_change=3, min_value=0, max_value=9):
    """Return ``baseline`` moved by a random integer step, clipped to a scale.

    Parameters
    ----------
    baseline : int or float
        Current symptom score.
    max_change : int, default 3
        Largest absolute step. The step is drawn uniformly from the integers
        ``-max_change`` to ``max_change``, both inclusive.
    min_value, max_value : int, default 0 and 9
        Inclusive bounds of the symptom scale. The defaults reproduce the
        original fixed 0-9 clipping, so existing two-argument calls behave
        exactly as before.

    Returns
    -------
    numpy scalar
        ``baseline + step`` limited to ``[min_value, max_value]``.
    """
    change = np.random.randint(-max_change, max_change + 1)
    return np.clip(baseline + change, min_value, max_value)


# Frequency baselines are drawn from 0-20, unlike pain and urgency (0-9).
# Clipping frequency to the default 0-9 scale would silently cap every
# simulated frequency value at 9, so it gets its own ceiling.
frequency_max = 20

# Treatment Symptoms
# A value is simulated for every patient (treated or not) and then masked with
# NaN for the untreated ones; this keeps the number and order of random draws
# the same regardless of who is treated.
is_treated = treated == 1
treatment_pain = np.where(
    is_treated,
    [simulate_symptom_change(score) for score in baseline_pain],
    np.nan,
)
treatment_urgency = np.where(
    is_treated,
    [simulate_symptom_change(score) for score in baseline_urgency],
    np.nan,
)
treatment_frequency = np.where(
    is_treated,
    [simulate_symptom_change(score, max_value=frequency_max) for score in baseline_frequency],
    np.nan,
)

# Follow-up Symptoms (3m and 6m)
# Untreated patients keep NaN in all follow-up columns.
pain_3m = np.full(n_patients, np.nan)
urgency_3m = np.full(n_patients, np.nan)
frequency_3m = np.full(n_patients, np.nan)
pain_6m = np.full(n_patients, np.nan)
urgency_6m = np.full(n_patients, np.nan)
frequency_6m = np.full(n_patients, np.nan)

# Each follow-up score is a random step away from the previous time point:
# treatment -> 3m -> 6m. Draw order per patient is pain, urgency, frequency.
for i in np.flatnonzero(is_treated):
    pain_3m[i] = simulate_symptom_change(treatment_pain[i])
    urgency_3m[i] = simulate_symptom_change(treatment_urgency[i])
    frequency_3m[i] = simulate_symptom_change(treatment_frequency[i], max_value=frequency_max)

    pain_6m[i] = simulate_symptom_change(pain_3m[i])
    urgency_6m[i] = simulate_symptom_change(urgency_3m[i])
    frequency_6m[i] = simulate_symptom_change(frequency_3m[i], max_value=frequency_max)

# 5. Other Variables (without unobserved factor)
age = np.random.randint(20, 81, size=n_patients)    # integer ages, 20 to 80 inclusive
gender = np.random.choice([0, 1], size=n_patients)  # binary indicator, 0 or 1

# 6. Create DataFrame (without unobserved factor)
# Columns are listed in their final order: timing and treatment flag first,
# then the three symptoms at each time point, then demographics.
df = pd.DataFrame(dict([
    ('entry_time', entry_time),
    ('treatment_time', treatment_time),
    ('treated', treated),
    # symptoms at study entry
    ('pain_baseline', baseline_pain),
    ('urgency_baseline', baseline_urgency),
    ('frequency_baseline', baseline_frequency),
    # symptoms at treatment (NaN for untreated patients)
    ('pain_treatment', treatment_pain),
    ('urgency_treatment', treatment_urgency),
    ('frequency_treatment', treatment_frequency),
    # follow-up symptoms
    ('pain_3m', pain_3m),
    ('urgency_3m', urgency_3m),
    ('frequency_3m', frequency_3m),
    ('pain_6m', pain_6m),
    ('urgency_6m', urgency_6m),
    ('frequency_6m', frequency_6m),
    # demographics
    ('age', age),
    ('gender', gender),
]))

# 7. Sort and Standardize
# Order patients by entry time and renumber the rows 0..n-1.
df = df.sort_values('entry_time', ignore_index=True)

# Only the three baseline symptom columns are rescaled to zero mean and unit
# variance; the treatment and follow-up columns keep their raw scores.
symptom_cols = ['pain_baseline', 'urgency_baseline', 'frequency_baseline']
scaler = StandardScaler().fit(df[symptom_cols])
df[symptom_cols] = scaler.transform(df[symptom_cols])

# Persist the dataset without the row index, then preview the first rows.
df.to_csv("synthetic_data.csv", index=False)
df.head()