In [1]:
%pip install faker

Collecting faker
  Downloading faker-37.6.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.6.0-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m1.5/1.9 MB[0m [31m46.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.6.0


In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
from faker import Faker
from scipy import stats

# Initialize faker for realistic names
fake = Faker()
np.random.seed(42)
random.seed(42)

# Configuration
NUM_PATIENTS = 5000  # Large dataset for robust modeling
START_DATE = datetime(2020, 1, 1)
END_DATE = datetime(2023, 12, 31)
DATA_DURATION = (END_DATE - START_DATE).days

print("Generating PERFECT synthetic patient data for hackathon...")

# Define clinical parameters
conditions = ['Diabetes', 'Heart Failure', 'Hypertension', 'COPD', 'Obesity', 'Chronic Kidney Disease']
genders = ['M', 'F']
races = ['White', 'Black', 'Hispanic', 'Asian', 'Other']
smoking_statuses = ['Never', 'Former', 'Current']
alcohol_statuses = ['Never', 'Occasional', 'Regular']
activity_levels = ['Sedentary', 'Light', 'Moderate', 'Active']

medications = {
    'Diabetes': ['Metformin', 'Insulin', 'Glipizide', 'Empagliflozin', 'Sitagliptin'],
    'Heart Failure': ['Lisinopril', 'Carvedilol', 'Furosemide', 'Spironolactone', 'Digoxin'],
    'Hypertension': ['Amlodipine', 'Losartan', 'Hydrochlorothiazide', 'Metoprolol', 'Valsartan'],
    'COPD': ['Albuterol', 'Tiotropium', 'Fluticasone', 'Prednisone', 'Salmeterol'],
    'Obesity': ['Orlistat', 'Phentermine', 'Liraglutide', 'Metformin', 'Semaglutide'],
    'Chronic Kidney Disease': ['Enalapril', 'Losartan', 'Furosemide', 'Atorvastatin', 'Sevelamer']
}

comorbidities_list = [
    'Hypertension', 'Hyperlipidemia', 'Coronary Artery Disease',
    'Atrial Fibrillation', 'Sleep Apnea', 'Depression', 'Anxiety',
    'Osteoarthritis', 'Asthma', 'Hypothyroidism'
]

# Create patients dataframe with enhanced demographics
patients = []
for i in range(1, NUM_PATIENTS + 1):
    patient_id = f"P{str(i).zfill(5)}"
    age = int(np.random.normal(62, 14))
    age = max(25, min(age, 95))
    gender = random.choice(genders)
    race = random.choice(races)
    condition = random.choice(conditions)

    # Generate height and weight with realistic distributions
    if gender == 'M':
        height_cm = np.random.normal(175, 7)
        base_weight = np.random.normal(85, 15)
    else:
        height_cm = np.random.normal(162, 6)
        base_weight = np.random.normal(72, 14)

    height_m = height_cm / 100
    baseline_bmi = base_weight / (height_m ** 2)

    # Set baseline values based on condition
    if condition == 'Diabetes':
        base_systolic = np.random.normal(142, 12)
        base_diastolic = np.random.normal(88, 8)
        base_hba1c = np.random.normal(7.8, 1.6)
        base_ldl = np.random.normal(115, 28)
        base_hdl = np.random.normal(42, 9)
        base_triglycerides = np.random.normal(175, 45)
        base_creatinine = np.random.normal(1.2, 0.4)
    elif condition == 'Heart Failure':
        base_systolic = np.random.normal(128, 18)
        base_diastolic = np.random.normal(78, 10)
        base_hba1c = np.random.normal(6.4, 1.1)
        base_ldl = np.random.normal(105, 25)
        base_hdl = np.random.normal(38, 8)
        base_triglycerides = np.random.normal(155, 40)
        base_creatinine = np.random.normal(1.4, 0.5)
    elif condition == 'Hypertension':
        base_systolic = np.random.normal(152, 14)
        base_diastolic = np.random.normal(92, 9)
        base_hba1c = np.random.normal(6.1, 0.9)
        base_ldl = np.random.normal(125, 32)
        base_hdl = np.random.normal(48, 11)
        base_triglycerides = np.random.normal(180, 48)
        base_creatinine = np.random.normal(1.1, 0.3)
    elif condition == 'COPD':
        base_systolic = np.random.normal(136, 14)
        base_diastolic = np.random.normal(82, 9)
        base_hba1c = np.random.normal(6.3, 1.0)
        base_ldl = np.random.normal(108, 26)
        base_hdl = np.random.normal(45, 10)
        base_triglycerides = np.random.normal(160, 42)
        base_creatinine = np.random.normal(1.0, 0.3)
    elif condition == 'Obesity':
        base_systolic = np.random.normal(146, 13)
        base_diastolic = np.random.normal(90, 8)
        base_hba1c = np.random.normal(7.2, 1.4)
        base_ldl = np.random.normal(128, 30)
        base_hdl = np.random.normal(36, 7)
        base_triglycerides = np.random.normal(190, 50)
        base_creatinine = np.random.normal(1.2, 0.3)
    else:  # Chronic Kidney Disease
        base_systolic = np.random.normal(144, 15)
        base_diastolic = np.random.normal(86, 9)
        base_hba1c = np.random.normal(6.7, 1.2)
        base_ldl = np.random.normal(112, 27)
        base_hdl = np.random.normal(41, 9)
        base_triglycerides = np.random.normal(170, 44)
        base_creatinine = np.random.normal(2.1, 1.0)

    # Ensure values stay within reasonable bounds
    base_systolic = max(90, min(base_systolic, 200))
    base_diastolic = max(60, min(base_diastolic, 120))
    base_hba1c = max(4.5, min(base_hba1c, 14.0))
    base_ldl = max(40, min(base_ldl, 250))
    base_hdl = max(20, min(base_hdl, 100))
    base_triglycerides = max(50, min(base_triglycerides, 400))
    base_creatinine = max(0.5, min(base_creatinine, 5.0))
    baseline_bmi = max(18, min(baseline_bmi, 50))

    # Generate comorbidities (2-4 per patient)
    num_comorbidities = np.random.randint(2, 5)
    comorbidities = random.sample(comorbidities_list, num_comorbidities)

    patients.append({
        'patient_id': patient_id,
        'first_name': fake.first_name(),
        'last_name': fake.last_name(),
        'age': age,
        'gender': gender,
        'race': race,
        'condition': condition,
        'comorbidities': '|'.join(comorbidities),
        'height_cm': round(height_cm, 1),
        'baseline_weight_kg': round(base_weight, 1),
        'baseline_bmi': round(baseline_bmi, 1),
        'baseline_systolic_bp': round(base_systolic),
        'baseline_diastolic_bp': round(base_diastolic),
        'baseline_hba1c': round(base_hba1c, 1),
        'baseline_ldl': round(base_ldl),
        'baseline_hdl': round(base_hdl),
        'baseline_triglycerides': round(base_triglycerides),
        'baseline_creatinine': round(base_creatinine, 2),
        'smoking_status': random.choice(smoking_statuses),
        'alcohol_status': random.choice(alcohol_statuses),
        'activity_level': random.choice(activity_levels),
        'primary_medication': random.choice(medications[condition]),
        'socioeconomic_status': random.choice(['Low', 'Middle', 'High']),
        'insurance_type': random.choice(['Private', 'Medicare', 'Medicaid', 'Uninsured'])
    })

patients_df = pd.DataFrame(patients)
print(f"Created {len(patients_df)} patient records")

# Generate historical time-series data
historical_data = []
outcomes = []

print("Generating historical time-series data...")

for _, patient in patients_df.iterrows():
    patient_id = patient['patient_id']
    condition = patient['condition']
    base_weight = patient['baseline_weight_kg']
    base_systolic = patient['baseline_systolic_bp']
    base_diastolic = patient['baseline_diastolic_bp']
    base_hba1c = patient['baseline_hba1c']
    base_ldl = patient['baseline_ldl']
    base_hdl = patient['baseline_hdl']
    base_triglycerides = patient['baseline_triglycerides']
    base_creatinine = patient['baseline_creatinine']

    # Determine if this patient will deteriorate (25-40% will deteriorate based on condition)
    condition_risk = {
        'Diabetes': 0.35,
        'Heart Failure': 0.40,
        'Hypertension': 0.30,
        'COPD': 0.35,
        'Obesity': 0.25,
        'Chronic Kidney Disease': 0.38
    }

    will_deteriorate = np.random.random() < condition_risk[condition]

    # If patient will deteriorate, determine when (in the last 1/3 of the period)
    if will_deteriorate:
        deterioration_start = START_DATE + timedelta(days=int(DATA_DURATION * 2/3 + np.random.random() * DATA_DURATION/3))
        # Add some randomness to deterioration pattern
        deter_pattern = random.choice(['gradual', 'rapid', 'fluctuating', 'stepwise'])
    else:
        deterioration_start = None
        deter_pattern = None

    # Initialize current values
    current_date = START_DATE
    current_weight = base_weight
    current_systolic = base_systolic
    current_diastolic = base_diastolic
    current_hr = np.random.normal(75, 8)
    current_hba1c = base_hba1c
    current_ldl = base_ldl
    current_hdl = base_hdl
    current_triglycerides = base_triglycerides
    current_creatinine = base_creatinine
    current_spo2 = np.random.normal(97, 1.5)  # Blood oxygen saturation

    # Set deterioration factors if patient will deteriorate
    if will_deteriorate:
        if deter_pattern == 'gradual':
            systolic_trend = 0.12  # gradual increase
            diastolic_trend = 0.08
            hba1c_trend = 0.07
            creatinine_trend = 0.04
            weight_trend = 0.08
            spo2_trend = -0.03
        elif deter_pattern == 'rapid':
            systolic_trend = 0.25  # rapid increase
            diastolic_trend = 0.18
            hba1c_trend = 0.15
            creatinine_trend = 0.10
            weight_trend = 0.20
            spo2_trend = -0.08
        elif deter_pattern == 'fluctuating':
            systolic_trend = 0.18  # fluctuating but overall increase
            diastolic_trend = 0.12
            hba1c_trend = 0.10
            creatinine_trend = 0.06
            weight_trend = 0.15
            spo2_trend = -0.05
        else:  # stepwise
            systolic_trend = 0.22  # stepwise deterioration
            diastolic_trend = 0.15
            hba1c_trend = 0.12
            creatinine_trend = 0.08
            weight_trend = 0.18
            spo2_trend = -0.06
    else:
        # Stable or slightly improving trends
        systolic_trend = -0.04  # slight improvement
        diastolic_trend = -0.03
        hba1c_trend = -0.02
        creatinine_trend = -0.01
        weight_trend = -0.03
        spo2_trend = 0.01

    # Generate daily data
    day_count = 0
    while current_date <= END_DATE:
        # Check if we're in the deterioration period
        in_deterioration = will_deteriorate and (current_date >= deterioration_start)

        # Calculate medication adherence (worse if deteriorating)
        if in_deterioration:
            adherence = max(0.3, min(1.0, np.random.normal(0.65, 0.18)))
        else:
            adherence = max(0.5, min(1.0, np.random.normal(0.82, 0.12)))

        # Add realistic noise to vitals
        weight_noise = np.random.normal(0, 0.5)
        systolic_noise = np.random.normal(0, 5)
        diastolic_noise = np.random.normal(0, 4)
        hr_noise = np.random.normal(0, 3)
        spo2_noise = np.random.normal(0, 1)

        # Apply trends if in deterioration period
        if in_deterioration:
            weight_today = current_weight + weight_trend + weight_noise
            systolic_today = current_systolic + systolic_trend + systolic_noise
            diastolic_today = current_diastolic + diastolic_trend + diastolic_noise
            hr_today = current_hr + np.random.normal(3, 2) + hr_noise  # HR increases during deterioration
            spo2_today = current_spo2 + spo2_trend + spo2_noise
        else:
            weight_today = current_weight + weight_noise
            systolic_today = current_systolic + systolic_noise
            diastolic_today = current_diastolic + diastolic_noise
            hr_today = current_hr + hr_noise
            spo2_today = current_spo2 + spo2_noise

        # Ensure values stay within reasonable bounds
        weight_today = max(40, min(weight_today, 180))
        systolic_today = max(80, min(systolic_today, 220))
        diastolic_today = max(50, min(diastolic_today, 130))
        hr_today = max(45, min(hr_today, 140))
        spo2_today = max(85, min(spo2_today, 100))

        # Only add lab values occasionally (not every day)
        has_labs = False
        hba1c_today = None
        ldl_today = None
        hdl_today = None
        triglycerides_today = None
        creatinine_today = None
        alt_today = None  # Liver enzyme
        sodium_today = None  # Electrolyte

        # Labs are measured less frequently (approx every 30 days)
        if day_count % 30 == 0 or (in_deterioration and day_count % 14 == 0):  # more frequent labs if deteriorating
            has_labs = True
            lab_noise = np.random.normal(0, 0.2)

            if in_deterioration:
                hba1c_today = current_hba1c + hba1c_trend + lab_noise
                creatinine_today = current_creatinine + creatinine_trend + lab_noise * 0.1

                # Lipids and other markers might also worsen during deterioration
                ldl_today = current_ldl + np.random.normal(8, 4)
                hdl_today = current_hdl + np.random.normal(-3, 1.5)
                triglycerides_today = current_triglycerides + np.random.normal(15, 6)
                alt_today = np.random.normal(45, 15)  # Elevated liver enzymes during deterioration
                sodium_today = np.random.normal(135, 3)  # Electrolyte imbalance
            else:
                hba1c_today = current_hba1c + lab_noise * 0.5
                creatinine_today = current_creatinine + lab_noise * 0.05
                ldl_today = current_ldl + np.random.normal(0, 4)
                hdl_today = current_hdl + np.random.normal(0, 2)
                triglycerides_today = current_triglycerides + np.random.normal(0, 6)
                alt_today = np.random.normal(25, 8)  # Normal liver enzymes
                sodium_today = np.random.normal(140, 2)  # Normal electrolyte levels

            # Ensure lab values stay within reasonable bounds
            hba1c_today = max(4.0, min(hba1c_today, 15.0)) if hba1c_today else None
            creatinine_today = max(0.5, min(creatinine_today, 5.0)) if creatinine_today else None
            ldl_today = max(40, min(ldl_today, 250)) if ldl_today else None
            hdl_today = max(20, min(hdl_today, 100)) if hdl_today else None
            triglycerides_today = max(50, min(triglycerides_today, 400)) if triglycerides_today else None
            alt_today = max(10, min(alt_today, 150)) if alt_today else None
            sodium_today = max(125, min(sodium_today, 150)) if sodium_today else None

            # Update current lab values for trend continuation
            if hba1c_today:
                current_hba1c = hba1c_today
            if creatinine_today:
                current_creatinine = creatinine_today
            if ldl_today:
                current_ldl = ldl_today
            if hdl_today:
                current_hdl = hdl_today
            if triglycerides_today:
                current_triglycerides = triglycerides_today

        # Add lifestyle factors
        if day_count % 7 == 0:  # Weekly lifestyle updates
            if in_deterioration:
                steps = np.random.normal(3500, 1500)  # Reduced activity during deterioration
                sleep_hours = np.random.normal(6.5, 1.5)  # Poor sleep during deterioration
                stress_level = np.random.randint(6, 10)  # High stress (1-10 scale)
            else:
                steps = np.random.normal(6500, 2000)  # Normal activity
                sleep_hours = np.random.normal(7.5, 1.0)  # Normal sleep
                stress_level = np.random.randint(3, 7)  # Moderate stress
        else:
            steps = None
            sleep_hours = None
            stress_level = None

        # Add the daily record
        historical_data.append({
            'patient_id': patient_id,
            'date': current_date,
            'weight_kg': round(weight_today, 1),
            'systolic_bp': int(systolic_today),
            'diastolic_bp': int(diastolic_today),
            'heart_rate': int(hr_today),
            'spo2': round(spo2_today, 1),
            'hba1c': round(hba1c_today, 1) if hba1c_today else None,
            'cholesterol_ldl': int(ldl_today) if ldl_today else None,
            'cholesterol_hdl': int(hdl_today) if hdl_today else None,
            'triglycerides': int(triglycerides_today) if triglycerides_today else None,
            'serum_creatinine': round(creatinine_today, 2) if creatinine_today else None,
            'alt': round(alt_today, 1) if alt_today else None,
            'sodium': round(sodium_today, 1) if sodium_today else None,
            'medication_adherence': round(adherence, 2),
            'steps': int(steps) if steps else None,
            'sleep_hours': round(sleep_hours, 1) if sleep_hours else None,
            'stress_level': stress_level
        })

        # Update current values for next day
        current_weight = weight_today
        current_systolic = systolic_today
        current_diastolic = diastolic_today
        current_hr = hr_today
        current_spo2 = spo2_today

        # Move to next day
        current_date += timedelta(days=1)
        day_count += 1

    # Add outcome for this patient
    outcomes.append({
        'patient_id': patient_id,
        'index_date': END_DATE - timedelta(days=90),  # Predict from this date forward
        'deterioration_date': deterioration_start if will_deteriorate else None,
        'deterioration_type': deter_pattern if will_deteriorate else 'none',
        'deterioration_label': 1 if will_deteriorate else 0
    })

# Create DataFrames
historical_df = pd.DataFrame(historical_data)
outcomes_df = pd.DataFrame(outcomes)

# Forward-fill lab values to simulate real-world data
lab_columns = ['hba1c', 'cholesterol_ldl', 'cholesterol_hdl', 'triglycerides',
               'serum_creatinine', 'alt', 'sodium']
for col in lab_columns:
    historical_df[col] = historical_df.groupby('patient_id')[col].ffill()

# Forward-fill lifestyle data weekly
lifestyle_columns = ['steps', 'sleep_hours', 'stress_level']
for col in lifestyle_columns:
    historical_df[col] = historical_df.groupby('patient_id')[col].ffill(limit=7)

# Save to CSV files
patients_df.to_csv('patients.csv', index=False)
historical_df.to_csv('historical_data.csv', index=False)
outcomes_df.to_csv('outcomes.csv', index=False)

print("Data generation complete!")
print(f"Patients: {len(patients_df)}")
print(f"Historical records: {len(historical_df)}")
print(f"Patients with deterioration: {outcomes_df['deterioration_label'].sum()}")
print("\nFiles saved:")
print("- patients.csv")
print("- historical_data.csv")
print("- outcomes.csv")
print("\nFeature overview:")
print("1. Demographics: age, gender, race, socioeconomic status, insurance")
print("2. Clinical measurements: vitals, labs, med adherence")
print("3. Lifestyle data: steps, sleep, stress levels")
print("4. Medical history: conditions, comorbidities")
print("5. Temporal patterns with realistic deterioration trajectories")

Generating PERFECT synthetic patient data for hackathon...
Created 5000 patient records
Generating historical time-series data...
Data generation complete!
Patients: 5000
Historical records: 7305000
Patients with deterioration: 1685

Files saved:
- patients.csv
- historical_data.csv
- outcomes.csv

Feature overview:
1. Demographics: age, gender, race, socioeconomic status, insurance
2. Clinical measurements: vitals, labs, med adherence
3. Lifestyle data: steps, sleep, stress levels
4. Medical history: conditions, comorbidities
5. Temporal patterns with realistic deterioration trajectories
