In [1]:
import pandas as pd
import numpy as np

# Number of participants
num_participants = 300

# Bounds handed to np.random.randint for patient IDs (upper bound is exclusive),
# so the pool of distinct identifiers has _PATIENT_ID_HIGH - _PATIENT_ID_LOW values.
_PATIENT_ID_LOW = 10000
_PATIENT_ID_HIGH = 99999


def _draw_raw_columns(n):
    """Draw every column independently from the global NumPy random state.

    The columns are drawn in a fixed order so that, for a given seed, the
    random stream (and therefore every column that is not corrected
    afterwards) is reproducible.

    Args:
        n: Number of rows to draw.

    Returns:
        A DataFrame with one row per participant and one column per variable.
    """
    def pick(options):
        # Uniform draw with replacement from a fixed set of options.
        return np.random.choice(options, n)

    def normal(mean, sd):
        # Normally distributed measurement rounded to two decimals.
        return np.random.normal(mean, sd, n).round(2)

    yes_no = ['yes', 'no']
    pos_neg = ['positive', 'negative']

    return pd.DataFrame({
        'date': pd.to_datetime(pick(pd.date_range('2020-01-01', '2023-01-01'))),
        'time': pd.to_datetime(np.random.randint(0, 86400, n), unit='s').time,
        'dob': pd.to_datetime(pick(pd.date_range('1940-01-01', '2010-01-01'))),
        'country': pick(['ZAF', 'USA', 'IND', 'BRA', 'CHN']),
        'patient_id': np.random.randint(_PATIENT_ID_LOW, _PATIENT_ID_HIGH, n).astype(str),
        'age_enrolment': np.random.normal(40, 15, n).astype(int),
        'race': pick(['White', 'Black African', 'Asian', 'Mixed', 'Other']),
        'sex': pick(['Male', 'Female']),
        'height': normal(165, 10),
        'weight': normal(70, 15),
        # Placeholder draw: keeps the random stream aligned; the value is
        # recomputed from height and weight in _enforce_plausibility.
        'bmi': normal(25, 5),
        'study_location': pick(['Johannesburg', 'Cape Town', 'Pretoria', 'Durban']),
        'housing_type': pick(['Formal', 'Informal', 'Rural', 'Urban']),
        'num_people_household': np.random.randint(1, 10, n),
        'aircon_access': pick(yes_no),
        'income': normal(2000, 500),
        'house_income': normal(3000, 800),
        'smoking_status': pick(yes_no),
        'alcohol_status': pick(yes_no),
        'employment': pick(yes_no),
        'education_years': np.random.normal(12, 4, n).astype(int),
        'loss_to_follow_up': pick(yes_no),
        'hypertension': pick(yes_no),
        'dm': pick(yes_no),
        'birth_weight': normal(3000, 500),
        'hospital_admission': pick(yes_no),
        'gastroenteritis': pick(yes_no),
        'pneumonia': pick(yes_no),
        'urinary_tract_infection': pick(yes_no),
        'syphilis_status': pick(pos_neg),
        'hiv_status': pick(pos_neg),
        'HepB_status': pick(pos_neg),
        'schistosomiasis_status': pick(pos_neg),
        'tb_status': pick(pos_neg),
        'haemoglobin': normal(13.5, 2),
        'creat': normal(80, 20),
        'creat_clearance': normal(90, 30),
        'hiv_vl': normal(1000, 500),
        'cd4': normal(500, 200),
        'plt': normal(250, 75),
        'ast': normal(35, 10),
        'alt': normal(40, 12),
        'urine_pcr': normal(1, 0.3),
        'alp': normal(75, 25),
        'ggt': normal(30, 10),
        'mcv': normal(85, 5),
        'ferritin': normal(200, 50),
        'mchc': normal(34, 2),
        'hiv_resistance_mutations': pick(['M184V', 'K103N', 'None']),
        'hiv_drug_resistance': pick(pos_neg),
        # NOTE(review): a regimen coded as positive/negative looks like a
        # copy-paste placeholder; kept as-is because the intended categories
        # are not visible here.
        'art_regimen': pick(pos_neg),
        'art_adherance': pick(pos_neg),
        'xray_findings': pick(['Opacity left upper zone', 'Normal']),
        'tsh': normal(2.5, 1.2),
        'lrti': pick(pos_neg),
        'urti': pick(pos_neg),
        'direct_bili': normal(1, 0.5),
        'indirect_bili': normal(0.7, 0.3),
        'amylase': normal(90, 30),
        'lipase': normal(60, 20),
        'cholesterol': normal(5.2, 1),
        'mch': normal(30, 5),
        'ldlc': normal(3, 1),
        'hdlc': normal(1.5, 0.5),
        'hba1c': normal(5.5, 1),
        'albumin': normal(45, 5),
        'cortisol': normal(350, 100),
        'pth': normal(5, 2),
        'ft4': normal(15, 5),
        'ft3': normal(5, 2),
        'urea': normal(6, 1.5),
        'calcium': normal(2.2, 0.2),
        'dexa_scan': normal(0, 1),
        'uric_acid': normal(300, 100),
        'potassium': normal(4, 0.5),
        'sodium': normal(140, 3),
        'covid_19_pcr': pick(pos_neg),
        'crp': normal(5, 3),
        'pct': normal(0.2, 0.1),
        'il_6': normal(2.5, 1),
        'inr': normal(1, 0.2),
        'd_dimers': normal(0.5, 0.2),
        'depression': pick(pos_neg),
        'gen_anxiety_disorder': pick(pos_neg),
        'adverse_event': pick(pos_neg),
        'heart_rate': normal(70, 10),
        'systolic_blood_pressure': normal(120, 15),
        'diastolic_blood_pressure': normal(80, 10),
        # Placeholder draw: keeps the random stream aligned; the value is
        # recomputed from the two pressures in _enforce_plausibility.
        'mean_arterial_pressure': normal(93, 10),
        'oxygen_saturation': normal(98, 2),
    })


def _enforce_plausibility(frame):
    """Remove impossible values and make derived columns agree with their inputs.

    Args:
        frame: DataFrame produced by ``_draw_raw_columns``; modified in place.

    Returns:
        The same DataFrame, for chaining.
    """
    # Unbounded normal draws occasionally go below zero. Every numeric column
    # except dexa_scan (drawn around 0, so negatives are intended) is a
    # quantity that cannot be negative.
    for column in frame.select_dtypes(include='number').columns:
        if column != 'dexa_scan':
            frame[column] = frame[column].clip(lower=0)

    # A saturation percentage cannot exceed 100; N(98, 2) frequently does.
    frame['oxygen_saturation'] = frame['oxygen_saturation'].clip(upper=100)

    # Independent draws sometimes put systolic below diastolic; order each pair.
    pressures = frame[['systolic_blood_pressure', 'diastolic_blood_pressure']]
    systolic = pressures.max(axis=1)
    diastolic = pressures.min(axis=1)
    frame['systolic_blood_pressure'] = systolic
    frame['diastolic_blood_pressure'] = diastolic

    # Mean arterial pressure = diastolic + one third of the pulse pressure.
    frame['mean_arterial_pressure'] = (diastolic + (systolic - diastolic) / 3).round(2)

    # BMI = weight / height^2. Assumes height is in cm and weight in kg, as the
    # distribution means (165 and 70) suggest -- TODO confirm units.
    height_m = frame['height'] / 100
    frame['bmi'] = (frame['weight'] / height_m ** 2).round(2)
    return frame


def _make_patient_ids_unique(frame):
    """Replace repeated patient IDs with unused ones from the same range.

    IDs are first drawn with replacement, so collisions are possible. Only the
    second and later occurrences are replaced, leaving all other IDs untouched.

    Args:
        frame: DataFrame with a string ``patient_id`` column; modified in place.

    Returns:
        The same DataFrame, for chaining.
    """
    ids = frame['patient_id']
    repeats = ids.duplicated()
    if repeats.any():
        pool = np.arange(_PATIENT_ID_LOW, _PATIENT_ID_HIGH)
        unused = np.setdiff1d(pool, ids.astype(int).to_numpy())
        fresh = np.random.choice(unused, int(repeats.sum()), replace=False)
        frame.loc[repeats, 'patient_id'] = [str(value) for value in fresh]
    return frame


def generate_synthetic_data(num_participants=300, seed=42):
    """Build a reproducible synthetic participant dataset.

    Args:
        num_participants: Number of rows to generate.
        seed: Value passed to ``np.random.seed`` for reproducibility. Note that
            this reseeds NumPy's global random state.

    Returns:
        A DataFrame with one row per participant in which patient IDs are
        unique, non-negative quantities are non-negative, oxygen saturation is
        at most 100, systolic pressure is not below diastolic, and ``bmi`` and
        ``mean_arterial_pressure`` are computed from their source columns.

    Raises:
        ValueError: If more participants are requested than there are distinct
            patient IDs available.
    """
    id_pool_size = _PATIENT_ID_HIGH - _PATIENT_ID_LOW
    if num_participants > id_pool_size:
        raise ValueError(
            f"num_participants must not exceed {id_pool_size} "
            "to keep patient IDs unique"
        )

    # Set random seed for reproducibility
    np.random.seed(seed)

    frame = _draw_raw_columns(num_participants)
    frame = _enforce_plausibility(frame)
    return _make_patient_ids_unique(frame)


# Generate synthetic data
synthetic_data = generate_synthetic_data(num_participants)

# Only write the file when executed directly (a script run or a notebook cell,
# where __name__ is '__main__'), so importing this code has no file side effect.
if __name__ == '__main__':
    # Save the dataset to an Excel file
    synthetic_data.to_excel('synthetic_dataset.xlsx', index=False)

    print("Synthetic dataset generated and saved as 'synthetic_dataset.xlsx'.")


Synthetic dataset generated and saved as 'synthetic_dataset.xlsx'.
