In [2]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# Configuration
np.random.seed(42)  # Seed NumPy's global RNG; every random draw below comes from this stream
n_records = 1_000_000  # Number of synthetic patient records to generate
base_year = 2020  # First calendar year of the admission-date range (range ends base_year + 3)

# Medical knowledge base: 50 diagnoses, each mapped to its category and a
# severity score (2-4).  Entries are listed per category as (name, severity)
# pairs and expanded into {'category': ..., 'severity': ...} records; the
# listing order is kept, so the dict iterates in exactly this order.
diagnoses = {
    name: {'category': category, 'severity': severity}
    for category, entries in (
        ('cardiovascular', (  # 8
            ('Hypertension', 2),
            ('Coronary Artery Disease', 3),
            ('Heart Failure', 4),
            ('Arrhythmia', 3),
            ('Peripheral Artery Disease', 3),
            ('Myocardial Infarction', 4),
            ('Cardiomyopathy', 4),
            ('Atherosclerosis', 3),
        )),
        ('respiratory', (  # 10
            ('Asthma', 2),
            ('COPD', 3),
            ('Pneumonia', 3),
            ('COVID-19', 3),
            ('Tuberculosis', 4),
            ('Lung Cancer', 4),
            ('Pulmonary Embolism', 4),
            ('Bronchitis', 2),
            ('Pulmonary Fibrosis', 3),
            ('Sleep Apnea', 2),
        )),
        ('endocrine', (  # 8
            ('Diabetes Type 1', 3),
            ('Diabetes Type 2', 3),
            ('Hypothyroidism', 2),
            ('Hyperthyroidism', 3),
            ('Cushing Syndrome', 3),
            ('Addison Disease', 3),
            ('Osteoporosis', 2),
            ('Metabolic Syndrome', 2),
        )),
        ('gastrointestinal', (  # 8
            ('Gastritis', 2),
            ('GERD', 2),
            ('IBD', 3),
            ('Cirrhosis', 4),
            ('Pancreatitis', 3),
            ('Hepatitis', 3),
            ('Colorectal Cancer', 4),
            ('Gallstones', 2),
        )),
        ('neurological', (  # 8
            ('Migraine', 2),
            ('Epilepsy', 3),
            ('Stroke', 4),
            ('Alzheimer', 4),
            ('Parkinson', 4),
            ('Multiple Sclerosis', 3),
            ('Neuropathy', 3),
            ('Brain Tumor', 4),
        )),
        # Additional categories (8 diagnoses in total)
        ('renal', (
            ('Chronic Kidney Disease', 3),
        )),
        ('musculoskeletal', (
            ('Rheumatoid Arthritis', 3),
            ('Osteoarthritis', 2),
        )),
        ('infectious', (
            ('Sepsis', 4),
            ('HIV/AIDS', 4),
            ('Malaria', 3),
        )),
        ('hematological', (
            ('Anemia', 2),
            ('Leukemia', 4),
        )),
    )
    for name, severity in entries
}

# Symptom vocabulary, keyed by diagnosis category.  The extra 'common' key
# holds non-specific symptoms that can appear across categories.
symptom_map = dict(
    cardiovascular=[
        'chest pain', 'palpitations', 'shortness of breath', 'fatigue',
        'dizziness', 'leg swelling', 'syncope',
    ],
    respiratory=[
        'cough', 'dyspnea', 'chest tightness', 'wheezing',
        'sputum production', 'hemoptysis', 'nasal congestion',
    ],
    endocrine=[
        'fatigue', 'polyuria', 'polydipsia', 'weight changes',
        'heat intolerance', 'cold intolerance', 'skin changes',
    ],
    gastrointestinal=[
        'abdominal pain', 'nausea', 'vomiting', 'diarrhea',
        'constipation', 'bloating', 'rectal bleeding',
    ],
    neurological=[
        'headache', 'dizziness', 'numbness', 'tingling',
        'muscle weakness', 'seizures', 'vision changes',
    ],
    renal=[
        'edema', 'urinary changes', 'foamy urine',
        'flank pain', 'frequency', 'nocturia',
    ],
    musculoskeletal=[
        'joint pain', 'muscle pain', 'stiffness',
        'reduced mobility', 'swelling', 'redness',
    ],
    infectious=[
        'fever', 'chills', 'sweating', 'malaise',
        'lymphadenopathy', 'rash', 'weight loss',
    ],
    hematological=[
        'pallor', 'easy bruising', 'bleeding',
        'petechiae', 'bone pain', 'fatigue',
    ],
    common=[
        'fever', 'fatigue', 'weight loss', 'loss of appetite',
        'sweating', 'malaise', 'weakness',
    ],
)

# Helper Function: Generate Symptoms
def generate_symptoms(category, severity):
    """Generate a comma-separated symptom string for one patient.

    Args:
        category: Key into the module-level ``symptom_map`` naming the
            diagnosis category the base symptoms are drawn from.
        severity: Numeric severity of the diagnosis; a value of 3 or more
            adds a 'severe ...' variant of one already-chosen symptom.

    Returns:
        str: The distinct symptoms joined with ', ', in the order they were
        generated.

    Raises:
        KeyError: If ``category`` is not a key of ``symptom_map``.
    """
    # 1-3 distinct base symptoms from the diagnosis category
    n_base = np.random.randint(1, 4)
    symptoms = np.random.choice(
        symptom_map[category],
        size=n_base,
        replace=False
    ).tolist()

    # 30% chance of one extra non-specific symptom
    if np.random.rand() < 0.3:
        symptoms.extend(np.random.choice(symptom_map['common'], 1).tolist())

    # Higher-severity diagnoses get a 'severe' variant of a chosen symptom
    if severity >= 3:
        symptoms.append('severe ' + str(np.random.choice(symptoms)))

    # 5% chance of a noise symptom drawn from any key of symptom_map
    # (this can be the patient's own category or 'common')
    if np.random.rand() < 0.05:
        other_cat = np.random.choice(list(symptom_map.keys()))
        symptoms.append(str(np.random.choice(symptom_map[other_cat])))

    # De-duplicate while keeping generation order.  A set() would order the
    # strings by hash, which changes between interpreter runs and would make
    # the output non-reproducible even with a fixed NumPy seed.
    return ', '.join(dict.fromkeys(symptoms))

# Helper Function: Generate Lab Values
def generate_lab_values(diagnosis, category, severity):
    """Generate synthetic lab values with disease-specific adjustments.

    Args:
        diagnosis: pandas Series of diagnosis names, one entry per patient.
        category: Unused; kept so existing call sites keep working.
        severity: Unused; kept so existing call sites keep working.

    Returns:
        pandas.DataFrame with columns 'glucose', 'systolic_bp',
        'diastolic_bp' and 'wbc', rounded to one decimal and carrying the
        same index as ``diagnosis`` so it can be concatenated column-wise
        onto the caller's frame without misalignment.
    """
    n = len(diagnosis)

    # Baseline values, clipped to fixed bounds.  The draw order
    # (glucose, systolic, diastolic, wbc) determines the random stream.
    labs = pd.DataFrame(
        {
            'glucose': np.clip(np.random.normal(100, 15, n), 70, 300),
            'systolic_bp': np.clip(np.random.normal(120, 15, n), 90, 200),
            'diastolic_bp': np.clip(np.random.normal(80, 10, n), 60, 120),
            'wbc': np.clip(np.random.lognormal(2, 0.3, n), 3, 20),
        },
        index=diagnosis.index,
    )

    # Row masks for the disease-specific adjustments.  na=False keeps rows
    # with a missing diagnosis out of the diabetes group (NaN would count as
    # truthy in np.where); regex=False matches the literal substring.
    is_diabetes = diagnosis.str.contains('Diabetes', regex=False, na=False)
    is_hypertension = diagnosis == 'Hypertension'
    is_infection = diagnosis.isin(['Pneumonia', 'COVID-19', 'Sepsis'])

    # Adjustments are applied after clipping, so adjusted values may exceed
    # the baseline clip bounds.
    labs['glucose'] = np.where(is_diabetes,
                               labs['glucose'] * 1.5,
                               labs['glucose'])
    labs['systolic_bp'] = np.where(is_hypertension,
                                   labs['systolic_bp'] + 20,
                                   labs['systolic_bp'])
    labs['wbc'] = np.where(is_infection,
                           labs['wbc'] * 1.8,
                           labs['wbc'])

    return labs.round(1)

# Generate Base Data
# One row per synthetic patient.  Every column is drawn from NumPy's global
# RNG, so the order of the statements in this section fixes the random stream.
df = pd.DataFrame({
    'patient_id': [f'P{str(x).zfill(8)}' for x in range(n_records)],  # 'P' + zero-padded 8-digit counter
    'age': np.clip(np.random.normal(50, 15, n_records), 18, 100).astype(int),  # normal(50, 15) clipped to 18-100, then truncated
    'gender': np.random.choice(['M', 'F'], n_records, p=[0.51, 0.49]),
    'region': np.random.choice(['North', 'South', 'East', 'West'], n_records),  # uniform over the four regions
    'bmi': np.clip(np.random.normal(25, 5, n_records), 15, 45)
})

# Assign Diagnoses
# Each diagnosis is sampled with probability proportional to severity squared.
diag_list = list(diagnoses.keys())
weights = [d['severity']**2 for d in diagnoses.values()]  # Weight by severity^2
df['diagnosis'] = np.random.choice(diag_list, n_records, p=np.array(weights)/sum(weights))

# Add Disease Metadata
# Join the per-diagnosis 'category' and 'severity' columns onto every patient row.
meta = pd.DataFrame.from_dict(diagnoses, orient='index').reset_index()
df = df.merge(meta, left_on='diagnosis', right_on='index').drop(columns='index')

# Generate Symptoms
# Row-wise apply: generate_symptoms is called once per patient row.
df['symptoms'] = df.apply(
    lambda row: generate_symptoms(row['category'], row['severity']),
    axis=1
)

# Generate Lab Values
labs = generate_lab_values(df['diagnosis'], df['category'], df['severity'])
df = pd.concat([df, labs], axis=1)  # column-wise concat; rows are matched on index

# Temporal Data
# Admission dates are sampled with replacement from the date range
# base_year-01-01 .. (base_year + 3)-12-31.
admit_dates = pd.to_datetime(np.random.choice(
    pd.date_range(f'{base_year}-01-01', f'{base_year+3}-12-31'), n_records
))
df['admit_date'] = admit_dates
# Length of stay: Poisson draw with rate 2 * severity + normal(3, 1) noise, clipped to 1-30 days.
df['los'] = np.clip(np.random.poisson(df['severity'] * 2 + np.random.normal(3, 1, n_records)), 1, 30)
df['discharge_date'] = df['admit_date'] + pd.to_timedelta(df['los'], unit='D')

# Medications (Example for 3 categories)
# A full-length candidate array is drawn for each of the three categories;
# np.select keeps, per row, the candidate whose condition matches and uses
# 'Other' for rows in every remaining category.
df['medication'] = np.select(
    [
        df['category'] == 'cardiovascular',
        df['category'] == 'respiratory',
        df['category'] == 'endocrine'
    ],
    [
        np.random.choice(['Lisinopril', 'Amlodipine', 'Metoprolol'], n_records),
        np.random.choice(['Albuterol', 'Prednisone', 'Montelukast'], n_records),
        np.random.choice(['Metformin', 'Insulin', 'Levothyroxine'], n_records)
    ],
    default='Other'
)

# Define a function to calculate outcome probabilities based on severity
def calculate_outcome_probs(severity):
    """Return outcome probabilities that shift toward worse outcomes with severity.

    The five entries are ordered from best to worst outcome, matching the
    label order used where the outcomes are drawn
    ('Recovered', 'Stable', 'Deteriorated', 'Critical', 'Deceased').

    Args:
        severity: Numeric severity of the diagnosis (0 leaves the base
            distribution unchanged).

    Returns:
        numpy.ndarray of 5 floats summing to 1.
    """
    base_probs = np.array([0.5, 0.3, 0.15, 0.04, 0.01])  # Base probabilities
    severity_factor = severity / 4  # Adjust based on severity

    # Multiplying every entry by the same scalar cancels out on
    # normalization, which made the result independent of severity.  Apply
    # the factor once per step of outcome rank instead, so each step toward
    # a worse outcome is up-weighted by (1 + severity_factor).
    outcome_rank = np.arange(base_probs.size)
    adjusted_probs = base_probs * (1 + severity_factor) ** outcome_rank
    adjusted_probs /= adjusted_probs.sum()  # Normalize to sum to 1
    return adjusted_probs

# Apply the function to generate outcomes
# One np.random.choice call per row: each patient's outcome label is drawn
# with the probabilities calculate_outcome_probs returns for that severity.
df['outcome'] = df['severity'].apply(
    lambda s: np.random.choice(
        ['Recovered', 'Stable', 'Deteriorated', 'Critical', 'Deceased'],
        p=calculate_outcome_probs(s)
    )
)

# Save in Parquet format for efficiency
df.to_parquet('million_patients.parquet', engine='pyarrow')
print(f"Generated {len(df):,} records with {len(diagnoses)} diagnoses")

Generated 1,000,000 records with 50 diagnoses


In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# Configuration
np.random.seed(42)  # Seed NumPy's global RNG; every random draw below comes from this stream
n_records = 1_000_000  # Number of synthetic patient records to generate
base_year = 2020  # First calendar year of the admission-date range (range ends base_year + 3)

# Medical knowledge base: 50 diagnoses, each mapped to its category and a
# severity score (2-4).  Entries are listed per category as (name, severity)
# pairs and expanded into {'category': ..., 'severity': ...} records; the
# listing order is kept, so the dict iterates in exactly this order.
diagnoses = {
    name: {'category': category, 'severity': severity}
    for category, entries in (
        ('cardiovascular', (  # 8
            ('Hypertension', 2),
            ('Coronary Artery Disease', 3),
            ('Heart Failure', 4),
            ('Arrhythmia', 3),
            ('Peripheral Artery Disease', 3),
            ('Myocardial Infarction', 4),
            ('Cardiomyopathy', 4),
            ('Atherosclerosis', 3),
        )),
        ('respiratory', (  # 10
            ('Asthma', 2),
            ('COPD', 3),
            ('Pneumonia', 3),
            ('COVID-19', 3),
            ('Tuberculosis', 4),
            ('Lung Cancer', 4),
            ('Pulmonary Embolism', 4),
            ('Bronchitis', 2),
            ('Pulmonary Fibrosis', 3),
            ('Sleep Apnea', 2),
        )),
        ('endocrine', (  # 8
            ('Diabetes Type 1', 3),
            ('Diabetes Type 2', 3),
            ('Hypothyroidism', 2),
            ('Hyperthyroidism', 3),
            ('Cushing Syndrome', 3),
            ('Addison Disease', 3),
            ('Osteoporosis', 2),
            ('Metabolic Syndrome', 2),
        )),
        ('gastrointestinal', (  # 8
            ('Gastritis', 2),
            ('GERD', 2),
            ('IBD', 3),
            ('Cirrhosis', 4),
            ('Pancreatitis', 3),
            ('Hepatitis', 3),
            ('Colorectal Cancer', 4),
            ('Gallstones', 2),
        )),
        ('neurological', (  # 8
            ('Migraine', 2),
            ('Epilepsy', 3),
            ('Stroke', 4),
            ('Alzheimer', 4),
            ('Parkinson', 4),
            ('Multiple Sclerosis', 3),
            ('Neuropathy', 3),
            ('Brain Tumor', 4),
        )),
        # Additional categories (8 diagnoses in total)
        ('renal', (
            ('Chronic Kidney Disease', 3),
        )),
        ('musculoskeletal', (
            ('Rheumatoid Arthritis', 3),
            ('Osteoarthritis', 2),
        )),
        ('infectious', (
            ('Sepsis', 4),
            ('HIV/AIDS', 4),
            ('Malaria', 3),
        )),
        ('hematological', (
            ('Anemia', 2),
            ('Leukemia', 4),
        )),
    )
    for name, severity in entries
}

# Symptom vocabulary, keyed by diagnosis category.  The extra 'common' key
# holds non-specific symptoms that can appear across categories.
symptom_map = dict(
    cardiovascular=[
        'chest pain', 'palpitations', 'shortness of breath', 'fatigue',
        'dizziness', 'leg swelling', 'syncope',
    ],
    respiratory=[
        'cough', 'dyspnea', 'chest tightness', 'wheezing',
        'sputum production', 'hemoptysis', 'nasal congestion',
    ],
    endocrine=[
        'fatigue', 'polyuria', 'polydipsia', 'weight changes',
        'heat intolerance', 'cold intolerance', 'skin changes',
    ],
    gastrointestinal=[
        'abdominal pain', 'nausea', 'vomiting', 'diarrhea',
        'constipation', 'bloating', 'rectal bleeding',
    ],
    neurological=[
        'headache', 'dizziness', 'numbness', 'tingling',
        'muscle weakness', 'seizures', 'vision changes',
    ],
    renal=[
        'edema', 'urinary changes', 'foamy urine',
        'flank pain', 'frequency', 'nocturia',
    ],
    musculoskeletal=[
        'joint pain', 'muscle pain', 'stiffness',
        'reduced mobility', 'swelling', 'redness',
    ],
    infectious=[
        'fever', 'chills', 'sweating', 'malaise',
        'lymphadenopathy', 'rash', 'weight loss',
    ],
    hematological=[
        'pallor', 'easy bruising', 'bleeding',
        'petechiae', 'bone pain', 'fatigue',
    ],
    common=[
        'fever', 'fatigue', 'weight loss', 'loss of appetite',
        'sweating', 'malaise', 'weakness',
    ],
)

# Helper Function: Generate Symptoms
def generate_symptoms(category, severity):
    """Generate a comma-separated symptom string for one patient.

    Args:
        category: Key into the module-level ``symptom_map`` naming the
            diagnosis category the base symptoms are drawn from.
        severity: Numeric severity of the diagnosis; a value of 3 or more
            adds a 'severe ...' variant of one already-chosen symptom.

    Returns:
        str: The distinct symptoms joined with ', ', in the order they were
        generated.

    Raises:
        KeyError: If ``category`` is not a key of ``symptom_map``.
    """
    # 1-3 distinct base symptoms from the diagnosis category
    n_base = np.random.randint(1, 4)
    symptoms = np.random.choice(
        symptom_map[category],
        size=n_base,
        replace=False
    ).tolist()

    # 30% chance of one extra non-specific symptom
    if np.random.rand() < 0.3:
        symptoms.extend(np.random.choice(symptom_map['common'], 1).tolist())

    # Higher-severity diagnoses get a 'severe' variant of a chosen symptom
    if severity >= 3:
        symptoms.append('severe ' + str(np.random.choice(symptoms)))

    # 5% chance of a noise symptom drawn from any key of symptom_map
    # (this can be the patient's own category or 'common')
    if np.random.rand() < 0.05:
        other_cat = np.random.choice(list(symptom_map.keys()))
        symptoms.append(str(np.random.choice(symptom_map[other_cat])))

    # De-duplicate while keeping generation order.  A set() would order the
    # strings by hash, which changes between interpreter runs and would make
    # the output non-reproducible even with a fixed NumPy seed.
    return ', '.join(dict.fromkeys(symptoms))

# Helper Function: Generate Lab Values
def generate_lab_values(diagnosis, category, severity):
    """Generate synthetic lab values with disease-specific adjustments.

    Args:
        diagnosis: pandas Series of diagnosis names, one entry per patient.
        category: Unused; kept so existing call sites keep working.
        severity: Unused; kept so existing call sites keep working.

    Returns:
        pandas.DataFrame with columns 'glucose', 'systolic_bp',
        'diastolic_bp' and 'wbc', rounded to one decimal and carrying the
        same index as ``diagnosis`` so it can be concatenated column-wise
        onto the caller's frame without misalignment.
    """
    n = len(diagnosis)

    # Baseline values, clipped to fixed bounds.  The draw order
    # (glucose, systolic, diastolic, wbc) determines the random stream.
    labs = pd.DataFrame(
        {
            'glucose': np.clip(np.random.normal(100, 15, n), 70, 300),
            'systolic_bp': np.clip(np.random.normal(120, 15, n), 90, 200),
            'diastolic_bp': np.clip(np.random.normal(80, 10, n), 60, 120),
            'wbc': np.clip(np.random.lognormal(2, 0.3, n), 3, 20),
        },
        index=diagnosis.index,
    )

    # Row masks for the disease-specific adjustments.  na=False keeps rows
    # with a missing diagnosis out of the diabetes group (NaN would count as
    # truthy in np.where); regex=False matches the literal substring.
    is_diabetes = diagnosis.str.contains('Diabetes', regex=False, na=False)
    is_hypertension = diagnosis == 'Hypertension'
    is_infection = diagnosis.isin(['Pneumonia', 'COVID-19', 'Sepsis'])

    # Adjustments are applied after clipping, so adjusted values may exceed
    # the baseline clip bounds.
    labs['glucose'] = np.where(is_diabetes,
                               labs['glucose'] * 1.5,
                               labs['glucose'])
    labs['systolic_bp'] = np.where(is_hypertension,
                                   labs['systolic_bp'] + 20,
                                   labs['systolic_bp'])
    labs['wbc'] = np.where(is_infection,
                           labs['wbc'] * 1.8,
                           labs['wbc'])

    return labs.round(1)

# Generate Base Data
# One row per synthetic patient.  Every column is drawn from NumPy's global
# RNG, so the order of the statements in this section fixes the random stream.
df = pd.DataFrame({
    'patient_id': [f'P{str(x).zfill(8)}' for x in range(n_records)],  # 'P' + zero-padded 8-digit counter
    'age': np.clip(np.random.normal(50, 15, n_records), 18, 100).astype(int),  # normal(50, 15) clipped to 18-100, then truncated
    'gender': np.random.choice(['M', 'F'], n_records, p=[0.51, 0.49]),
    'region': np.random.choice(['North', 'South', 'East', 'West'], n_records),  # uniform over the four regions
    'bmi': np.clip(np.random.normal(25, 5, n_records), 15, 45)
})

# Assign Diagnoses
# Each diagnosis is sampled with probability proportional to severity squared.
diag_list = list(diagnoses.keys())
weights = [d['severity']**2 for d in diagnoses.values()]  # Weight by severity^2
df['diagnosis'] = np.random.choice(diag_list, n_records, p=np.array(weights)/sum(weights))

# Add Disease Metadata
# Join the per-diagnosis 'category' and 'severity' columns onto every patient row.
meta = pd.DataFrame.from_dict(diagnoses, orient='index').reset_index()
df = df.merge(meta, left_on='diagnosis', right_on='index').drop(columns='index')

# Generate Symptoms
# Row-wise apply: generate_symptoms is called once per patient row.
df['symptoms'] = df.apply(
    lambda row: generate_symptoms(row['category'], row['severity']),
    axis=1
)

# Generate Lab Values
labs = generate_lab_values(df['diagnosis'], df['category'], df['severity'])
df = pd.concat([df, labs], axis=1)  # column-wise concat; rows are matched on index

# Temporal Data
# Admission dates are sampled with replacement from the date range
# base_year-01-01 .. (base_year + 3)-12-31.
admit_dates = pd.to_datetime(np.random.choice(
    pd.date_range(f'{base_year}-01-01', f'{base_year+3}-12-31'), n_records
))
df['admit_date'] = admit_dates
# Length of stay: Poisson draw with rate 2 * severity + normal(3, 1) noise, clipped to 1-30 days.
df['los'] = np.clip(np.random.poisson(df['severity'] * 2 + np.random.normal(3, 1, n_records)), 1, 30)
df['discharge_date'] = df['admit_date'] + pd.to_timedelta(df['los'], unit='D')

# Medications (Example for 3 categories)
# A full-length candidate array is drawn for each of the three categories;
# np.select keeps, per row, the candidate whose condition matches and uses
# 'Other' for rows in every remaining category.
df['medication'] = np.select(
    [
        df['category'] == 'cardiovascular',
        df['category'] == 'respiratory',
        df['category'] == 'endocrine'
    ],
    [
        np.random.choice(['Lisinopril', 'Amlodipine', 'Metoprolol'], n_records),
        np.random.choice(['Albuterol', 'Prednisone', 'Montelukast'], n_records),
        np.random.choice(['Metformin', 'Insulin', 'Levothyroxine'], n_records)
    ],
    default='Other'
)

# Define a function to calculate outcome probabilities based on severity
def calculate_outcome_probs(severity):
    """Return outcome probabilities that shift toward worse outcomes with severity.

    The five entries are ordered from best to worst outcome, matching the
    label order used where the outcomes are drawn
    ('Recovered', 'Stable', 'Deteriorated', 'Critical', 'Deceased').

    Args:
        severity: Numeric severity of the diagnosis (0 leaves the base
            distribution unchanged).

    Returns:
        numpy.ndarray of 5 floats summing to 1.
    """
    base_probs = np.array([0.5, 0.3, 0.15, 0.04, 0.01])  # Base probabilities
    severity_factor = severity / 4  # Adjust based on severity

    # Multiplying every entry by the same scalar cancels out on
    # normalization, which made the result independent of severity.  Apply
    # the factor once per step of outcome rank instead, so each step toward
    # a worse outcome is up-weighted by (1 + severity_factor).
    outcome_rank = np.arange(base_probs.size)
    adjusted_probs = base_probs * (1 + severity_factor) ** outcome_rank
    adjusted_probs /= adjusted_probs.sum()  # Normalize to sum to 1
    return adjusted_probs

# Apply the function to generate outcomes
# One np.random.choice call per row: each patient's outcome label is drawn
# with the probabilities calculate_outcome_probs returns for that severity.
df['outcome'] = df['severity'].apply(
    lambda s: np.random.choice(
        ['Recovered', 'Stable', 'Deteriorated', 'Critical', 'Deceased'],
        p=calculate_outcome_probs(s)
    )
)

# Save as CSV (the DataFrame index is not written)
df.to_csv('million_patients.csv', index=False)
print(f"Generated {len(df):,} records with {len(diagnoses)} diagnoses")

Generated 1,000,000 records with 50 diagnoses
