In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

# Generate sample data for CHAI Lung Health Program

# 1. Facilities Dataset
def generate_facilities_data(n_facilities=50):
    counties = ['Nairobi', 'Mombasa', 'Kisumu', 'Nakuru', 'Eldoret', 'Machakos', 'Meru', 'Kakamega', 'Kisii', 'Garissa']
    facility_types = ['Public Hospital', 'Private Hospital', 'Health Center', 'Dispensary', 'Workplace Clinic']
    levels = ['Level 2', 'Level 3', 'Level 4', 'Level 5', 'Level 6']
    
    facilities = []
    for i in range(n_facilities):
        facility = {
            'facility_id': f'FAC_{i+1:03d}',
            'facility_name': f'Health Facility {i+1}',
            'county': random.choice(counties),
            'sub_county': f'Sub-County {random.randint(1, 5)}',
            'facility_type': random.choice(facility_types),
            'level': random.choice(levels),
            'longitude': round(36.8 + random.uniform(-1, 1), 4),
            'latitude': round(-1.3 + random.uniform(-1, 1), 4),
            'catchment_population': random.randint(5000, 50000),
            'private_sector_partner': random.choice([True, False]),
            'workplace_screening': random.choice([True, False]),
            'date_joined_program': datetime(2023, 1, 1) + timedelta(days=random.randint(0, 365))
        }
        facilities.append(facility)
    
    return pd.DataFrame(facilities)

# 2. Screening Activities Dataset
def generate_screening_data(n_screenings=5000, facilities_df=None):
    conditions = ['TB', 'Asthma', 'COPD', 'Other Respiratory', 'No Condition']
    screening_types = ['Community ACF', 'Workplace Screening', 'Health Facility', 'Mobile Clinic', 'Private Partner']
    risk_groups = ['General Population', 'High-Risk Occupation', 'Urban Slum', 'Mining Community', 'Healthcare Workers', 'Elderly']
    
    screenings = []
    for i in range(n_screenings):
        facility = random.choice(facilities_df['facility_id'].tolist())
        screening_date = datetime(2024, 1, 1) + timedelta(days=random.randint(0, 180))
        age = random.randint(18, 80)
        
        screening = {
            'screening_id': f'SCR_{i+1:05d}',
            'facility_id': facility,
            'patient_id': f'PAT_{random.randint(1000, 9999)}',
            'screening_date': screening_date,
            'screening_type': random.choice(screening_types),
            'age': age,
            'gender': random.choice(['Male', 'Female']),
            'risk_group': random.choice(risk_groups),
            'screened_for_tb': random.choice([True, False]),
            'screened_for_asthma': random.choice([True, False]),
            'screened_for_copd': random.choice([True, False]),
            'symptoms_present': random.choice([True, False]),
            'referred_for_diagnosis': random.choice([True, False]),
            'diagnosis_confirmed': random.choice([True, False]) if random.random() > 0.7 else False,
            'condition_diagnosed': random.choice(conditions) if random.random() > 0.7 else 'No Condition',
            'linked_to_care': random.choice([True, False]) if random.random() > 0.6 else False
        }
        screenings.append(screening)
    
    return pd.DataFrame(screenings)

# 3. Diagnostic Tests Dataset
def generate_diagnostic_data(n_tests=3000, screenings_df=None):
    test_types = ['Xpert MTB/RIF', 'Chest X-ray', 'Spirometry', 'GeneXpert', 'Culture', 'LF-LAM', 'mWRD']
    test_results = ['Positive', 'Negative', 'Invalid', 'Pending']
    conditions = ['TB', 'Asthma', 'COPD', 'Other Respiratory', 'No Condition']
    
    diagnostics = []
    screening_referred = screenings_df[screenings_df['referred_for_diagnosis'] == True]
    
    for i, screening in screening_referred.iterrows():
        if len(diagnostics) >= n_tests:
            break
            
        test_date = screening['screening_date'] + timedelta(days=random.randint(1, 14))
        
        diagnostic = {
            'test_id': f'TEST_{len(diagnostics)+1:05d}',
            'screening_id': screening['screening_id'],
            'facility_id': screening['facility_id'],
            'patient_id': screening['patient_id'],
            'test_date': test_date,
            'test_type': random.choice(test_types),
            'test_result': random.choice(test_results),
            'condition_tested': random.choice(conditions),
            'turnaround_time_days': random.randint(1, 7),
            'digital_reporting': random.choice([True, False]),
            'ai_assisted_diagnosis': random.choice([True, False])
        }
        diagnostics.append(diagnostic)
    
    return pd.DataFrame(diagnostics)

# 4. Treatment and Follow-up Dataset
def generate_treatment_data(diagnostics_df=None, screenings_df=None):
    treatments = []
    positive_cases = diagnostics_df[diagnostics_df['test_result'] == 'Positive']
    
    for i, case in positive_cases.iterrows():
        treatment_start = case['test_date'] + timedelta(days=random.randint(1, 30))
        
        treatment = {
            'treatment_id': f'TRT_{len(treatments)+1:05d}',
            'patient_id': case['patient_id'],
            'facility_id': case['facility_id'],
            'condition': case['condition_tested'],
            'treatment_start_date': treatment_start,
            'treatment_regimen': f'Regimen {random.randint(1, 5)}',
            'treatment_duration_days': random.choice([30, 60, 90, 180, 270]),
            'initial_visit_completed': random.choice([True, False]),
            'follow_up_1_month': random.choice([True, False]),
            'follow_up_3_months': random.choice([True, False]),
            'treatment_outcome': random.choice(['Completed', 'Ongoing', 'Lost to Follow-up', 'Failed']),
            'digital_adherence_tool': random.choice([True, False]),
            'private_sector_referral': random.choice([True, False])
        }
        treatments.append(treatment)
    
    return pd.DataFrame(treatments)

# 5. Private Sector Partnerships Dataset
def generate_partnership_data(n_partnerships=20):
    partnership_types = ['Corporate Screening', 'CSR Initiative', 'Diagnostic Network', 'Health Insurance', 'Workplace Program']
    statuses = ['Active', 'In Progress', 'Completed', 'Planned']
    
    partnerships = []
    for i in range(n_partnerships):
        partnership = {
            'partnership_id': f'PART_{i+1:03d}',
            'partner_name': f'Partner Organization {i+1}',
            'partnership_type': random.choice(partnership_types),
            'start_date': datetime(2023, 1, 1) + timedelta(days=random.randint(0, 365)),
            'end_date': datetime(2024, 12, 31) + timedelta(days=random.randint(-180, 180)),
            'status': random.choice(statuses),
            'screening_target': random.randint(500, 5000),
            'diagnostic_target': random.randint(100, 1000),
            'funding_amount': random.randint(50000, 500000),
            'counties_covered': random.randint(1, 10),
            'workplaces_engaged': random.randint(1, 50)
        }
        partnerships.append(partnership)
    
    return pd.DataFrame(partnerships)

# 6. Program Performance Metrics Dataset
def generate_performance_data(facilities_df):
    months = pd.date_range('2024-01-01', '2024-06-30', freq='M')
    performance_data = []
    
    for facility in facilities_df['facility_id']:
        for month in months:
            performance = {
                'facility_id': facility,
                'month': month,
                'screenings_conducted': random.randint(50, 500),
                'positive_cases_detected': random.randint(5, 50),
                'cases_linked_to_care': random.randint(3, 45),
                'treatment_success_rate': round(random.uniform(0.7, 0.95), 2),
                'screening_yield_rate': round(random.uniform(0.05, 0.2), 3),
                'turnaround_time_avg': round(random.uniform(1, 7), 1),
                'digital_reporting_rate': round(random.uniform(0.6, 0.95), 2),
                'private_sector_engagement_score': random.randint(1, 10)
            }
            performance_data.append(performance)
    
    return pd.DataFrame(performance_data)

# Generate all datasets
print("Generating CHAI Lung Health Program datasets...")

facilities_df = generate_facilities_data(50)
screenings_df = generate_screening_data(5000, facilities_df)
diagnostics_df = generate_diagnostic_data(3000, screenings_df)
treatments_df = generate_treatment_data(diagnostics_df, screenings_df)
partnerships_df = generate_partnership_data(20)
performance_df = generate_performance_data(facilities_df)

# Save to Excel file with multiple sheets
with pd.ExcelWriter('chai_lung_health_data.xlsx') as writer:
    facilities_df.to_excel(writer, sheet_name='Facilities', index=False)
    screenings_df.to_excel(writer, sheet_name='Screening_Activities', index=False)
    diagnostics_df.to_excel(writer, sheet_name='Diagnostic_Tests', index=False)
    treatments_df.to_excel(writer, sheet_name='Treatment_Followup', index=False)
    partnerships_df.to_excel(writer, sheet_name='Private_Partnerships', index=False)
    performance_df.to_excel(writer, sheet_name='Performance_Metrics', index=False)

# Display dataset summaries
print("\nDataset Summary:")
print(f"Facilities: {len(facilities_df)} records")
print(f"Screening Activities: {len(screenings_df)} records")
print(f"Diagnostic Tests: {len(diagnostics_df)} records")
print(f"Treatment Records: {len(treatments_df)} records")
print(f"Private Partnerships: {len(partnerships_df)} records")
print(f"Performance Metrics: {len(performance_df)} records")

# Show sample data from each dataset
print("\nSample Data from Facilities:")
print(facilities_df.head(3))
print("\nSample Data from Screening Activities:")
print(screenings_df.head(3))
print("\nSample Data from Diagnostic Tests:")
print(diagnostics_df.head(3))

print(f"\nAll datasets saved to 'chai_lung_health_data.xlsx'")

Generating CHAI Lung Health Program datasets...


  months = pd.date_range('2024-01-01', '2024-06-30', freq='M')



Dataset Summary:
Facilities: 50 records
Screening Activities: 5000 records
Diagnostic Tests: 2382 records
Treatment Records: 618 records
Private Partnerships: 20 records
Performance Metrics: 300 records

Sample Data from Facilities:
  facility_id      facility_name   county    sub_county     facility_type  \
0     FAC_001  Health Facility 1  Mombasa  Sub-County 1     Health Center   
1     FAC_002  Health Facility 2  Nairobi  Sub-County 1  Private Hospital   
2     FAC_003  Health Facility 3  Garissa  Sub-County 3   Public Hospital   

     level  longitude  latitude  catchment_population  private_sector_partner  \
0  Level 3    36.2464   -0.8271                 49348                    True   
1  Level 3    36.8107   -2.2469                 18031                   False   
2  Level 3    37.1963   -1.6195                 15189                    True   

   workplace_screening date_joined_program  
0                False          2023-01-17  
1                 True          2023-08-18