In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [2]:
# Pin NumPy's global (legacy) random state so every draw made through
# np.random.* after this point is repeatable from run to run.
np.random.seed(seed=42)

def simulate_opd_data(num_patients=400, peak_fraction=0.7, base_date=None):
    """
    Simulate one day of OPD patient data for DGH Negombo.

    Patients arrive one after another starting at the opening time. The
    first ``peak_fraction`` of the requested patients arrive with a mean
    gap of 1.0 minute (peak); the remainder arrive with a mean gap of
    4.0 minutes (off-peak). Generation stops at the first arrival that
    falls at or after closing time (8 hours after opening), so the result
    may contain fewer than ``num_patients`` rows.

    All random draws go through NumPy's global ``np.random`` state; call
    ``np.random.seed(...)`` beforehand for reproducible values.

    Parameters
    ----------
    num_patients : int, default 400
        Maximum number of patients to generate.
    peak_fraction : float, default 0.7
        Share of ``num_patients`` that use the peak arrival rate. With the
        defaults this is the first 280 patients.
    base_date : datetime, optional
        Opening time of the clinic. Defaults to today at 8:00 AM. Pass a
        fixed value to make the arrival timestamps reproducible.

    Returns
    -------
    pandas.DataFrame
        One row per patient with columns ``patient_id``, ``arrival_time``,
        ``registration_duration``, ``waiting_time``,
        ``consultation_duration``, ``total_time_system`` (all durations in
        minutes, rounded to 2 decimals), ``age_group`` and ``priority``.
        The columns are present even when no patient is generated.

    Raises
    ------
    ValueError
        If ``peak_fraction`` is outside the range [0, 1].
    """
    if not 0.0 <= peak_fraction <= 1.0:
        raise ValueError("peak_fraction must be between 0 and 1")

    # Opening time: caller-supplied, or today at 8:00 AM
    if base_date is None:
        base_date = datetime.now().replace(hour=8, minute=0, second=0, microsecond=0)

    # Compare full timestamps against closing time rather than the wall-clock
    # hour, so the cut-off stays correct for any opening time and cannot be
    # bypassed by an arrival that rolls past midnight.
    closing_time = base_date + timedelta(hours=8)

    # Scale the peak/off-peak boundary with the requested patient count
    # (280 for the default 400 patients at 70%).
    peak_patients = round(num_patients * peak_fraction)

    columns = [
        'patient_id',
        'arrival_time',
        'registration_duration',
        'waiting_time',
        'consultation_duration',
        'total_time_system',
        'age_group',
        'priority',
    ]

    data = []
    current_time = base_date

    for patient_id in range(1, num_patients + 1):
        # Exponential inter-arrival gap: mean 1.0 min in the peak group,
        # mean 4.0 min in the off-peak group
        mean_gap = 1.0 if patient_id <= peak_patients else 4.0
        inter_arrival = np.random.exponential(mean_gap)

        current_time += timedelta(minutes=inter_arrival)

        # Stop generating patients once the clinic has closed
        if current_time >= closing_time:
            break

        # Registration time: uniform between 1 and 4 minutes
        registration_time = np.random.uniform(1, 4)

        # Waiting time for doctor: normal(mean 45, sd 20), floored at 5 minutes
        waiting_time = np.random.normal(45, 20)
        waiting_time = max(5, waiting_time)

        # Consultation time: normal(mean 8, sd 3), floored at 2 minutes
        consultation_time = np.random.normal(8, 3)
        consultation_time = max(2, consultation_time)

        # Total time in system is the sum of the three stages
        total_time = registration_time + waiting_time + consultation_time

        patient_data = {
            'patient_id': patient_id,
            'arrival_time': current_time,
            'registration_duration': round(registration_time, 2),
            'waiting_time': round(waiting_time, 2),
            'consultation_duration': round(consultation_time, 2),
            'total_time_system': round(total_time, 2),
            'age_group': np.random.choice(['Young', 'Adult', 'Elderly'], p=[0.2, 0.5, 0.3]),
            'priority': np.random.choice(['Routine', 'Urgent'], p=[0.85, 0.15])
        }

        data.append(patient_data)

    # Explicit columns keep the schema stable even when no rows were produced
    return pd.DataFrame(data, columns=columns)


In [3]:
# Generate the dataset
print("Generating DGH Negombo OPD Dataset...")
print("Peak hours: 8:00 AM - 12:00 PM (first 4 hours)")
print("Total service time: 8:00 AM - 4:00 PM (8 hours)")
# Re-seed right before sampling so re-running this cell on its own always
# yields the same dataset, regardless of how many random draws other cells
# have consumed from NumPy's global generator in the meantime.
np.random.seed(42)
df = simulate_opd_data(400)

Generating DGH Negombo OPD Dataset...
Peak hours: 8:00 AM - 12:00 PM (first 4 hours)
Total service time: 8:00 AM - 4:00 PM (8 hours)


In [4]:
# Report the number of generated rows and the mean wait for the doctor
summary_lines = [
    f"\nDataset created with {len(df)} patients",
    f"Average waiting time: {df['waiting_time'].mean():.1f} minutes",
]
print("\n".join(summary_lines))


Dataset created with 333 patients
Average waiting time: 45.8 minutes


In [5]:
# Write the generated frame to a CSV file in the working directory,
# leaving out the DataFrame index column
output_file = 'dgh_negombo_opd_dataset.csv'
df.to_csv(path_or_buf=output_file, index=False)
print(f"\nDataset saved as: {output_file}")


Dataset saved as: dgh_negombo_opd_dataset.csv


In [6]:
# Print a plain-text preview of the top of the dataset
preview = df.head(5)
print("\nFirst 5 rows of the dataset:")
print(preview)


First 5 rows of the dataset:
   patient_id               arrival_time  registration_duration  waiting_time  \
0           1 2025-10-31 08:00:28.156085                   3.85         57.95   
1           2 2025-10-31 08:00:31.746411                   3.60         76.58   
2           3 2025-10-31 08:02:18.932184                   1.64         35.73   
3           4 2025-10-31 08:02:52.864408                   1.87         24.74   
4           5 2025-10-31 08:03:29.400489                   3.36         46.35   

   consultation_duration  total_time_system age_group priority  
0                  12.57              74.38     Young  Routine  
1                  10.30              90.49     Young   Urgent  
2                   6.60              43.97     Adult  Routine  
3                   8.94              35.56     Adult  Routine  
4                   3.73              53.43     Adult  Routine  
