In [1]:
from faker import Faker
import random
import pandas as pd
from datetime import datetime, timedelta

# Single shared Faker generator; generate_data() draws every name, address,
# contact detail, date and random ID from this instance.
fake = Faker()

# Reference vocabularies sampled by generate_data().
# 'None' is a literal string option (no diagnosis / no medication), not the
# Python None object.
diagnoses = [
    'Diabetes',
    'Hypertension',
    'Asthma',
    'Heart Disease',
    'Obesity',
    'Cancer',
    'Chronic Kidney Disease',
    'None',
]

# Candidate medications per diagnosis; every entry of `diagnoses` has a key
# here, including 'None'.
medications = {
    'Diabetes': [
        'Metformin',
        'Insulin',
        'Glimepiride',
        'Glyburide',
    ],
    'Hypertension': [
        'Lisinopril',
        'Amlodipine',
        'Losartan',
        'Hydrochlorothiazide',
    ],
    'Asthma': [
        'Albuterol',
        'Fluticasone',
        'Montelukast',
    ],
    'Heart Disease': [
        'Aspirin',
        'Statins',
        'Beta-blockers',
        'ACE Inhibitors',
    ],
    'Obesity': [
        'Orlistat',
        'Phentermine',
    ],
    'Cancer': [
        'Chemotherapy',
        'Tamoxifen',
        'Immunotherapy',
    ],
    'Chronic Kidney Disease': [
        'Angiotensin Inhibitors',
        'Diuretics',
        'Statins',
    ],
    'None': ['None'],
}

# Treatment types, used for both the patient's treatment history and the
# per-appointment treatment records.
treatments = [
    'Surgery',
    'Chemotherapy',
    'Physiotherapy',
    'Medication',
    'Radiotherapy',
    'Dialysis',
]

# Demographic vocabularies for patient records.
ethnicities = [
    'Caucasian',
    'African American',
    'Asian',
    'Hispanic',
    'Middle Eastern',
    'Other',
]
occupations = [
    'Software Engineer',
    'Teacher',
    'Doctor',
    'Engineer',
    'Nurse',
    'Artist',
    'Unemployed',
]

def generate_data(num_patients=5000, num_doctors=50, num_appointments=10000):
    """Generate a synthetic healthcare dataset as a set of related tables.

    Doctors and patients get sequential IDs starting at 1. Each patient gets
    between 1 and 5 appointments, and every appointment also produces one
    prescription, one treatment, one doctor-patient link and one billing row
    that share the appointment's patient and doctor IDs. Each patient
    additionally gets exactly one medical-history row.

    Args:
        num_patients: Number of patient records to create.
        num_doctors: Number of doctor records to create. Must be at least 1
            whenever ``num_patients`` is positive, because every appointment
            is assigned to a randomly chosen doctor.
        num_appointments: Currently unused; the number of appointments is
            determined by the per-patient random count. Kept so that existing
            callers passing it keep working.

    Returns:
        dict[str, pandas.DataFrame] with the keys ``"patients"``,
        ``"doctors"``, ``"appointments"``, ``"prescriptions"``,
        ``"medical_history"``, ``"treatments"``, ``"doctor_patient_link"``
        and ``"billing"``.

    Raises:
        ValueError: If patients are requested but ``num_doctors`` is below 1.
    """
    # Without this guard the failure would surface later as an opaque
    # "empty range" ValueError from random.randint(1, num_doctors).
    if num_patients > 0 and num_doctors < 1:
        raise ValueError(
            "num_doctors must be at least 1 when num_patients is positive"
        )

    patients = []
    doctors = []
    appointments = []
    prescriptions = []
    medical_history = []
    treatments_data = []
    doctor_patient_link = []
    billing = []

    # Upper bound for prescription end dates (see the prescription section).
    today = datetime.now().date()

    # Option lists that do not change between iterations.
    doctor_locations = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami', 'Dallas', 'San Francisco']
    specializations = ['Cardiologist', 'Neurologist', 'General Practitioner', 'Orthopedist', 'Pediatrician', 'Oncologist']
    availabilities = ['9 AM - 5 PM', '8 AM - 4 PM', '10 AM - 6 PM']
    genders = ['Male', 'Female', 'Other']
    insurance_providers = ['Aetna', 'Blue Cross', 'UnitedHealth', 'Cigna', 'None']
    marital_statuses = ['Single', 'Married', 'Divorced']
    allergy_options = ['Penicillin', 'Peanuts', 'None']
    appointment_statuses = ['Scheduled', 'Completed', 'Canceled']
    appointment_reasons = ['Routine Checkup', 'Emergency', 'Follow-up', 'Consultation']
    dosages = ['10mg', '50mg', '100mg', '1 dose per day', 'None']
    prescription_note_options = ['Take after meal', 'Take before bed', 'None']
    treatment_outcomes = ['Success', 'Failure', 'Ongoing']
    payment_statuses = ['Paid', 'Pending', 'Insurance Covered']
    medical_note_options = [
        'Patient has a family history of diabetes.',
        'Patient had surgery 5 years ago.',
        'Patient regularly exercises.',
    ]

    # ---- Doctors ---------------------------------------------------------
    for doctor_id in range(1, num_doctors + 1):
        doctors.append({
            "Doctor ID": doctor_id,
            "First Name": fake.first_name(),
            "Last Name": fake.last_name(),
            "Specialization": random.choice(specializations),
            "Phone": fake.phone_number(),
            "Email": fake.email(),
            "Availability": random.choice(availabilities),
            "Location": random.choice(doctor_locations),
        })

    # ---- Patients and everything that hangs off a patient ----------------
    for patient_id in range(1, num_patients + 1):
        insurance_provider = random.choice(insurance_providers)
        marital_status = random.choice(marital_statuses)
        ethnicity = random.choice(ethnicities)
        occupation = random.choice(occupations)

        diagnosis = random.choice(diagnoses)
        treatment_history = random.choice(treatments)
        allergies = random.choice(allergy_options)
        chronic_conditions = random.choice(['Yes', 'No'])
        # medications has an entry for every diagnosis, including 'None',
        # so no special-casing is needed.
        medications_list = medications[diagnosis]
        prescribed_medication = random.choice(medications_list)

        patients.append({
            "Patient ID": patient_id,
            "First Name": fake.first_name(),
            "Last Name": fake.last_name(),
            "Date of Birth": fake.date_of_birth(minimum_age=18, maximum_age=90),
            "Gender": random.choice(genders),
            "Address": fake.address(),
            "Phone": fake.phone_number(),
            "Email": fake.email(),
            "Insurance Provider": insurance_provider,
            "Marital Status": marital_status,
            "Ethnicity": ethnicity,
            "Occupation": occupation,
            "Diagnosis": diagnosis,
            "Treatment History": treatment_history,
            "Allergies": allergies,
            "Chronic Conditions": chronic_conditions,
            "Medications": prescribed_medication,
        })

        # All record IDs below come from fake.unique.random_number(digits=6).
        # NOTE(review): these calls presumably share one uniqueness pool of
        # at most 10**6 values, which would cap the total number of
        # appointment-level rows -- confirm against the Faker documentation
        # before scaling num_patients up substantially.
        for _ in range(random.randint(1, 5)):
            # The same doctor is used for every record tied to this visit.
            doctor_id = random.randint(1, num_doctors)

            # -- Appointment --
            appointments.append({
                "Appointment ID": fake.unique.random_number(digits=6),
                "Patient ID": patient_id,
                "Doctor ID": doctor_id,
                "Appointment Date": fake.date_this_decade(),
                "Appointment Status": random.choice(appointment_statuses),
                "Reason for Appointment": random.choice(appointment_reasons),
            })

            # -- Prescription --
            prescription_id = fake.unique.random_number(digits=6)
            dosage = random.choice(dosages)
            start_date = fake.date_this_year()
            if random.choice([True, False]):
                # Draw the end date from [start_date, today] so that it can
                # never fall before the start date.
                max_offset_days = max((today - start_date).days, 0)
                end_date = start_date + timedelta(
                    days=random.randint(0, max_offset_days)
                )
            else:
                # No end date recorded for this prescription.
                end_date = None

            prescriptions.append({
                "Prescription ID": prescription_id,
                "Patient ID": patient_id,
                "Doctor ID": doctor_id,
                "Medication Name": prescribed_medication,
                "Dosage": dosage,
                "Start Date": start_date,
                "End Date": end_date,
                "Prescription Notes": random.choice(prescription_note_options),
            })

            # -- Treatment --
            treatments_data.append({
                "Treatment ID": fake.unique.random_number(digits=6),
                "Patient ID": patient_id,
                "Doctor ID": doctor_id,
                "Treatment Date": fake.date_this_year(),
                "Treatment Type": random.choice(treatments),
                "Treatment Outcome": random.choice(treatment_outcomes),
            })

            # -- Doctor-patient link (one row per appointment) --
            doctor_patient_link.append({
                "Doctor-Patient Link ID": fake.unique.random_number(digits=6),
                "Patient ID": patient_id,
                "Doctor ID": doctor_id,
                "Start Date of Relationship": fake.date_this_year(),
            })

            # -- Billing --
            billing_id = fake.unique.random_number(digits=6)
            # Monetary amount: keep two decimals instead of a raw float.
            total_bill_amount = round(random.uniform(50, 5000), 2)
            payment_status = random.choice(payment_statuses)
            # Only bills that were actually paid carry a payment date.
            payment_date = fake.date_this_year() if payment_status == 'Paid' else None

            billing.append({
                "Billing ID": billing_id,
                "Patient ID": patient_id,
                "Total Bill Amount": total_bill_amount,
                "Insurance Provider": insurance_provider,
                "Payment Status": payment_status,
                "Payment Date": payment_date,
            })

        # -- Medical history (exactly one row per patient) --
        previous_diagnoses = random.sample(diagnoses, k=random.randint(1, 3))
        previous_treatments = random.sample(treatments, k=random.randint(1, 2))
        # k may be 0, in which case "Previous Medications" is an empty string.
        previous_medications_taken = random.sample(
            medications_list, k=random.randint(0, len(medications_list))
        )

        medical_history.append({
            "Patient ID": patient_id,
            "Previous Diagnoses": ', '.join(previous_diagnoses),
            "Previous Treatments": ', '.join(previous_treatments),
            "Previous Medications": ', '.join(previous_medications_taken),
            "Medical Notes": random.choice(medical_note_options),
        })

    # One DataFrame per logical table.
    return {
        "patients": pd.DataFrame(patients),
        "doctors": pd.DataFrame(doctors),
        "appointments": pd.DataFrame(appointments),
        "prescriptions": pd.DataFrame(prescriptions),
        "medical_history": pd.DataFrame(medical_history),
        "treatments": pd.DataFrame(treatments_data),
        "doctor_patient_link": pd.DataFrame(doctor_patient_link),
        "billing": pd.DataFrame(billing),
    }

# Build the full synthetic dataset with the chosen table sizes.
num_patients, num_doctors = 5000, 50
data = generate_data(num_patients=num_patients, num_doctors=num_doctors)

# Write each table to "<table name>.csv" in the working directory, leaving
# out the DataFrame index column.
for table_name in data:
    df = data[table_name]
    df.to_csv(f"{table_name}.csv", index=False)

# Show the first rows of the patients table as a quick sanity check.
print("Patient Data Sample:", data['patients'].head(), sep="\n")


Patient Data Sample:
   Patient ID First Name  Last Name Date of Birth  Gender  \
0           1    Stephen     Fields    1987-08-15   Other   
1           2     Cheryl     Miller    2002-11-03    Male   
2           3       Jose    Sanders    1945-07-23   Other   
3           4     Joseph   Thompson    1950-10-15    Male   
4           5      Tasha  Henderson    1994-04-15  Female   

                                             Address                   Phone  \
0       928 Garcia Creek\nPort Brandonbury, CT 51753      (825)224-2663x9744   
1  427 Nicholas Ranch Apt. 591\nNew Lorrainefurt,...              3807195360   
2         74080 Mcdonald Row\nSouth Amanda, NH 72319      (224)940-9747x4387   
3     906 Hernandez Crest\nPort Jameshaven, ND 65764           (377)324-5054   
4  2444 Kimberly Skyway Suite 408\nLeborough, AZ ...  001-565-446-5522x82049   

                        Email Insurance Provider Marital Status  \
0       kenneth63@example.net              Aetna        Married 