In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random

fake = Faker()
# Seed every random source the script draws from so reruns give the same data.
# Faker uses its own shared generator, so seeding `random` and NumPy alone
# leaves the `fake.*` calls (IDs and dates) unseeded.
# NOTE: dates are generated relative to 'today', so they still shift from one
# calendar day to the next even with a fixed seed.
Faker.seed(123)
np.random.seed(123)
random.seed(123)

# Constants carried over from the earlier code. Each list is a pool of
# categorical values that the claim generator samples from with random.choice().

# Medical specialties a claim can be filed under.
SPECIALTIES = [
    "Plastic Surgery",
    "Emergency Medicine",
    "Radiology",
    "Critical Care",
    "General Surgery",
    "Physical Medicine and Rehabilitation",
    "Anesthesiology",
    "Orthopedics",
    "Pulmonary Medicine",
    "Neurology",
    "Infectious Diseases",
    "Rheumatology",
    "Urology",
    "OB-GYN",
    "Allergy and Immunology",
    "Oncology",
    "Psychiatry",
    "Gastroenterology",
    "Dermatology",
    "Internal Medicine",
    "Cardiology",
    "Nephrology",
    "Family Medicine",
    "Otolaryngology",
    "Pediatrics",
    "Diabetes and Endocrinology",
    "Ophthalmology",
]

# Values used for the 'diagnosis_code' column.
ICD10_CODES = [
    "E11.9",
    "I10",
    "M17.9",
    "E78.5",
    "J44.9",
    "Z23",
    "N18.9",
    "F32.9",
    "K21.9",
    "R10.9",
]

# Values used for the 'procedure_code' column.
CPT_CODES = [
    "99213",
    "99214",
    "99203",
    "99215",
    "99204",
    "99212",
    "99211",
    "99205",
    "90471",
    "99202",
]

# Values used for the 'insurance_provider' column.
INSURANCE_PROVIDERS = [
    "UnitedHealthcare",
    "Anthem",
    "Humana",
    "Cigna",
    "Blue Cross Blue Shield",
    "Aetna",
    "Kaiser Permanente",
]

def generate_new_claims(n: int = 200) -> pd.DataFrame:
    """Build a DataFrame of ``n`` synthetic insurance-claim rows.

    Each row gets a random ID triple (claim, patient, provider), patient
    demographics, a specialty, procedure and diagnosis codes, service and
    submission dates, a claim amount and an insurance provider.

    Args:
        n: Number of claim rows to generate. Defaults to 200.

    Returns:
        A DataFrame with one row per claim and the columns ``claim_id``,
        ``patient_id``, ``patient_age``, ``patient_gender``, ``specialty``,
        ``provider_id``, ``procedure_code``, ``diagnosis_code``,
        ``service_date``, ``claim_submission_date``, ``claim_amount`` and
        ``insurance_provider``, in that order.

    Raises:
        faker.exceptions.UniquenessException: If Faker cannot find an unused
            ID. IDs are drawn through the module-level ``fake.unique`` proxy,
            which remembers values across calls; the fixed-length 5-digit
            provider pool holds 90,000 values.
    """
    data = {
        'claim_id': [],
        'patient_id': [],
        'patient_age': [],
        'patient_gender': [],
        'specialty': [],
        'provider_id': [],
        'procedure_code': [],
        'diagnosis_code': [],
        'service_date': [],
        'claim_submission_date': [],
        'claim_amount': [],
        'insurance_provider': []
    }

    for _ in range(n):
        # fix_len=True forces exactly `digits` digits. Without it,
        # random_number() may return any smaller integer (even 0), which
        # gives IDs of inconsistent length.
        data['claim_id'].append(fake.unique.random_number(digits=7, fix_len=True))
        data['patient_id'].append(fake.unique.random_number(digits=6, fix_len=True))
        data['patient_age'].append(random.randint(18, 85))  # both bounds inclusive
        data['patient_gender'].append(random.choice(['Male', 'Female', 'Other']))
        data['specialty'].append(random.choice(SPECIALTIES))
        data['provider_id'].append(fake.unique.random_number(digits=5, fix_len=True))
        data['procedure_code'].append(random.choice(CPT_CODES))
        data['diagnosis_code'].append(random.choice(ICD10_CODES))
        # Service happened some time in the last two years ...
        service_date = fake.date_between(start_date='-2y', end_date='today')
        data['service_date'].append(service_date)
        # ... and the claim is submitted on or after the service date.
        data['claim_submission_date'].append(fake.date_between(start_date=service_date, end_date='today'))
        # Amount between 100 and 50000, rounded to two decimals.
        data['claim_amount'].append(round(random.uniform(100, 50000), 2))
        data['insurance_provider'].append(random.choice(INSURANCE_PROVIDERS))

    return pd.DataFrame(data)

# Generate a batch with the default row count and write it to 'new_claims.csv'
# (relative path). index=False keeps the DataFrame's row index out of the file.
new_claims_df = generate_new_claims()
new_claims_df.to_csv("new_claims.csv", index=False)
# Confirmation message once the CSV has been written.
print("✅ New synthetic claims saved to 'new_claims.csv'")


✅ New synthetic claims saved to 'new_claims.csv'
