In [4]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# Set random seeds for reproducibility
np.random.seed(42)
random.seed(42)
fake = Faker()
# Faker draws from its own random generator, not from the `random` module's
# global state, so it must be seeded separately; without this the generated
# IDs and dates differ on every run.
# NOTE: dates are still requested relative to "today" ('-2y', '-30d', 'today'),
# so they shift with the day the script is executed.
Faker.seed(42)

# Insurance carriers, each paired with its market-share percentage.
# The shares listed here add up to 100 and are used as sampling weights.
INSURANCE_PROVIDERS = [
    {'name': carrier, 'market_share': share}
    for carrier, share in (
        ('UnitedHealthcare', 18),
        ('Anthem', 14),
        ('Humana', 10),
        ('Cigna', 9),
        ('Blue Cross Blue Shield', 35),
        ('Aetna', 8),
        ('Kaiser Permanente', 6),
    )
]
# Carriers treated as low-reimbursement when deriving claim features.
LOW_REIMBURSEMENT_INSURERS = [
    'Kaiser Permanente',
    'Aetna',
]

# ICD-10 diagnosis codes with a short description and a relative frequency
# that serves as the sampling weight.
ICD10_CODES = [
    {'code': icd_code, 'description': label, 'frequency': weight}
    for icd_code, label, weight in (
        ('E11.9', 'Type 2 Diabetes', 15),
        ('I10', 'Hypertension', 12),
        ('M17.9', 'Osteoarthritis', 10),
        ('E78.5', 'Hyperlipidemia', 8),
        ('J44.9', 'COPD', 7),
        ('Z23', 'Immunization', 6),
        ('N18.9', 'CKD', 5),
        ('F32.9', 'Depression', 5),
        ('K21.9', 'GERD', 4),
        ('R10.9', 'Abdominal Pain', 4),
    )
]
# Diagnosis codes flagged as risky when deriving claim features.
RISKY_DIAGNOSES = [
    'F32.9',
    'N18.9',
    'J44.9',
]

# CPT procedure codes with a short description and a relative frequency
# that serves as the sampling weight.
_CPT_FIELDS = ('code', 'description', 'frequency')
CPT_CODES = [
    dict(zip(_CPT_FIELDS, row))
    for row in (
        ('99213', 'Visit Established', 20),
        ('99214', 'Visit Established', 15),
        ('99203', 'Visit New', 12),
        ('99215', 'Visit Established', 10),
        ('99204', 'Visit New', 8),
        ('99212', 'Visit Established', 7),
        ('99211', 'Visit Established', 6),
        ('99205', 'Visit New', 5),
        ('90471', 'Immunization', 4),
        ('99202', 'Visit New', 3),
    )
]

# Baseline denial probability for each medical specialty.
# Insertion order matters: the keys and values are later passed, in order,
# to a weighted random draw.
SPECIALTY_DENIAL_RATES = dict([
    ('Plastic Surgery', 0.28),
    ('Emergency Medicine', 0.22),
    ('Radiology', 0.20),
    ('General Surgery', 0.19),
    ('Internal Medicine', 0.15),
    ('Family Medicine', 0.14),
    ('Ophthalmology', 0.13),
])

# Denial reasons grouped by the office responsible. Within each group the
# percentages add up to 100 and act as sampling weights.
DENIAL_REASONS = {
    office: [{'reason': text, 'percentage': pct} for text, pct in entries]
    for office, entries in (
        ('Front Office', (
            ('Incomplete Registration Data', 50),
            ('Eligibility Verification Failure', 30),
            ('Missing Prior Authorization', 12),
            ('Incorrect Insurance Info', 8),
        )),
        ('Middle Office', (
            ('Inadequate Clinical Documentation', 30),
            ('Incorrect Procedure Codes', 25),
            ('Coding Errors', 20),
            ('Missing Justification', 15),
            ('Inappropriate Codes', 10),
        )),
        ('Back Office', (
            ('Duplicate Claim', 35),
            ('Coordination Issue', 25),
            ('Timely Filing Exceeded', 20),
            ('Included in Another Claim', 12),
            ('Billing Inconsistencies', 8),
        )),
    )
}
# Share of all denials attributed to each office (fractions summing to 1).
DENIAL_CATEGORY_PERCENTAGES = dict(
    zip(
        ('Front Office', 'Middle Office', 'Back Office'),
        (0.45, 0.30, 0.25),
    )
)

def weighted_choice(choices):
    """Return one dict from *choices*, picked at random in proportion to its weight.

    The weight of an entry is its ``'frequency'`` value; if that key is absent
    the ``'market_share'`` value is used, and if both are absent the weight
    defaults to ``1``.

    Args:
        choices: Non-empty sequence of dicts carrying the weight keys above.

    Returns:
        The selected dict (the same object that is stored in *choices*).

    Raises:
        ValueError: If *choices* is empty.
    """
    if not choices:
        # Previously this fell through and returned None, which only surfaced
        # later as a confusing TypeError in the caller.
        raise ValueError("weighted_choice() requires at least one choice")

    # Resolve every weight once instead of once for the total and once per step.
    weights = [c.get('frequency', c.get('market_share', 1)) for c in choices]
    r = random.uniform(0, sum(weights))
    upto = 0
    for choice, weight in zip(choices, weights):
        if upto + weight >= r:
            return choice
        upto += weight
    # With non-integer weights the running total can end up marginally below
    # the drawn value because of float rounding; the draw then belongs to the
    # last entry rather than to nobody.
    return choices[-1]

def weighted_choice_with_percentage(choices):
    """Return the ``'reason'`` of one entry, picked in proportion to ``'percentage'``.

    Args:
        choices: Non-empty sequence of dicts, each with a ``'reason'`` and a
            numeric ``'percentage'`` key. The percentages are relative
            weights; they do not have to add up to 100.

    Returns:
        The ``'reason'`` value of the selected entry.

    Raises:
        ValueError: If *choices* is empty.
        KeyError: If an entry lacks ``'percentage'`` (or the selected entry
            lacks ``'reason'``).
    """
    if not choices:
        # Previously this fell through and returned None, silently producing
        # a denied claim without a reason.
        raise ValueError(
            "weighted_choice_with_percentage() requires at least one choice"
        )

    total = sum(choice['percentage'] for choice in choices)
    r = random.uniform(0, total)
    upto = 0
    for choice in choices:
        upto += choice['percentage']
        if upto >= r:
            return choice['reason']
    # With non-integer percentages the running total can end up marginally
    # below the drawn value because of float rounding; attribute the draw to
    # the last entry instead of returning None.
    return choices[-1]['reason']

def generate_claims_dataset(n=10000):
    """Generate a synthetic medical-claims dataset.

    Every row describes one claim: random patient demographics, a specialty,
    diagnosis/procedure codes, an insurer, an amount, service and submission
    dates, three derived boolean risk flags, and a simulated approve/deny
    outcome. The denial probability is the specialty's base rate plus 0.05
    for each risk flag that is set, capped at 1.0.

    Args:
        n: Number of claims (rows) to generate.

    Returns:
        A ``pandas.DataFrame`` with one row per claim. ``denial_reason`` and
        ``denial_category`` are ``None`` for approved claims.

    Raises:
        ValueError: If *n* exceeds the number of distinct 5-digit provider IDs
            that can be generated. Note that ``fake.unique`` remembers values
            across calls, so repeated calls in one session draw from the same
            shrinking pool and may still exhaust it.
    """
    # provider_id comes from fake.unique.random_number(digits=5), which has
    # only 10**5 distinct values. Requesting more rows is guaranteed to fail
    # part-way through, so reject it up front with a clear message.
    max_unique_provider_ids = 10 ** 5
    if n > max_unique_provider_ids:
        raise ValueError(
            f"n={n} exceeds the {max_unique_provider_ids} distinct 5-digit "
            "provider IDs that can be generated"
        )

    # Loop-invariant lookups, built once instead of on every iteration.
    specialties = list(SPECIALTY_DENIAL_RATES)
    # NOTE(review): the denial rates double as the sampling weights for the
    # specialty itself, so high-denial specialties are also over-represented.
    # Confirm this is intended.
    specialty_weights = list(SPECIALTY_DENIAL_RATES.values())
    denial_categories = list(DENIAL_CATEGORY_PERCENTAGES)
    denial_category_weights = list(DENIAL_CATEGORY_PERCENTAGES.values())

    data = []
    for _ in range(n):
        # The order of the random draws below is kept stable so that a seeded
        # run keeps producing the same rows.
        patient_id = fake.unique.random_number(digits=6)
        age = random.randint(18, 85)
        gender = random.choice(['Male', 'Female', 'Other'])
        specialty = random.choices(specialties, weights=specialty_weights)[0]
        diagnosis = weighted_choice(ICD10_CODES)['code']
        procedure = weighted_choice(CPT_CODES)['code']
        insurance = weighted_choice(INSURANCE_PROVIDERS)['name']
        claim_amount = round(random.uniform(100, 50000), 2)
        service_date = fake.date_between(start_date='-2y', end_date='-30d')
        # Submission never precedes the service date.
        submission_date = fake.date_between(start_date=service_date, end_date='today')
        provider_id = fake.unique.random_number(digits=5)

        # Derived features
        is_high_cost = claim_amount > 10000
        is_risky_diagnosis = diagnosis in RISKY_DIAGNOSES
        is_low_reimbursement_insurer = insurance in LOW_REIMBURSEMENT_INSURERS

        # Each set flag (bool counted as 0/1) adds 0.05 to the base rate.
        base_denial_prob = SPECIALTY_DENIAL_RATES[specialty]
        denial_boost = 0.05 * is_high_cost + 0.05 * is_risky_diagnosis + 0.05 * is_low_reimbursement_insurer
        final_denial_prob = min(1.0, base_denial_prob + denial_boost)
        is_denied = random.random() < final_denial_prob

        if is_denied:
            # Pick the responsible office first, then a reason within it.
            denial_category = random.choices(denial_categories,
                                             weights=denial_category_weights)[0]
            denial_reason = weighted_choice_with_percentage(DENIAL_REASONS[denial_category])
            status = 'Denied'
        else:
            denial_category = None
            denial_reason = None
            status = 'Approved'

        data.append({
            'claim_id': fake.unique.random_number(digits=7),
            'patient_id': patient_id,
            'patient_age': age,
            'patient_gender': gender,
            'specialty': specialty,
            'provider_id': provider_id,
            'procedure_code': procedure,
            'diagnosis_code': diagnosis,
            'service_date': service_date,
            'claim_submission_date': submission_date,
            'claim_amount': claim_amount,
            'insurance_provider': insurance,
            'is_high_cost': is_high_cost,
            'is_risky_diagnosis': is_risky_diagnosis,
            'is_low_reimbursement_insurer': is_low_reimbursement_insurer,
            'is_denied': is_denied,
            'claim_status': status,
            'denial_reason': denial_reason,
            'denial_category': denial_category
        })

    return pd.DataFrame(data)

# Generate and save
output_path = "medical_claims_dataset_v2.csv"
df = generate_claims_dataset()
df.to_csv(output_path, index=False)
# Report the path that was actually written; the message used to name a
# different file than the one saved.
print(f"✅ Dataset saved as {output_path}")


✅ Dataset saved as medical_claims_dataset.csv
