# **NOTEBOOK FOR `CLEANING` AND `AGGREGATING` THE `SYNTHEA` DATASET**
***

***I need to create some aggregated tables, in order to proceed in my analysis***

## **Creating Connections with the database**

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
from pathlib import Path
import warnings

warnings.filterwarnings('ignore')

In [2]:
def get_project_root(project_name="ranojoy_data_analytics_projects"):
    current_path = Path.cwd()
    if project_name in str(current_path):
        while current_path.name != project_name:
            current_path = current_path.parent
        return current_path
    else:
        for path in current_path.rglob(project_name):
            if path.is_dir():
                return path
        raise FileNotFoundError(f"Could not find project: {project_name}")

repo_root = get_project_root()
data_path = repo_root / "About Synthea - Why Patients Readmit More"

sys.path.append(str(data_path))

import src.functions as F

In [3]:
engine = F.get_engine()

In [4]:
engine

Engine(mysql+pymysql://root:***@localhost:3306/synthea_medical_dataset)

## **Cleaning process of the data and creating aggregated tables**

In [5]:
necessary_tables = ['encounters','careplans','conditions','medications','procedures','patients','providers']

In [6]:
F.select('encounters').head(1)

Unnamed: 0,Id,start,stop,PATIENT,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION,duration
0,d0c40d10-8d87-447e-836e-99d26ad52ea5,2010-01-23 17:45:28,2010-01-23 18:10:28,034e9e3b-2def-4559-bb2a-7850888ae060,e002090d-4e92-300e-b41e-7d1f21dee4c6,e6283e46-fd81-3611-9459-0edb1c3da357,6e2f1a2d-27bd-3701-8d08-dae202c58632,ambulatory,185345009,Encounter for symptom,129.16,129.16,54.16,10509002.0,Acute bronchitis (disorder),25.0


In [7]:
F.find_nulls('encounters')

Unnamed: 0,column_names,null_values
13,REASONCODE,39569
14,REASONDESCRIPTION,39569


In [8]:
# # Reforming the date values
# query = """
# update encounters
# set start = str_to_date(
#     trim(replace(replace(replace(start, '"', ''), '\r', ''), '\n', '')),
#     '%Y-%m-%dT%H:%i:%sZ'
# );
# """
# F.run(query)

# query = """
# ALTER TABLE encounters
# MODIFY start datetime;
# """
# F.run(query)

In [9]:
# # Reforming the date values
# query = """
# update encounters
# set stop = str_to_date(
#     trim(replace(replace(replace(stop, '"', ''), '\r', ''), '\n', '')),
#     '%Y-%m-%dT%H:%i:%sZ'
# );
# """
# F.run(query)

# query = """
# ALTER TABLE encounters
# MODIFY stop datetime;
# """
# F.run(query)

In [10]:
F.sql('select count(distinct code) as total_code, count(distinct description) as total_description from encounters')

Unnamed: 0,total_code,total_description
0,43,49


In [11]:
F.sql('select count(distinct reasoncode) as total_code, count(distinct reasondescription) as total_description from encounters')

Unnamed: 0,total_code,total_description
0,80,80


In [12]:
F.select('encounters').head(1)

Unnamed: 0,Id,start,stop,PATIENT,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION,duration
0,d0c40d10-8d87-447e-836e-99d26ad52ea5,2010-01-23 17:45:28,2010-01-23 18:10:28,034e9e3b-2def-4559-bb2a-7850888ae060,e002090d-4e92-300e-b41e-7d1f21dee4c6,e6283e46-fd81-3611-9459-0edb1c3da357,6e2f1a2d-27bd-3701-8d08-dae202c58632,ambulatory,185345009,Encounter for symptom,129.16,129.16,54.16,10509002.0,Acute bronchitis (disorder),25.0


In [13]:
# query = '''
#     alter table encounters
#     add column duration float
# '''
# F.run(query)

In [14]:
# query = '''
#     update encounters
#     set duration = timestampdiff(minute,start,stop)
# '''
# F.run(query)

In [15]:
F.select('encounters').head(1)

Unnamed: 0,Id,start,stop,PATIENT,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION,duration
0,d0c40d10-8d87-447e-836e-99d26ad52ea5,2010-01-23 17:45:28,2010-01-23 18:10:28,034e9e3b-2def-4559-bb2a-7850888ae060,e002090d-4e92-300e-b41e-7d1f21dee4c6,e6283e46-fd81-3611-9459-0edb1c3da357,6e2f1a2d-27bd-3701-8d08-dae202c58632,ambulatory,185345009,Encounter for symptom,129.16,129.16,54.16,10509002.0,Acute bronchitis (disorder),25.0


In [16]:
encounters_agg = F.sql('''
    select id as encounter,
    start,stop,patient,provider,encounterclass,base_encounter_cost,payer_coverage,duration
                       from encounters
''')

In [17]:
encounters_agg.head(3)

Unnamed: 0,encounter,start,stop,patient,provider,encounterclass,base_encounter_cost,payer_coverage,duration
0,d0c40d10-8d87-447e-836e-99d26ad52ea5,2010-01-23 17:45:28,2010-01-23 18:10:28,034e9e3b-2def-4559-bb2a-7850888ae060,e6283e46-fd81-3611-9459-0edb1c3da357,ambulatory,129.16,54.16,25.0
1,e88bc3a9-007c-405e-aabc-792a38f4aa2b,2012-01-23 17:45:28,2012-01-23 18:00:28,034e9e3b-2def-4559-bb2a-7850888ae060,6f1d59a7-a5bd-3cf9-9671-5bad2f351c28,wellness,129.16,129.16,15.0
2,8f104aa7-4ca9-4473-885a-bba2437df588,2001-05-01 15:02:18,2001-05-01 15:17:18,1d604da9-9a81-4ba9-80c2-de3375d59b40,af01a385-31d3-3c77-8fdb-2867fe88df2f,ambulatory,129.16,0.0,15.0


***

In [18]:
F.select('careplans')

Unnamed: 0,Id,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,d2500b8c-e830-433a-8b9d-368d30741520,2010-01-23,2012-01-23,034e9e3b-2def-4559-bb2a-7850888ae060,d0c40d10-8d87-447e-836e-99d26ad52ea5,53950000,Respiratory therapy,10509002.0,Acute bronchitis (disorder)
1,07d9ddd8-dfa1-4e43-9bfe-39f63f4ace15,2011-05-13,2011-08-02,10339b10-3cd1-4ac3-ac13-ec26728cb592,e1ab4933-07a1-49f0-b4bd-05500919061d,53950000,Respiratory therapy,10509002.0,Acute bronchitis (disorder)
2,a3bb6e99-3b99-44b3-974c-e230b4511b5c,2011-12-31,2012-11-30,f5dcd418-09fe-4a2f-baa0-3da800bd8c3a,16300c56-a035-4126-a656-68c093da6dfc,53950000,Respiratory therapy,10509002.0,Acute bronchitis (disorder)
3,9f5284b7-425a-486a-b36e-ab818c018f2f,2016-12-29,2017-01-05,034e9e3b-2def-4559-bb2a-7850888ae060,3b639086-5fbc-4720-8c31-e8c8c0f1d660,53950000,Respiratory therapy,10509002.0,Acute bronchitis (disorder)
4,47ede16c-c216-4f81-a16b-0e858de9cdc3,2017-01-22,2017-02-12,10339b10-3cd1-4ac3-ac13-ec26728cb592,4ec8d55b-05fc-42a5-bfa3-1e233874a362,225358003,Wound care,284551006.0,Laceration of foot


In [19]:
query = '''
    select encounter,
    min(start) as careplan_start,
    max(stop) as careplan_end,
    case when max(stop) is null then 1 else 0 end as is_ongoing_careplan,
    count(distinct code) as total_careplans
    from careplans
    group by encounter
'''
careplan_agg = F.sql(query)

In [20]:
careplan_agg.head(3)

Unnamed: 0,encounter,careplan_start,careplan_end,is_ongoing_careplan,total_careplans
0,000fef3b-ba44-4b01-952e-7eeb8a5732a8,1976-07-30,1976-10-05,0,1
1,001d66d9-b1bc-425b-a3a7-3c24de8fcb16,2010-12-29,2011-08-03,0,1
2,001ed2aa-e772-4407-9902-38ef70bd1a1e,2011-10-06,2011-11-05,0,1


***

In [21]:
F.select('conditions')

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION
0,2001-05-01,,1d604da9-9a81-4ba9-80c2-de3375d59b40,8f104aa7-4ca9-4473-885a-bba2437df588,40055000,Chronic sinusitis (disorder)
1,2011-08-09,2011-08-16,8d4c4326-e9de-4f45-9a4c-f8c36bff89ae,9d35ec9f-352a-4629-92ef-38eae38437e7,444814009,Viral sinusitis (disorder)
2,2011-11-16,2011-11-26,8d4c4326-e9de-4f45-9a4c-f8c36bff89ae,ae7555a9-eaff-4c09-98a7-21bc6ed1b1fd,195662009,Acute viral pharyngitis (disorder)
3,2011-05-13,2011-05-27,10339b10-3cd1-4ac3-ac13-ec26728cb592,e1ab4933-07a1-49f0-b4bd-05500919061d,10509002,Acute bronchitis (disorder)
4,2011-02-06,2011-02-14,f5dcd418-09fe-4a2f-baa0-3da800bd8c3a,b8f76eba-7795-4dcd-a544-f27ac2ef3d46,195662009,Acute viral pharyngitis (disorder)


In [22]:
query = '''
    select encounter,
    min(start) as condition_start,
    max(stop) as condition_end,
    case when max(stop) is null then 1 else 0 end as is_ongoing_condition,
    count(distinct code) as total_conditions
    from conditions
    group by encounter
'''
condition_agg = F.sql(query)

In [23]:
condition_agg.head(3)

Unnamed: 0,encounter,condition_start,condition_end,is_ongoing_condition,total_conditions
0,000fef3b-ba44-4b01-952e-7eeb8a5732a8,1976-07-30,1976-10-05,0,1
1,001d66d9-b1bc-425b-a3a7-3c24de8fcb16,2010-12-29,2011-08-03,0,1
2,001ed2aa-e772-4407-9902-38ef70bd1a1e,2011-10-06,2011-11-05,0,1


***

In [24]:
F.select('medications').head(3)

Unnamed: 0,start,stop,PATIENT,PAYER,ENCOUNTER,CODE,DESCRIPTION,BASE_COST,PAYER_COVERAGE,DISPENSES,TOTALCOST,REASONCODE,REASONDESCRIPTION
0,2010-05-05 00:26:23,2011-04-30 00:26:23,8d4c4326-e9de-4f45-9a4c-f8c36bff89ae,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,1e0d6b0e-1711-4a25-99f9-b1c700c9b260,389221,Etonogestrel 68 MG Drug Implant,677.08,0.0,12,8124.96,,
1,2011-04-30 00:26:23,2012-04-24 00:26:23,8d4c4326-e9de-4f45-9a4c-f8c36bff89ae,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,6aa37300-d1b4-48e7-a2f8-5e0f70f48f38,389221,Etonogestrel 68 MG Drug Implant,624.09,0.0,12,7489.08,,
2,2012-04-24 00:26:23,2013-04-19 00:26:23,8d4c4326-e9de-4f45-9a4c-f8c36bff89ae,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,7253a9f9-6f6d-429a-926a-7b1d424eae3f,748856,Yaz 28 Day Pack,43.32,0.0,12,519.84,,


In [25]:
# # Reforming the date values
# query = """
# update medications
# set start = str_to_date(
#     trim(replace(replace(replace(start, '"', ''), '\r', ''), '\n', '')),
#     '%Y-%m-%dT%H:%i:%sZ'
# );
# """
# F.run(query)

# query = """
# ALTER TABLE medications
# MODIFY start datetime;
# """
# F.run(query)

In [26]:
# # Reforming the date values
# query = """
# update medications
# set stop = str_to_date(
#     trim(replace(replace(replace(stop, '"', ''), '\r', ''), '\n', '')),
#     '%Y-%m-%dT%H:%i:%sZ'
# );
# """
# F.run(query)

# query = """
# ALTER TABLE medications
# MODIFY stop datetime;
# """
# F.run(query)

In [27]:
query = '''
    select encounter,
    min(start) as medication_start,
    max(stop) as medication_end,
    count(distinct code) as total_medicines,
    round(avg(base_cost),2) as avg_base_medicine_cost,
    round(avg(totalcost),2) as avg_total_medicine_cost,
    count(distinct reasoncode) as reasons_for_medications 
    from medications
    group by encounter;
'''
medicine_agg = F.sql(query)

In [28]:
medicine_agg.head(3)

Unnamed: 0,encounter,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications
0,0000d0b7-937c-498a-9da4-32a5d29dee39,1994-09-07 04:48:12,1995-03-15 04:48:12,2,270.56,1623.33,2
1,0004e2e7-e3f2-4d25-b3eb-813710f2f9bc,2001-11-11 02:07:45,2002-11-17 02:07:45,1,263.49,3161.88,1
2,00052e41-7581-46e1-8c7f-d31bbb50ae72,2000-03-03 08:37:29,2000-05-05 08:37:29,2,433.29,866.59,1


***

In [29]:
F.select('procedures')

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,DESCRIPTION,BASE_COST,REASONCODE,REASONDESCRIPTION
0,2011-04-30T00:26:23Z,8d4c4326-e9de-4f45-9a4c-f8c36bff89ae,6aa37300-d1b4-48e7-a2f8-5e0f70f48f38,169553002,Insertion of subcutaneous contraceptive,14896.56,,
1,2010-07-27T12:58:08Z,10339b10-3cd1-4ac3-ac13-ec26728cb592,dae2b7cb-1316-4b78-954f-fa610a6c6d0e,430193006,Medication Reconciliation (procedure),726.51,,
2,2010-11-20T03:04:34Z,f5dcd418-09fe-4a2f-baa0-3da800bd8c3a,7ff86631-0378-4bfc-92ce-1edd697eb18e,430193006,Medication Reconciliation (procedure),788.5,,
3,2011-02-07T03:04:34Z,f5dcd418-09fe-4a2f-baa0-3da800bd8c3a,b8f76eba-7795-4dcd-a544-f27ac2ef3d46,117015009,Throat culture (procedure),2070.44,195662009.0,Acute viral pharyngitis (disorder)
4,2011-04-19T03:04:34Z,f5dcd418-09fe-4a2f-baa0-3da800bd8c3a,640837d9-845a-433c-9fad-47426664a69d,117015009,Throat culture (procedure),2479.39,195662009.0,Acute viral pharyngitis (disorder)


In [30]:
query = '''
    select encounter, 
    count(distinct code) as total_procedures,
    round(avg(base_cost),2) as avg_procedure_cost
    from procedures
    group by encounter
'''
procedure_agg = F.sql(query)

In [31]:
procedure_agg.head(3)

Unnamed: 0,encounter,total_procedures,avg_procedure_cost
0,000186d2-1316-4b58-be65-272233953fcb,5,3652.09
1,0002adbb-59c3-494a-bb17-0d1acc692ae8,1,12161.14
2,00055b87-0a03-4ca8-a69d-924bbb84033a,1,516.65


***

In [32]:
F.select('patients')

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,LAST,SUFFIX,...,BIRTHPLACE,ADDRESS,CITY,STATE,COUNTY,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE
0,1d604da9-9a81-4ba9-80c2-de3375d59b40,1989-05-25,,999-76-6866,S99984236,X19277260X,Mr.,José Eduardo181,Gómez206,,...,Marigot Saint Andrew Parish DM,427 Balistreri Way Unit 19,Chicopee,Massachusetts,Hampden County,1013.0,42.228354,-72.562951,271227.08,1334.88
1,034e9e3b-2def-4559-bb2a-7850888ae060,1983-11-14,,999-73-5361,S99962402,X88275464X,Mr.,Milo271,Feil794,,...,Danvers Massachusetts US,422 Farrell Path Unit 69,Somerville,Massachusetts,Middlesex County,2143.0,42.360697,-71.126531,793946.01,3204.49
2,10339b10-3cd1-4ac3-ac13-ec26728cb592,1992-06-02,,999-27-3385,S99972682,X73754411X,Mr.,Jayson808,Fadel536,,...,Springfield Massachusetts US,1056 Harris Lane Suite 70,Chicopee,Massachusetts,Hampden County,1020.0,42.181642,-72.608842,574111.9,2606.4
3,8d4c4326-e9de-4f45-9a4c-f8c36bff89ae,1978-05-27,,999-85-4926,S99974448,X40915583X,Mrs.,Mariana775,Rutherford999,,...,Yarmouth Massachusetts US,999 Kuhn Forge,Lowell,Massachusetts,Middlesex County,1851.0,42.636143,-71.343255,935630.3,8756.19
4,f5dcd418-09fe-4a2f-baa0-3da800bd8c3a,1996-10-18,,999-60-7372,S99915787,X86772962X,Mr.,Gregorio366,Auer97,,...,Patras Achaea GR,1050 Lindgren Extension Apt 38,Boston,Massachusetts,Suffolk County,2135.0,42.352434,-71.02861,598763.07,3772.2


In [33]:
query = '''
    select
    id as patient_id,
    case when 
    deathdate is null then timestampdiff(year,birthdate,"2021-12-31") 
    else timestampdiff(year,birthdate,deathdate)
    end as age_as_2022,
    marital,
    race,
    ethnicity,
    gender as patient_gender
    from patients;
'''
patient_agg = F.sql(query)

***

In [34]:
F.select('providers')

Unnamed: 0,Id,ORGANIZATION,NAME,GENDER,SPECIALITY,ADDRESS,CITY,STATE,ZIP,LAT,LON,UTILIZATION
0,3421aa75-dec7-378d-a9e0-0bc764e4cb0d,ef58ea08-d883-3957-8300-150554edc8fb,Tomas436 Sauer652,M,GENERAL PRACTICE,60 HOSPITAL ROAD,LEOMINSTER,MA,1453,42.520838,-71.770876,1557
1,c9b3c857-2e24-320c-a79a-87b8a60de63c,69176529-fd1f-3b3f-abce-a0a3626769eb,Suzette512 Monahan736,F,GENERAL PRACTICE,330 MOUNT AUBURN STREET,CAMBRIDGE,MA,2138,42.375967,-71.118275,2296
2,0359f968-d1a6-30eb-b1cc-e6cc0b4d3513,5e765f2b-e908-3888-9fc7-df2cb87beb58,Gaynell126 Streich926,F,GENERAL PRACTICE,211 PARK STREET,ATTLEBORO,MA,2703,41.931653,-71.294503,2287
3,446d1609-858f-3a54-8a52-0c4eacedd00e,f1fbcbfb-fcfa-3bd2-b7f4-df20f1b3c3a4,Patricia625 Salgado989,F,GENERAL PRACTICE,ONE GENERAL STREET,LAWRENCE,MA,1842,42.700273,-71.161357,1327
4,e6283e46-fd81-3611-9459-0edb1c3da357,e002090d-4e92-300e-b41e-7d1f21dee4c6,Jeanmarie510 Beatty507,F,GENERAL PRACTICE,1493 CAMBRIDGE STREET,CAMBRIDGE,MA,2138,42.375967,-71.118275,3199


In [35]:
query = '''
    select id as provider_id,
    gender as provider_gender,
    utilization 
    from providers;
'''
provider_agg = F.sql(query)

In [36]:
encounters_agg.sample(1)

Unnamed: 0,encounter,start,stop,patient,provider,encounterclass,base_encounter_cost,payer_coverage,duration
3475,022ad487-e41c-43ba-90f3-eb2d6711f4d3,1998-07-19 12:55:35,1998-07-19 13:38:35,e061409e-4b85-4ec1-b1f7-02677d51f763,0a8a9359-7b33-3256-a068-b5a7d18ebe4b,ambulatory,129.16,69.16,43.0


## **Viewing the aggregated tables**

In [37]:
agg_tables = [encounters_agg,patient_agg,careplan_agg,procedure_agg,condition_agg,medicine_agg,provider_agg]

In [38]:
for table in agg_tables:
    display(table.sample(3))

Unnamed: 0,encounter,start,stop,patient,provider,encounterclass,base_encounter_cost,payer_coverage,duration
2862,1eda4b69-550e-481c-b23a-60e9f2a9fd49,2012-10-22 17:24:07,2012-10-22 18:24:07,ed5cc81e-619f-471d-aee6-2934124c58e4,de75d491-eca3-343c-90a0-a32824647526,ambulatory,129.16,69.16,60.0
30274,9bfe0480-137b-4b8e-a014-d5d15ee177b5,1999-06-08 23:35:03,1999-06-09 02:50:03,cae10920-f977-48b4-a0d3-4d70ad561fd1,8f9aea5b-fd01-37c0-8931-18b6d64bdae6,ambulatory,129.16,89.16,195.0
45855,496c0fc5-782a-496a-a0a8-1ac36f6a7d1a,1974-11-26 09:30:42,1974-11-26 12:45:42,19d2cfb8-439b-454a-b47e-5274c219005b,793c18cd-8269-387d-b998-c135459e4248,ambulatory,129.16,0.0,195.0


Unnamed: 0,patient_id,age_as_2022,marital,race,ethnicity,patient_gender
1157,d9988dce-ec05-4138-8f04-b0c77473a260,4,,white,nonhispanic,M
149,4a52ea9c-d410-4b78-a4da-6053e2ed0787,47,M,black,nonhispanic,F
1076,c518f618-304a-4153-af34-b11caaa566bd,43,M,white,nonhispanic,F


Unnamed: 0,encounter,careplan_start,careplan_end,is_ongoing_careplan,total_careplans
2611,bf31f1f4-a0fd-4d3f-839d-9217af8ee02f,1949-06-15,,1,1
2964,d8e07866-af2a-4874-8540-370a54857f48,2013-07-11,2013-09-25,0,1
808,3cdc18b9-08bf-4c9f-8d40-2984cdd43a03,2019-03-31,2019-11-16,0,1


Unnamed: 0,encounter,total_procedures,avg_procedure_cost
15138,b79f3ece-e642-4099-bcf8-b2856e0be377,20,3487.84
8365,65033b9b-af05-4bf8-93a6-a81730629ca9,2,516.65
7360,58e94ba8-4560-4f4a-bad7-3e347e163ab2,1,516.65


Unnamed: 0,encounter,condition_start,condition_end,is_ongoing_condition,total_conditions
288,0a45453d-3b6a-4739-9b54-948045702fcd,2013-08-29,2013-09-12,0,1
5649,bb4e8bd2-5fea-4ae2-9e89-dd557730546f,2017-10-07,2017-10-21,0,1
5675,bc12ab28-03b1-40c3-aac7-3fe2db5ff94d,2019-05-03,2019-07-04,0,1


Unnamed: 0,encounter,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications
2489,1776f1bd-0974-4abf-b006-4c535baa622a,2015-10-05 13:57:11,2015-11-02 13:57:11,5,282.72,282.72,2
19117,b3044f22-bf1d-4f20-b919-e51da15ac3cf,1981-12-26 18:44:55,1983-01-01 18:44:55,4,134.01,1532.69,1
14980,8c58a192-5f7e-49c6-87b1-1a4525f51ee3,1963-11-16 22:09:56,1964-11-21 22:09:56,1,263.49,3161.88,1


Unnamed: 0,provider_id,provider_gender,utilization
4514,33739d2c-d387-33d6-b47e-8537235ed2ca,F,0
5656,b2a312b9-5e6d-30a2-9bb8-6bf80903c0b8,M,0
4716,cd798b8b-6341-317c-9b3b-580fac1ff542,F,0


In [39]:
encounters_agg.columns = encounters_agg.columns.str.lower()

In [40]:
encounters_agg.rename(columns={'patient':'patient_id','provider':'provider_id'},inplace=True)

## **Forming the aggregated dataset**

In [41]:
data = (encounters_agg.merge(procedure_agg,on='encounter',how='left')
    .merge(medicine_agg,on='encounter',how='left')
    .merge(condition_agg,on='encounter',how='left')
    .merge(careplan_agg,on='encounter',how='left')
    .merge(patient_agg,on='patient_id',how='left')
    .merge(provider_agg,on='provider_id',how='left'))

In [42]:
pd.set_option('display.max_columns',None)

In [43]:
data.head()

Unnamed: 0,encounter,start,stop,patient_id,provider_id,encounterclass,base_encounter_cost,payer_coverage,duration,total_procedures,avg_procedure_cost,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,condition_start,condition_end,is_ongoing_condition,total_conditions,careplan_start,careplan_end,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization
0,d0c40d10-8d87-447e-836e-99d26ad52ea5,2010-01-23 17:45:28,2010-01-23 18:10:28,034e9e3b-2def-4559-bb2a-7850888ae060,e6283e46-fd81-3611-9459-0edb1c3da357,ambulatory,129.16,54.16,25.0,,,NaT,NaT,,,,,,,,,2010-01-23,2012-01-23,0.0,1.0,38,M,white,nonhispanic,M,F,3199
1,e88bc3a9-007c-405e-aabc-792a38f4aa2b,2012-01-23 17:45:28,2012-01-23 18:00:28,034e9e3b-2def-4559-bb2a-7850888ae060,6f1d59a7-a5bd-3cf9-9671-5bad2f351c28,wellness,129.16,129.16,15.0,,,NaT,NaT,,,,,,,,,,,,,38,M,white,nonhispanic,M,F,33
2,8f104aa7-4ca9-4473-885a-bba2437df588,2001-05-01 15:02:18,2001-05-01 15:17:18,1d604da9-9a81-4ba9-80c2-de3375d59b40,af01a385-31d3-3c77-8fdb-2867fe88df2f,ambulatory,129.16,0.0,15.0,,,NaT,NaT,,,,,2001-05-01,,1.0,1.0,,,,,32,M,white,hispanic,M,M,2585
3,b85c339a-6076-43ed-b9d0-9cf013dec49d,2011-07-28 15:02:18,2011-07-28 15:17:18,1d604da9-9a81-4ba9-80c2-de3375d59b40,bb17e691-262b-3546-93d5-d88e7de93246,wellness,129.16,0.0,15.0,,,NaT,NaT,,,,,,,,,,,,,32,M,white,hispanic,M,F,58
4,dae2b7cb-1316-4b78-954f-fa610a6c6d0e,2010-07-27 12:58:08,2010-07-27 13:28:08,10339b10-3cd1-4ac3-ac13-ec26728cb592,7ed6b84a-b847-3744-9d42-15c42297a0c2,wellness,129.16,129.16,30.0,1.0,726.51,NaT,NaT,,,,,,,,,,,,,29,M,white,nonhispanic,M,M,59


## **Cleaning the aggregated data**

In [44]:
# 1. PREPARE DATA (NO COPY)
data['start'] = pd.to_datetime(data['start'])
data = data.sort_values(['patient_id', 'start'])

# 2. SELF-MERGE USING data
dup = data.merge(
    data,
    on='patient_id',
    how='inner',
    suffixes=('_first', '_second')
)

# 3. FILTER A → B WITHIN 30 DAYS
dup = dup[
    (dup['start_second'] > dup['start_first']) &
    (dup['start_second'] <= dup['start_first'] + pd.Timedelta(days=30))
]

# 4. ADD COLUMN DIRECTLY TO data
data['caused_readmission'] = 0
data.loc[data['encounter'].isin(dup['encounter_first']), 'caused_readmission'] = 1


In [45]:
data.sample(1)

Unnamed: 0,encounter,start,stop,patient_id,provider_id,encounterclass,base_encounter_cost,payer_coverage,duration,total_procedures,avg_procedure_cost,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,condition_start,condition_end,is_ongoing_condition,total_conditions,careplan_start,careplan_end,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission
32749,4e5a9349-de4a-4520-941b-07de7d5b34a2,2005-06-09 21:09:07,2005-06-09 21:24:07,e47e6347-73d4-4244-b92e-8a7cea33d434,aa89beb2-7bc6-35fa-83f7-4b32039e84eb,outpatient,129.16,0.0,15.0,,,2005-06-09 21:09:07,2005-06-23 21:09:07,2.0,15.54,15.54,0.0,2005-06-09,2005-07-22,0.0,1.0,,,,,1,,white,nonhispanic,M,M,1999,0


In [46]:
data.rename(columns={'start':'encounter_start','stop':'encounter_finish'},inplace=True)

In [47]:
data.sample(1)

Unnamed: 0,encounter,encounter_start,encounter_finish,patient_id,provider_id,encounterclass,base_encounter_cost,payer_coverage,duration,total_procedures,avg_procedure_cost,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,condition_start,condition_end,is_ongoing_condition,total_conditions,careplan_start,careplan_end,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission
15682,ef08f50c-1e4e-411b-800c-354882cca0b5,2020-04-18 09:55:43,2020-04-18 13:10:43,3f336702-bf73-4fc8-bd59-3ba77fd65d0d,8f9aea5b-fd01-37c0-8931-18b6d64bdae6,ambulatory,129.16,89.16,195.0,1.0,516.65,2020-04-18 09:55:43,2020-04-18 09:55:43,1.0,263.49,263.49,1.0,,,,,,,,,98,S,white,nonhispanic,M,F,4828,1


In [48]:
data.isnull().sum()[data.isnull().sum()>0].reset_index(name = 'null_counts')

Unnamed: 0,index,null_counts
0,total_procedures,32274
1,avg_procedure_cost,32274
2,medication_start,26077
3,medication_end,27309
4,total_medicines,26077
5,avg_base_medicine_cost,26077
6,avg_total_medicine_cost,26077
7,reasons_for_medications,26077
8,condition_start,45696
9,condition_end,48884


In [49]:
data.columns

Index(['encounter', 'encounter_start', 'encounter_finish', 'patient_id',
       'provider_id', 'encounterclass', 'base_encounter_cost',
       'payer_coverage', 'duration', 'total_procedures', 'avg_procedure_cost',
       'medication_start', 'medication_end', 'total_medicines',
       'avg_base_medicine_cost', 'avg_total_medicine_cost',
       'reasons_for_medications', 'condition_start', 'condition_end',
       'is_ongoing_condition', 'total_conditions', 'careplan_start',
       'careplan_end', 'is_ongoing_careplan', 'total_careplans', 'age_as_2022',
       'marital', 'race', 'ethnicity', 'patient_gender', 'provider_gender',
       'utilization', 'caused_readmission'],
      dtype='object')

In [50]:
data[['total_procedures', 'avg_procedure_cost', 'total_medicines',
       'avg_base_medicine_cost', 'avg_total_medicine_cost',
       'reasons_for_medications', 'total_conditions', 'total_careplans']] = data[['total_procedures', 'avg_procedure_cost', 'total_medicines',
       'avg_base_medicine_cost', 'avg_total_medicine_cost',
       'reasons_for_medications', 'total_conditions', 'total_careplans']].fillna(0)

In [51]:
data[['marital']] =data[['marital']].fillna('unknown')

In [52]:
data[['is_ongoing_condition', 'is_ongoing_careplan']] = data[['is_ongoing_condition', 'is_ongoing_careplan']].fillna(-1)

In [53]:
data.isnull().sum()[data.isnull().sum()>0].reset_index(name = 'null_counts')

Unnamed: 0,index,null_counts
0,medication_start,26077
1,medication_end,27309
2,condition_start,45696
3,condition_end,48884
4,careplan_start,49874
5,careplan_end,51395


In [54]:
data['marital']=np.where(data['marital'] =='M','MARRIED',np.where(data['marital']=='S','SINGLE','UNKNOWN'))

In [55]:
data['encounterclass']=data['encounterclass'].apply(lambda x: x.upper())

In [56]:
data.head()

Unnamed: 0,encounter,encounter_start,encounter_finish,patient_id,provider_id,encounterclass,base_encounter_cost,payer_coverage,duration,total_procedures,avg_procedure_cost,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,condition_start,condition_end,is_ongoing_condition,total_conditions,careplan_start,careplan_end,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission
34106,6b5bfe89-1c58-42e8-87c4-847b542d5f0b,2010-11-09 15:06:37,2010-11-09 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,WELLNESS,129.16,129.16,30.0,1.0,648.01,NaT,NaT,0.0,0.0,0.0,0.0,,,-1.0,0.0,,,-1.0,0.0,18,UNKNOWN,white,nonhispanic,M,M,25,0
34107,e0a65c0f-fa38-46aa-bd00-60f4473230e2,2011-11-15 15:06:37,2011-11-15 15:21:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,WELLNESS,129.16,129.16,15.0,0.0,0.0,NaT,NaT,0.0,0.0,0.0,0.0,,,-1.0,0.0,,,-1.0,0.0,18,UNKNOWN,white,nonhispanic,M,M,25,0
34108,187f0326-5342-4b78-8818-db5418f9300b,2012-08-03 15:06:37,2012-08-03 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,b08f34d7-3c08-35b9-a2ba-b823511ebc57,AMBULATORY,129.16,54.16,30.0,1.0,2230.4,NaT,NaT,0.0,0.0,0.0,0.0,2012-08-03,2012-08-11,0.0,1.0,,,-1.0,0.0,18,UNKNOWN,white,nonhispanic,M,M,1616,0
34109,ce150f69-e3a6-4793-95b6-243f754723c3,2012-10-14 15:06:37,2012-10-14 15:21:37,00185faa-2760-4218-9bf5-db301acf8274,b08f34d7-3c08-35b9-a2ba-b823511ebc57,AMBULATORY,129.16,54.16,15.0,0.0,0.0,2012-10-14 15:06:37,2012-10-25 15:06:37,1.0,20.48,20.48,1.0,2012-10-14,2012-10-25,0.0,1.0,,,-1.0,0.0,18,UNKNOWN,white,nonhispanic,M,M,1616,0
34110,e905ce81-d1da-46ac-a0fb-c16bce2c77b7,2012-11-20 15:06:37,2012-11-20 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,WELLNESS,129.16,129.16,30.0,1.0,590.58,NaT,NaT,0.0,0.0,0.0,0.0,,,-1.0,0.0,,,-1.0,0.0,18,UNKNOWN,white,nonhispanic,M,M,25,0


In [57]:
data['race']=data['race'].apply(lambda x: x.upper())

In [58]:
data['ethnicity']=data['ethnicity'].apply(lambda x: x.upper())

In [59]:
data.head()

Unnamed: 0,encounter,encounter_start,encounter_finish,patient_id,provider_id,encounterclass,base_encounter_cost,payer_coverage,duration,total_procedures,avg_procedure_cost,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,condition_start,condition_end,is_ongoing_condition,total_conditions,careplan_start,careplan_end,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission
34106,6b5bfe89-1c58-42e8-87c4-847b542d5f0b,2010-11-09 15:06:37,2010-11-09 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,WELLNESS,129.16,129.16,30.0,1.0,648.01,NaT,NaT,0.0,0.0,0.0,0.0,,,-1.0,0.0,,,-1.0,0.0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0
34107,e0a65c0f-fa38-46aa-bd00-60f4473230e2,2011-11-15 15:06:37,2011-11-15 15:21:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,WELLNESS,129.16,129.16,15.0,0.0,0.0,NaT,NaT,0.0,0.0,0.0,0.0,,,-1.0,0.0,,,-1.0,0.0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0
34108,187f0326-5342-4b78-8818-db5418f9300b,2012-08-03 15:06:37,2012-08-03 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,b08f34d7-3c08-35b9-a2ba-b823511ebc57,AMBULATORY,129.16,54.16,30.0,1.0,2230.4,NaT,NaT,0.0,0.0,0.0,0.0,2012-08-03,2012-08-11,0.0,1.0,,,-1.0,0.0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0
34109,ce150f69-e3a6-4793-95b6-243f754723c3,2012-10-14 15:06:37,2012-10-14 15:21:37,00185faa-2760-4218-9bf5-db301acf8274,b08f34d7-3c08-35b9-a2ba-b823511ebc57,AMBULATORY,129.16,54.16,15.0,0.0,0.0,2012-10-14 15:06:37,2012-10-25 15:06:37,1.0,20.48,20.48,1.0,2012-10-14,2012-10-25,0.0,1.0,,,-1.0,0.0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0
34110,e905ce81-d1da-46ac-a0fb-c16bce2c77b7,2012-11-20 15:06:37,2012-11-20 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,WELLNESS,129.16,129.16,30.0,1.0,590.58,NaT,NaT,0.0,0.0,0.0,0.0,,,-1.0,0.0,,,-1.0,0.0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0


In [60]:
data['condition_duration']=pd.to_datetime(data['condition_end']) - pd.to_datetime(data['condition_start'])

In [61]:
data['careplan_duration']=pd.to_datetime(data['careplan_end']) - pd.to_datetime(data['careplan_start'])

In [62]:
data.sample(5)

Unnamed: 0,encounter,encounter_start,encounter_finish,patient_id,provider_id,encounterclass,base_encounter_cost,payer_coverage,duration,total_procedures,avg_procedure_cost,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,condition_start,condition_end,is_ongoing_condition,total_conditions,careplan_start,careplan_end,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission,condition_duration,careplan_duration
42260,22e5b709-d95e-43ad-9bd6-cb97e9444bfd,2017-12-11 13:57:11,2017-12-11 14:12:11,3acf9313-1874-4dff-ab2a-3187516d92d6,98a31629-e9a3-304d-a821-ebb8d085c535,URGENTCARE,129.16,0.0,15.0,1.0,27610.5,NaT,NaT,0.0,0.0,0.0,0.0,,,-1.0,0.0,,,-1.0,0.0,100,MARRIED,WHITE,NONHISPANIC,M,F,742,1,NaT,NaT
29050,9d7d094f-101f-4c65-b15c-31b0ffc589ad,1997-06-08 19:05:21,1997-06-08 19:35:21,b48a38ae-3bb7-4e52-add7-bce581c87262,5ed457ac-0af4-3c3c-b230-a1b78319191b,WELLNESS,129.16,89.16,30.0,1.0,509.98,NaT,NaT,0.0,0.0,0.0,0.0,,,-1.0,0.0,,,-1.0,0.0,87,MARRIED,WHITE,NONHISPANIC,M,M,77,0,NaT,NaT
27491,348813de-b2dd-4bb4-a286-40b0c0c10f10,2020-01-24 11:29:41,2020-01-24 11:59:41,c8c1bcb3-f787-4d53-9e3f-8fc59740daa3,6456aace-c722-3675-9122-3c36e4488388,WELLNESS,129.16,129.16,30.0,1.0,401.49,NaT,NaT,0.0,0.0,0.0,0.0,,,-1.0,0.0,,,-1.0,0.0,4,UNKNOWN,ASIAN,NONHISPANIC,F,F,61,0,NaT,NaT
32224,f522f250-10aa-4578-ba25-149ed036fe1c,2019-04-02 15:32:23,2019-04-02 16:26:23,199946d9-69de-43e1-9ce6-d6b646b84955,56c64467-dd8a-36ca-91c5-4c729475aa21,AMBULATORY,129.16,59.16,54.0,1.0,516.65,2019-04-02 15:32:23,NaT,2.0,173.55,2256.15,0.0,,,-1.0,0.0,,,-1.0,0.0,4,UNKNOWN,WHITE,NONHISPANIC,F,F,1682,1,NaT,NaT
7933,942b73c2-52f6-463b-8496-f59c384e64a6,2014-07-08 11:02:40,2014-07-08 12:02:40,a27da6fe-7844-4205-a84f-6db17bfb8935,006bcb89-50e6-382f-8784-3cdf167ccbde,EMERGENCY,129.16,69.16,60.0,0.0,0.0,2014-07-08 11:02:40,2014-07-29 11:02:40,1.0,8.49,8.49,0.0,2014-07-08,2014-07-29,0.0,1.0,,,-1.0,0.0,62,MARRIED,ASIAN,NONHISPANIC,M,F,590,0,21 days,NaT


In [63]:
data.rename(columns={'duration':'encounter_duration'},inplace =True)

In [64]:
data[['total_procedures','total_medicines','reasons_for_medications','is_ongoing_condition','total_conditions','is_ongoing_careplan','total_careplans']] =data[['total_procedures','total_medicines','reasons_for_medications','is_ongoing_condition','total_conditions','is_ongoing_careplan','total_careplans']].astype('Int64')

In [65]:
data.head()

Unnamed: 0,encounter,encounter_start,encounter_finish,patient_id,provider_id,encounterclass,base_encounter_cost,payer_coverage,encounter_duration,total_procedures,avg_procedure_cost,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,condition_start,condition_end,is_ongoing_condition,total_conditions,careplan_start,careplan_end,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission,condition_duration,careplan_duration
34106,6b5bfe89-1c58-42e8-87c4-847b542d5f0b,2010-11-09 15:06:37,2010-11-09 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,WELLNESS,129.16,129.16,30.0,1,648.01,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,NaT,NaT
34107,e0a65c0f-fa38-46aa-bd00-60f4473230e2,2011-11-15 15:06:37,2011-11-15 15:21:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,WELLNESS,129.16,129.16,15.0,0,0.0,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,NaT,NaT
34108,187f0326-5342-4b78-8818-db5418f9300b,2012-08-03 15:06:37,2012-08-03 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,b08f34d7-3c08-35b9-a2ba-b823511ebc57,AMBULATORY,129.16,54.16,30.0,1,2230.4,NaT,NaT,0,0.0,0.0,0,2012-08-03,2012-08-11,0,1,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,8 days,NaT
34109,ce150f69-e3a6-4793-95b6-243f754723c3,2012-10-14 15:06:37,2012-10-14 15:21:37,00185faa-2760-4218-9bf5-db301acf8274,b08f34d7-3c08-35b9-a2ba-b823511ebc57,AMBULATORY,129.16,54.16,15.0,0,0.0,2012-10-14 15:06:37,2012-10-25 15:06:37,1,20.48,20.48,1,2012-10-14,2012-10-25,0,1,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,11 days,NaT
34110,e905ce81-d1da-46ac-a0fb-c16bce2c77b7,2012-11-20 15:06:37,2012-11-20 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,WELLNESS,129.16,129.16,30.0,1,590.58,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,NaT,NaT


In [66]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53346 entries, 34106 to 37567
Data columns (total 35 columns):
 #   Column                   Non-Null Count  Dtype          
---  ------                   --------------  -----          
 0   encounter                53346 non-null  object         
 1   encounter_start          53346 non-null  datetime64[ns] 
 2   encounter_finish         53346 non-null  datetime64[ns] 
 3   patient_id               53346 non-null  object         
 4   provider_id              53346 non-null  object         
 5   encounterclass           53346 non-null  object         
 6   base_encounter_cost      53346 non-null  float64        
 7   payer_coverage           53346 non-null  float64        
 8   encounter_duration       53346 non-null  float64        
 9   total_procedures         53346 non-null  Int64          
 10  avg_procedure_cost       53346 non-null  float64        
 11  medication_start         27269 non-null  datetime64[ns] 
 12  medication_end     

In [67]:
data['patient_id'].nunique()

1171

In [68]:
def add_readmission_gap(df):
    df = df.sort_values(by=['patient_id', 'encounter_start'])
    df['next_admission_date'] = df.groupby('patient_id')['encounter_start'].shift(-1)
    df['days_to_readmission'] = (df['next_admission_date'] - df['encounter_finish']).dt.days
    
    return df

In [69]:
data = add_readmission_gap(data)

In [70]:
data.head(3)

Unnamed: 0,encounter,encounter_start,encounter_finish,patient_id,provider_id,encounterclass,base_encounter_cost,payer_coverage,encounter_duration,total_procedures,avg_procedure_cost,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,condition_start,condition_end,is_ongoing_condition,total_conditions,careplan_start,careplan_end,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission,condition_duration,careplan_duration,next_admission_date,days_to_readmission
34106,6b5bfe89-1c58-42e8-87c4-847b542d5f0b,2010-11-09 15:06:37,2010-11-09 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,WELLNESS,129.16,129.16,30.0,1,648.01,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,NaT,NaT,2011-11-15 15:06:37,370.0
34107,e0a65c0f-fa38-46aa-bd00-60f4473230e2,2011-11-15 15:06:37,2011-11-15 15:21:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,WELLNESS,129.16,129.16,15.0,0,0.0,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,NaT,NaT,2012-08-03 15:06:37,261.0
34108,187f0326-5342-4b78-8818-db5418f9300b,2012-08-03 15:06:37,2012-08-03 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,b08f34d7-3c08-35b9-a2ba-b823511ebc57,AMBULATORY,129.16,54.16,30.0,1,2230.4,NaT,NaT,0,0.0,0.0,0,2012-08-03,2012-08-11,0,1,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,8 days,NaT,2012-10-14 15:06:37,71.0


In [71]:
def add_visit_number(df):
    data = df.copy()
    data = data.sort_values(by=['patient_id', 'encounter_start'])
    data['visit_number'] = data.groupby('patient_id').cumcount() + 1
    
    return data

In [72]:
data = add_visit_number(data)

In [73]:
def add_coverage_ratio(df):
    data = df.copy()
    
    # Avoid division by zero
    data['coverage_ratio'] = data.apply(
        lambda x: x['payer_coverage'] / x['base_encounter_cost'] 
        if x['base_encounter_cost'] > 0 else 0.0, 
        axis=1
    )
    
    data['coverage_ratio'] = data['coverage_ratio'].clip(upper=1.0)
    
    return data

In [74]:
data = add_coverage_ratio(data)

In [75]:
data.sample(5)

Unnamed: 0,encounter,encounter_start,encounter_finish,patient_id,provider_id,encounterclass,base_encounter_cost,payer_coverage,encounter_duration,total_procedures,avg_procedure_cost,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,condition_start,condition_end,is_ongoing_condition,total_conditions,careplan_start,careplan_end,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission,condition_duration,careplan_duration,next_admission_date,days_to_readmission,visit_number,coverage_ratio
7441,867f9383-aba6-4b78-b482-ddc098d03174,2016-02-04 08:16:44,2016-02-04 08:46:44,8aa4df95-ae91-4b6a-809e-85aa89e09b4c,0047cfcf-39f7-3b2a-a9c5-310f413231c9,WELLNESS,129.16,129.16,30.0,1,876.42,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,19,UNKNOWN,WHITE,NONHISPANIC,M,F,27,0,NaT,NaT,2017-02-09 08:16:44,370.0,9,1.0
15093,ec9ee4f4-007c-49c9-ba7d-1808388a8f1c,2016-07-23 09:55:43,2016-07-23 12:10:43,3f336702-bf73-4fc8-bd59-3ba77fd65d0d,8f9aea5b-fd01-37c0-8931-18b6d64bdae6,AMBULATORY,129.16,89.16,135.0,1,516.65,2016-07-23 09:55:43,2016-07-23 09:55:43,1,263.49,263.49,1,,,-1,0,,,-1,0,98,SINGLE,WHITE,NONHISPANIC,M,F,4828,1,NaT,NaT,2016-07-26 09:55:43,2.0,947,0.690307
15909,f0495d4c-9153-46b3-a743-3be2f46c0d14,2013-04-19 11:02:24,2013-04-19 11:17:24,0ba90366-87ad-4f9d-9bcd-c11e63eb6f27,263a09d5-9e62-3bc6-8510-329ae6f78e6d,WELLNESS,129.16,129.16,15.0,0,0.0,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,62,SINGLE,WHITE,NONHISPANIC,F,M,51,0,NaT,NaT,2014-04-25 11:02:24,370.0,17,1.0
13684,892c9538-9d1f-4a60-bc35-edd0e1e858f6,2011-10-26 15:17:08,2011-10-26 16:02:08,9ec4030b-2529-4647-b325-b04c6b44874b,af01a385-31d3-3c77-8fdb-2867fe88df2f,AMBULATORY,129.16,59.16,45.0,2,516.65,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,52,SINGLE,WHITE,NONHISPANIC,F,M,2585,0,NaT,NaT,2012-04-18 15:17:08,174.0,9,0.458037
35555,2acc0c7e-02b4-4f7c-8b05-117edbc29839,2014-10-18 07:23:24,2014-10-18 07:53:24,7e3becd0-0053-4d2e-b57e-075b874c32a6,58b66cc1-2b86-377f-ad77-ad8164388e50,AMBULATORY,129.16,69.16,30.0,1,1686.55,NaT,NaT,0,0.0,0.0,0,2014-10-18,2014-10-28,0,1,,,-1,0,22,UNKNOWN,WHITE,NONHISPANIC,F,F,1811,0,10 days,NaT,2015-01-20 07:23:24,93.0,9,0.53546


In [76]:
data['medication_duration'] = (data['medication_end'] - data['medication_start']).dt.days

In [77]:
data.drop(columns=['encounter','encounter_start','encounter_finish','patient_id','provider_id','base_encounter_cost','payer_coverage','next_admission_date'],inplace=True)

In [78]:
data.head()

Unnamed: 0,encounterclass,encounter_duration,total_procedures,avg_procedure_cost,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,condition_start,condition_end,is_ongoing_condition,total_conditions,careplan_start,careplan_end,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission,condition_duration,careplan_duration,days_to_readmission,visit_number,coverage_ratio,medication_duration
34106,WELLNESS,30.0,1,648.01,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,NaT,NaT,370.0,1,1.0,
34107,WELLNESS,15.0,0,0.0,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,NaT,NaT,261.0,2,1.0,
34108,AMBULATORY,30.0,1,2230.4,NaT,NaT,0,0.0,0.0,0,2012-08-03,2012-08-11,0,1,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,8 days,NaT,71.0,3,0.419325,
34109,AMBULATORY,15.0,0,0.0,2012-10-14 15:06:37,2012-10-25 15:06:37,1,20.48,20.48,1,2012-10-14,2012-10-25,0,1,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,11 days,NaT,36.0,4,0.419325,11.0
34110,WELLNESS,30.0,1,590.58,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,NaT,NaT,370.0,5,1.0,


In [79]:
def clean_timedelta_columns(df):
    data = df.copy()
    
    duration_cols = ['condition_duration', 'careplan_duration', 'medication_duration']
    
    for col in duration_cols:
        if col in data.columns:
            data[col] = pd.to_timedelta(data[col], errors='coerce')
            data[col] = data[col].dt.days
            
    print("✅ Converted durations to numeric days. NaT replaced with NaN.")
    return data

In [80]:
data = clean_timedelta_columns(data)

✅ Converted durations to numeric days. NaT replaced with NaN.


In [81]:
data.head()

Unnamed: 0,encounterclass,encounter_duration,total_procedures,avg_procedure_cost,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,condition_start,condition_end,is_ongoing_condition,total_conditions,careplan_start,careplan_end,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission,condition_duration,careplan_duration,days_to_readmission,visit_number,coverage_ratio,medication_duration
34106,WELLNESS,30.0,1,648.01,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,,,370.0,1,1.0,
34107,WELLNESS,15.0,0,0.0,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,,,261.0,2,1.0,
34108,AMBULATORY,30.0,1,2230.4,NaT,NaT,0,0.0,0.0,0,2012-08-03,2012-08-11,0,1,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,8.0,,71.0,3,0.419325,
34109,AMBULATORY,15.0,0,0.0,2012-10-14 15:06:37,2012-10-25 15:06:37,1,20.48,20.48,1,2012-10-14,2012-10-25,0,1,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,11.0,,36.0,4,0.419325,0.0
34110,WELLNESS,30.0,1,590.58,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,,,370.0,5,1.0,


In [82]:
def add_smart_duration(df):
    data = df.copy()
    
    cols = [('medication_start', 'medication_end', 'medication_duration'),
            ('condition_start', 'condition_end', 'condition_duration'),
            ('careplan_start', 'careplan_end', 'careplan_duration')]
            
    for start, end, duration_col in cols:
        if start in data.columns and end in data.columns:
            # Convert to datetime
            s = pd.to_datetime(data[start])
            e = pd.to_datetime(data[end])
            
            # 2. Calculate Days AND Add 1 (The "Inclusive" Rule)
            # This turns "Same Day" (0) into 1
            data[duration_col] = (e - s).dt.days + 1
            
            # 3. NOW fill the NaNs with 0
            # This ensures Healthy people stay 0, while "Same Day" people are 1
            data[duration_col] = data[duration_col].fillna(0)
            
    return data

In [83]:
data = add_smart_duration(data)

In [84]:
data.head()

Unnamed: 0,encounterclass,encounter_duration,total_procedures,avg_procedure_cost,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,condition_start,condition_end,is_ongoing_condition,total_conditions,careplan_start,careplan_end,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission,condition_duration,careplan_duration,days_to_readmission,visit_number,coverage_ratio,medication_duration
34106,WELLNESS,30.0,1,648.01,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,0.0,0.0,370.0,1,1.0,0.0
34107,WELLNESS,15.0,0,0.0,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,0.0,0.0,261.0,2,1.0,0.0
34108,AMBULATORY,30.0,1,2230.4,NaT,NaT,0,0.0,0.0,0,2012-08-03,2012-08-11,0,1,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,9.0,0.0,71.0,3,0.419325,0.0
34109,AMBULATORY,15.0,0,0.0,2012-10-14 15:06:37,2012-10-25 15:06:37,1,20.48,20.48,1,2012-10-14,2012-10-25,0,1,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,12.0,0.0,36.0,4,0.419325,12.0
34110,WELLNESS,30.0,1,590.58,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,0.0,0.0,370.0,5,1.0,0.0


In [85]:
data.drop(columns = ['medication_start','medication_end','condition_start','condition_end','careplan_start','careplan_end'],inplace = True)

In [86]:
data.head()

Unnamed: 0,encounterclass,encounter_duration,total_procedures,avg_procedure_cost,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,is_ongoing_condition,total_conditions,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission,condition_duration,careplan_duration,days_to_readmission,visit_number,coverage_ratio,medication_duration
34106,WELLNESS,30.0,1,648.01,0,0.0,0.0,0,-1,0,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,0.0,0.0,370.0,1,1.0,0.0
34107,WELLNESS,15.0,0,0.0,0,0.0,0.0,0,-1,0,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,0.0,0.0,261.0,2,1.0,0.0
34108,AMBULATORY,30.0,1,2230.4,0,0.0,0.0,0,0,1,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,9.0,0.0,71.0,3,0.419325,0.0
34109,AMBULATORY,15.0,0,0.0,1,20.48,20.48,1,0,1,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,12.0,0.0,36.0,4,0.419325,12.0
34110,WELLNESS,30.0,1,590.58,0,0.0,0.0,0,-1,0,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,0.0,0.0,370.0,5,1.0,0.0


In [87]:
def add_procedures_per_hour(df):
    data = df.copy()
    
    raw_hours = data['encounter_duration'] / 60
    effective_hours = raw_hours.clip(lower=1.0)
    
    data['procedures_per_hour'] = data['total_procedures'] / effective_hours
    
    return data

In [88]:
data = add_procedures_per_hour(data)

In [89]:
data.sample(5)

Unnamed: 0,encounterclass,encounter_duration,total_procedures,avg_procedure_cost,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,is_ongoing_condition,total_conditions,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission,condition_duration,careplan_duration,days_to_readmission,visit_number,coverage_ratio,medication_duration,procedures_per_hour
5175,URGENTCARE,15.0,0,0.0,0,0.0,0.0,0,-1,0,-1,0,70,SINGLE,WHITE,NONHISPANIC,M,F,420,1,0.0,0.0,27.0,299,0.0,0.0,0.0
46756,AMBULATORY,15.0,0,0.0,0,0.0,0.0,0,0,1,-1,0,59,MARRIED,WHITE,NONHISPANIC,M,M,4466,0,15.0,0.0,399.0,6,0.0,0.0,0.0
48279,INPATIENT,1620.0,1,22661.46,2,358.52,358.52,1,-1,0,-1,0,84,MARRIED,WHITE,NONHISPANIC,M,M,310,1,0.0,0.0,23.0,81,0.690307,1.0,0.037037
4061,WELLNESS,15.0,0,0.0,0,0.0,0.0,0,-1,0,-1,0,50,MARRIED,BLACK,NONHISPANIC,F,M,44,0,0.0,0.0,-1.0,16,1.0,0.0,0.0
486,AMBULATORY,50.0,1,13035.43,0,0.0,0.0,0,-1,0,-1,0,62,MARRIED,WHITE,NONHISPANIC,F,F,2159,0,0.0,0.0,267.0,65,0.419325,0.0,1.0


In [90]:
data[['condition_duration','careplan_duration','days_to_readmission','medication_duration']] = data[['condition_duration','careplan_duration','days_to_readmission','medication_duration']].astype('Int64')

In [91]:
data.head()

Unnamed: 0,encounterclass,encounter_duration,total_procedures,avg_procedure_cost,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,is_ongoing_condition,total_conditions,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission,condition_duration,careplan_duration,days_to_readmission,visit_number,coverage_ratio,medication_duration,procedures_per_hour
34106,WELLNESS,30.0,1,648.01,0,0.0,0.0,0,-1,0,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,0,0,370,1,1.0,0,1.0
34107,WELLNESS,15.0,0,0.0,0,0.0,0.0,0,-1,0,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,0,0,261,2,1.0,0,0.0
34108,AMBULATORY,30.0,1,2230.4,0,0.0,0.0,0,0,1,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,9,0,71,3,0.419325,0,1.0
34109,AMBULATORY,15.0,0,0.0,1,20.48,20.48,1,0,1,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,12,0,36,4,0.419325,12,0.0
34110,WELLNESS,30.0,1,590.58,0,0.0,0.0,0,-1,0,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,0,0,370,5,1.0,0,1.0


In [92]:
data['coverage_ratio']= data['coverage_ratio'].round(2)

In [93]:
data.head()

Unnamed: 0,encounterclass,encounter_duration,total_procedures,avg_procedure_cost,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,is_ongoing_condition,total_conditions,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission,condition_duration,careplan_duration,days_to_readmission,visit_number,coverage_ratio,medication_duration,procedures_per_hour
34106,WELLNESS,30.0,1,648.01,0,0.0,0.0,0,-1,0,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,0,0,370,1,1.0,0,1.0
34107,WELLNESS,15.0,0,0.0,0,0.0,0.0,0,-1,0,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,0,0,261,2,1.0,0,0.0
34108,AMBULATORY,30.0,1,2230.4,0,0.0,0.0,0,0,1,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,9,0,71,3,0.42,0,1.0
34109,AMBULATORY,15.0,0,0.0,1,20.48,20.48,1,0,1,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,12,0,36,4,0.42,12,0.0
34110,WELLNESS,30.0,1,590.58,0,0.0,0.0,0,-1,0,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,0,0,370,5,1.0,0,1.0


In [94]:
data.isnull().sum()[data.isnull().sum()>0].reset_index(name = 'null_counts')

Unnamed: 0,index,null_counts
0,days_to_readmission,1171


In [95]:
data['days_to_readmission']=data['days_to_readmission'].fillna(0)

In [96]:
data['is_ongoing_condition']=np.where(data['is_ongoing_condition']== -1,'NO CONDITION',np.where(data['is_ongoing_condition']==0,'NO','YES'))

In [97]:
data['is_ongoing_careplan']=np.where(data['is_ongoing_careplan']== -1,'NO CAREPLAN',np.where(data['is_ongoing_careplan']==0,'NO','YES'))

In [98]:
data['caused_readmission']=np.where(data['caused_readmission'] == 1,'YES','NO')

In [99]:
data['procedures_per_hour']= data['procedures_per_hour'].round(2)

In [102]:
data.drop(columns=['days_to_readmission'],inplace=True)

## **Exporting the data**

In [103]:
file_path = data_path / "data" / "processed_data" / "processed_synthea_data.csv"

In [104]:
data.to_csv(file_path,index=False)