# ***NOTEBOOK FOR PREPARING THE DATA***

In [4]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
from pathlib import Path
import warnings

warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
def get_project_root(project_name="portfolio_projects"):
    """Locate the directory named ``project_name``.

    The current working directory and its ancestors are checked first (the
    nearest match wins). If none of them carries that name, the tree below the
    working directory is searched recursively.

    Args:
        project_name: Exact directory name to look for.

    Returns:
        pathlib.Path: The matching directory.

    Raises:
        FileNotFoundError: If no directory with that name is found.
    """
    current_path = Path.cwd()
    # Compare whole path components instead of substring-matching the path
    # text: a directory such as "<project_name>_old" used to pass the substring
    # test and then climb forever, because the filesystem root is its own
    # parent and its name never equals project_name.
    for candidate in (current_path, *current_path.parents):
        if candidate.name == project_name:
            return candidate
    for path in current_path.rglob(project_name):
        if path.is_dir():
            return path
    raise FileNotFoundError(f"Could not find project: {project_name}")

# Locate the repository root, then the sub-folder that holds this analysis.
repo_root = get_project_root()
data_path = repo_root / "about_synthea_patient_readmission"

# Put the analysis folder on sys.path so its `src` package can be imported below.
sys.path.append(str(data_path))

import src.functions as F

In [6]:
engine = F.get_engine()

AttributeError: module 'src.functions' has no attribute 'get_engine'

In [None]:
engine

Engine(mysql+pymysql://root:***@localhost:3306/synthea_medical_dataset)

# PREPARATION OF AGG DATA

In [None]:
necessary_tables = ['encounters','careplans','conditions','medications','procedures','patients','providers']

In [None]:
F.select('encounters').head(1)

Unnamed: 0,Id,start,stop,PATIENT,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION,duration
0,d0c40d10-8d87-447e-836e-99d26ad52ea5,2010-01-23 17:45:28,2010-01-23 18:10:28,034e9e3b-2def-4559-bb2a-7850888ae060,e002090d-4e92-300e-b41e-7d1f21dee4c6,e6283e46-fd81-3611-9459-0edb1c3da357,6e2f1a2d-27bd-3701-8d08-dae202c58632,ambulatory,185345009,Encounter for symptom,129.16,129.16,54.16,10509002.0,Acute bronchitis (disorder),25.0


In [None]:
F.find_nulls('encounters')

Unnamed: 0,column_names,null_values
11,REASONCODE,39569
12,REASONDESCRIPTION,39569


In [None]:
F.sql('select count(distinct code) as total_code, count(distinct description) as total_description from encounters')

Unnamed: 0,total_code,total_description
0,43,49


In [None]:
F.sql('select count(distinct reasoncode) as total_code, count(distinct reasondescription) as total_description from encounters')

Unnamed: 0,total_code,total_description
0,80,80


In [None]:
F.select('encounters').head(1)

Unnamed: 0,Id,start,stop,PATIENT,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION,duration
0,d0c40d10-8d87-447e-836e-99d26ad52ea5,2010-01-23 17:45:28,2010-01-23 18:10:28,034e9e3b-2def-4559-bb2a-7850888ae060,e002090d-4e92-300e-b41e-7d1f21dee4c6,e6283e46-fd81-3611-9459-0edb1c3da357,6e2f1a2d-27bd-3701-8d08-dae202c58632,ambulatory,185345009,Encounter for symptom,129.16,129.16,54.16,10509002.0,Acute bronchitis (disorder),25.0


In [None]:
# Encounter-level base table: timing, class, cost and the patient/provider keys
# that the later joins use. `id` is exposed as `encounter` so it lines up with
# the `encounter` column of the other tables.
# NOTE(review): presumably one row per encounter (id looks like the key) — confirm.
encounters_agg = F.sql('''
    select id as encounter,
    start,stop,patient,provider,encounterclass,base_encounter_cost,payer_coverage,duration
                       from encounters
''')

In [None]:
encounters_agg.head()

Unnamed: 0,encounter,start,stop,patient,provider,encounterclass,base_encounter_cost,payer_coverage,duration
0,d0c40d10-8d87-447e-836e-99d26ad52ea5,2010-01-23 17:45:28,2010-01-23 18:10:28,034e9e3b-2def-4559-bb2a-7850888ae060,e6283e46-fd81-3611-9459-0edb1c3da357,ambulatory,129.16,54.16,25.0
1,e88bc3a9-007c-405e-aabc-792a38f4aa2b,2012-01-23 17:45:28,2012-01-23 18:00:28,034e9e3b-2def-4559-bb2a-7850888ae060,6f1d59a7-a5bd-3cf9-9671-5bad2f351c28,wellness,129.16,129.16,15.0
2,8f104aa7-4ca9-4473-885a-bba2437df588,2001-05-01 15:02:18,2001-05-01 15:17:18,1d604da9-9a81-4ba9-80c2-de3375d59b40,af01a385-31d3-3c77-8fdb-2867fe88df2f,ambulatory,129.16,0.0,15.0
3,b85c339a-6076-43ed-b9d0-9cf013dec49d,2011-07-28 15:02:18,2011-07-28 15:17:18,1d604da9-9a81-4ba9-80c2-de3375d59b40,bb17e691-262b-3546-93d5-d88e7de93246,wellness,129.16,0.0,15.0
4,dae2b7cb-1316-4b78-954f-fa610a6c6d0e,2010-07-27 12:58:08,2010-07-27 13:28:08,10339b10-3cd1-4ac3-ac13-ec26728cb592,7ed6b84a-b847-3744-9d42-15c42297a0c2,wellness,129.16,129.16,30.0


In [None]:
F.select('careplans')

Unnamed: 0,Id,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,d2500b8c-e830-433a-8b9d-368d30741520,2010-01-23,2012-01-23,034e9e3b-2def-4559-bb2a-7850888ae060,d0c40d10-8d87-447e-836e-99d26ad52ea5,53950000,Respiratory therapy,10509002.0,Acute bronchitis (disorder)
1,07d9ddd8-dfa1-4e43-9bfe-39f63f4ace15,2011-05-13,2011-08-02,10339b10-3cd1-4ac3-ac13-ec26728cb592,e1ab4933-07a1-49f0-b4bd-05500919061d,53950000,Respiratory therapy,10509002.0,Acute bronchitis (disorder)
2,a3bb6e99-3b99-44b3-974c-e230b4511b5c,2011-12-31,2012-11-30,f5dcd418-09fe-4a2f-baa0-3da800bd8c3a,16300c56-a035-4126-a656-68c093da6dfc,53950000,Respiratory therapy,10509002.0,Acute bronchitis (disorder)
3,9f5284b7-425a-486a-b36e-ab818c018f2f,2016-12-29,2017-01-05,034e9e3b-2def-4559-bb2a-7850888ae060,3b639086-5fbc-4720-8c31-e8c8c0f1d660,53950000,Respiratory therapy,10509002.0,Acute bronchitis (disorder)
4,47ede16c-c216-4f81-a16b-0e858de9cdc3,2017-01-22,2017-02-12,10339b10-3cd1-4ac3-ac13-ec26728cb592,4ec8d55b-05fc-42a5-bfa3-1e233874a362,225358003,Wound care,284551006.0,Laceration of foot


In [None]:
# Roll careplans up to one row per encounter.
# A NULL `stop` marks a careplan that is still open. MAX() skips NULLs, so the
# previous test `max(stop) is null` only fired when *every* careplan of the
# encounter was open. The flag is now 1 as soon as at least one careplan of the
# encounter has no stop date.
# careplan_end remains the latest recorded stop among the closed careplans.
query = '''
    select encounter,
    min(start) as careplan_start,
    max(stop) as careplan_end,
    max(case when stop is null then 1 else 0 end) as is_ongoing_careplan,
    count(distinct code) as total_careplans
    from careplans
    group by encounter
'''
careplan_agg = F.sql(query)

In [None]:
careplan_agg.head()

Unnamed: 0,encounter,careplan_start,careplan_end,is_ongoing_careplan,total_careplans
0,000fef3b-ba44-4b01-952e-7eeb8a5732a8,1976-07-30,1976-10-05,0,1
1,001d66d9-b1bc-425b-a3a7-3c24de8fcb16,2010-12-29,2011-08-03,0,1
2,001ed2aa-e772-4407-9902-38ef70bd1a1e,2011-10-06,2011-11-05,0,1
3,005ba3e5-d0f6-4bed-a1e7-a309cac30be2,2001-07-18,,1,1
4,006f8752-ea4b-4cd3-bda5-9ae9aede145d,2001-08-10,2001-08-17,0,1


In [None]:
F.select('conditions')

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION
0,2001-05-01,,1d604da9-9a81-4ba9-80c2-de3375d59b40,8f104aa7-4ca9-4473-885a-bba2437df588,40055000,Chronic sinusitis (disorder)
1,2011-08-09,2011-08-16,8d4c4326-e9de-4f45-9a4c-f8c36bff89ae,9d35ec9f-352a-4629-92ef-38eae38437e7,444814009,Viral sinusitis (disorder)
2,2011-11-16,2011-11-26,8d4c4326-e9de-4f45-9a4c-f8c36bff89ae,ae7555a9-eaff-4c09-98a7-21bc6ed1b1fd,195662009,Acute viral pharyngitis (disorder)
3,2011-05-13,2011-05-27,10339b10-3cd1-4ac3-ac13-ec26728cb592,e1ab4933-07a1-49f0-b4bd-05500919061d,10509002,Acute bronchitis (disorder)
4,2011-02-06,2011-02-14,f5dcd418-09fe-4a2f-baa0-3da800bd8c3a,b8f76eba-7795-4dcd-a544-f27ac2ef3d46,195662009,Acute viral pharyngitis (disorder)


In [None]:
# Roll conditions up to one row per encounter.
# A NULL `stop` marks a condition that has not resolved. MAX() skips NULLs, so
# the previous test `max(stop) is null` only fired when *every* condition of the
# encounter was unresolved. The flag is now 1 as soon as at least one condition
# of the encounter has no stop date.
# condition_end remains the latest recorded stop among the resolved conditions.
query = '''
    select encounter,
    min(start) as condition_start,
    max(stop) as condition_end,
    max(case when stop is null then 1 else 0 end) as is_ongoing_condition,
    count(distinct code) as total_conditions
    from conditions
    group by encounter
'''
condition_agg = F.sql(query)

In [None]:
condition_agg.head()

Unnamed: 0,encounter,condition_start,condition_end,is_ongoing_condition,total_conditions
0,000fef3b-ba44-4b01-952e-7eeb8a5732a8,1976-07-30,1976-10-05,0,1
1,001d66d9-b1bc-425b-a3a7-3c24de8fcb16,2010-12-29,2011-08-03,0,1
2,001ed2aa-e772-4407-9902-38ef70bd1a1e,2011-10-06,2011-11-05,0,1
3,002ad252-491e-414f-b5a1-6aa92adc6ced,2019-03-03,2019-03-12,0,1
4,003126a6-95e0-4611-9d89-8bd1d00a19d7,2013-03-31,2013-04-07,0,1


In [None]:
F.select('medications').head(3)

Unnamed: 0,start,stop,PATIENT,PAYER,ENCOUNTER,CODE,DESCRIPTION,BASE_COST,PAYER_COVERAGE,DISPENSES,TOTALCOST,REASONCODE,REASONDESCRIPTION
0,2010-05-05 00:26:23,2011-04-30 00:26:23,8d4c4326-e9de-4f45-9a4c-f8c36bff89ae,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,1e0d6b0e-1711-4a25-99f9-b1c700c9b260,389221,Etonogestrel 68 MG Drug Implant,677.08,0.0,12,8124.96,,
1,2011-04-30 00:26:23,2012-04-24 00:26:23,8d4c4326-e9de-4f45-9a4c-f8c36bff89ae,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,6aa37300-d1b4-48e7-a2f8-5e0f70f48f38,389221,Etonogestrel 68 MG Drug Implant,624.09,0.0,12,7489.08,,
2,2012-04-24 00:26:23,2013-04-19 00:26:23,8d4c4326-e9de-4f45-9a4c-f8c36bff89ae,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,7253a9f9-6f6d-429a-926a-7b1d424eae3f,748856,Yaz 28 Day Pack,43.32,0.0,12,519.84,,


In [None]:
# Roll medications up to one row per encounter: earliest start, latest stop,
# number of distinct drug codes, and the mean base/total cost rounded to 2 dp.
# count(distinct reasoncode) skips NULLs, so 0 means no reason code was recorded
# on any medication of the encounter.
# NOTE(review): unlike careplans/conditions there is no "ongoing" flag here, and
# max(stop) ignores medications without a stop date — confirm this is intended.
query = '''
    select encounter,
    min(start) as medication_start,
    max(stop) as medication_end,
    count(distinct code) as total_medicines,
    round(avg(base_cost),2) as avg_base_medicine_cost,
    round(avg(totalcost),2) as avg_total_medicine_cost,
    count(distinct reasoncode) as reasons_for_medications 
    from medications
    group by encounter;
'''
medicine_agg = F.sql(query)

In [None]:
medicine_agg.head()

Unnamed: 0,encounter,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications
0,0000d0b7-937c-498a-9da4-32a5d29dee39,1994-09-07 04:48:12,1995-03-15 04:48:12,2,270.56,1623.33,2
1,0004e2e7-e3f2-4d25-b3eb-813710f2f9bc,2001-11-11 02:07:45,2002-11-17 02:07:45,1,263.49,3161.88,1
2,00052e41-7581-46e1-8c7f-d31bbb50ae72,2000-03-03 08:37:29,2000-05-05 08:37:29,2,433.29,866.59,1
3,00055b87-0a03-4ca8-a69d-924bbb84033a,1976-09-07 09:30:42,1976-09-07 09:30:42,1,263.49,263.49,1
4,0005b0a0-1b05-40ec-a741-11c116654840,2018-02-20 01:11:43,2019-02-15 01:11:43,1,399.22,4790.64,0


In [None]:
F.select('procedures')

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,DESCRIPTION,BASE_COST,REASONCODE,REASONDESCRIPTION
0,2011-04-30T00:26:23Z,8d4c4326-e9de-4f45-9a4c-f8c36bff89ae,6aa37300-d1b4-48e7-a2f8-5e0f70f48f38,169553002,Insertion of subcutaneous contraceptive,14896.56,,
1,2010-07-27T12:58:08Z,10339b10-3cd1-4ac3-ac13-ec26728cb592,dae2b7cb-1316-4b78-954f-fa610a6c6d0e,430193006,Medication Reconciliation (procedure),726.51,,
2,2010-11-20T03:04:34Z,f5dcd418-09fe-4a2f-baa0-3da800bd8c3a,7ff86631-0378-4bfc-92ce-1edd697eb18e,430193006,Medication Reconciliation (procedure),788.5,,
3,2011-02-07T03:04:34Z,f5dcd418-09fe-4a2f-baa0-3da800bd8c3a,b8f76eba-7795-4dcd-a544-f27ac2ef3d46,117015009,Throat culture (procedure),2070.44,195662009.0,Acute viral pharyngitis (disorder)
4,2011-04-19T03:04:34Z,f5dcd418-09fe-4a2f-baa0-3da800bd8c3a,640837d9-845a-433c-9fad-47426664a69d,117015009,Throat culture (procedure),2479.39,195662009.0,Acute viral pharyngitis (disorder)


In [None]:
# Roll procedures up to one row per encounter: number of distinct procedure
# codes and the mean base cost rounded to 2 dp.
# Repeats of the same code within an encounter count once in total_procedures
# but each row still contributes to the average cost.
query = '''
    select encounter, 
    count(distinct code) as total_procedures,
    round(avg(base_cost),2) as avg_procedure_cost
    from procedures
    group by encounter
'''
procedure_agg = F.sql(query)

In [None]:
procedure_agg.head()

Unnamed: 0,encounter,total_procedures,avg_procedure_cost
0,000186d2-1316-4b58-be65-272233953fcb,5,3652.09
1,0002adbb-59c3-494a-bb17-0d1acc692ae8,1,12161.14
2,00055b87-0a03-4ca8-a69d-924bbb84033a,1,516.65
3,0005b0a0-1b05-40ec-a741-11c116654840,1,4145.59
4,000c97b3-5832-4ecc-8a47-76c0197fffe4,1,575.24


In [None]:
F.select('patients')

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,LAST,SUFFIX,...,BIRTHPLACE,ADDRESS,CITY,STATE,COUNTY,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE
0,1d604da9-9a81-4ba9-80c2-de3375d59b40,1989-05-25,,999-76-6866,S99984236,X19277260X,Mr.,José Eduardo181,Gómez206,,...,Marigot Saint Andrew Parish DM,427 Balistreri Way Unit 19,Chicopee,Massachusetts,Hampden County,1013.0,42.228354,-72.562951,271227.08,1334.88
1,034e9e3b-2def-4559-bb2a-7850888ae060,1983-11-14,,999-73-5361,S99962402,X88275464X,Mr.,Milo271,Feil794,,...,Danvers Massachusetts US,422 Farrell Path Unit 69,Somerville,Massachusetts,Middlesex County,2143.0,42.360697,-71.126531,793946.01,3204.49
2,10339b10-3cd1-4ac3-ac13-ec26728cb592,1992-06-02,,999-27-3385,S99972682,X73754411X,Mr.,Jayson808,Fadel536,,...,Springfield Massachusetts US,1056 Harris Lane Suite 70,Chicopee,Massachusetts,Hampden County,1020.0,42.181642,-72.608842,574111.9,2606.4
3,8d4c4326-e9de-4f45-9a4c-f8c36bff89ae,1978-05-27,,999-85-4926,S99974448,X40915583X,Mrs.,Mariana775,Rutherford999,,...,Yarmouth Massachusetts US,999 Kuhn Forge,Lowell,Massachusetts,Middlesex County,1851.0,42.636143,-71.343255,935630.3,8756.19
4,f5dcd418-09fe-4a2f-baa0-3da800bd8c3a,1996-10-18,,999-60-7372,S99915787,X86772962X,Mr.,Gregorio366,Auer97,,...,Patras Achaea GR,1050 Lindgren Extension Apt 38,Boston,Massachusetts,Suffolk County,2135.0,42.352434,-71.02861,598763.07,3772.2


In [None]:
# Patient attributes: one output row per row of `patients`.
# Age is in whole years: up to the fixed date 2021-12-31 for patients with no
# deathdate, and up to the deathdate otherwise.
# NOTE(review): this is not the patient's age at the time of each encounter —
# confirm that a fixed reference date is what downstream analysis should see.
query = '''
    select
    id as patient_id,
    case when 
    deathdate is null then timestampdiff(year,birthdate,"2021-12-31") 
    else timestampdiff(year,birthdate,deathdate)
    end as age_as_2022,
    marital,
    race,
    ethnicity,
    gender as patient_gender
    from patients;
'''
patient_agg = F.sql(query)

In [None]:
patient_agg.head()

Unnamed: 0,patient_id,age_as_2022,marital,race,ethnicity,patient_gender
0,1d604da9-9a81-4ba9-80c2-de3375d59b40,32,M,white,hispanic,M
1,034e9e3b-2def-4559-bb2a-7850888ae060,38,M,white,nonhispanic,M
2,10339b10-3cd1-4ac3-ac13-ec26728cb592,29,M,white,nonhispanic,M
3,8d4c4326-e9de-4f45-9a4c-f8c36bff89ae,43,M,white,nonhispanic,F
4,f5dcd418-09fe-4a2f-baa0-3da800bd8c3a,25,,white,nonhispanic,M


In [None]:
F.select('providers')

Unnamed: 0,Id,ORGANIZATION,NAME,GENDER,SPECIALITY,ADDRESS,CITY,STATE,ZIP,LAT,LON,UTILIZATION
0,3421aa75-dec7-378d-a9e0-0bc764e4cb0d,ef58ea08-d883-3957-8300-150554edc8fb,Tomas436 Sauer652,M,GENERAL PRACTICE,60 HOSPITAL ROAD,LEOMINSTER,MA,1453,42.520838,-71.770876,1557
1,c9b3c857-2e24-320c-a79a-87b8a60de63c,69176529-fd1f-3b3f-abce-a0a3626769eb,Suzette512 Monahan736,F,GENERAL PRACTICE,330 MOUNT AUBURN STREET,CAMBRIDGE,MA,2138,42.375967,-71.118275,2296
2,0359f968-d1a6-30eb-b1cc-e6cc0b4d3513,5e765f2b-e908-3888-9fc7-df2cb87beb58,Gaynell126 Streich926,F,GENERAL PRACTICE,211 PARK STREET,ATTLEBORO,MA,2703,41.931653,-71.294503,2287
3,446d1609-858f-3a54-8a52-0c4eacedd00e,f1fbcbfb-fcfa-3bd2-b7f4-df20f1b3c3a4,Patricia625 Salgado989,F,GENERAL PRACTICE,ONE GENERAL STREET,LAWRENCE,MA,1842,42.700273,-71.161357,1327
4,e6283e46-fd81-3611-9459-0edb1c3da357,e002090d-4e92-300e-b41e-7d1f21dee4c6,Jeanmarie510 Beatty507,F,GENERAL PRACTICE,1493 CAMBRIDGE STREET,CAMBRIDGE,MA,2138,42.375967,-71.118275,3199


In [None]:
# Provider attributes kept for the final table: id (renamed to provider_id so it
# matches the join key used later), gender and utilization.
query = '''
    select id as provider_id,
    gender as provider_gender,
    utilization 
    from providers;
'''
provider_agg = F.sql(query)

In [None]:
provider_agg.head()

Unnamed: 0,provider_id,provider_gender,utilization
0,3421aa75-dec7-378d-a9e0-0bc764e4cb0d,M,1557
1,c9b3c857-2e24-320c-a79a-87b8a60de63c,F,2296
2,0359f968-d1a6-30eb-b1cc-e6cc0b4d3513,F,2287
3,446d1609-858f-3a54-8a52-0c4eacedd00e,F,1327
4,e6283e46-fd81-3611-9459-0edb1c3da357,F,3199


In [None]:
agg_tables = [encounters_agg,patient_agg,careplan_agg,procedure_agg,condition_agg,medicine_agg,provider_agg]

In [None]:
encounters_agg.columns = encounters_agg.columns.str.lower()

In [None]:
encounters_agg.rename(columns={'patient':'patient_id','provider':'provider_id'},inplace=True)

In [None]:
# Left-join every per-encounter aggregate onto the encounter table, then attach
# the patient and provider attributes through their id columns. The order of
# the joins fixes the column order of the result.
data = encounters_agg
for right_table, join_key in (
    (procedure_agg, 'encounter'),
    (medicine_agg, 'encounter'),
    (condition_agg, 'encounter'),
    (careplan_agg, 'encounter'),
    (patient_agg, 'patient_id'),
    (provider_agg, 'provider_id'),
):
    data = data.merge(right_table, on=join_key, how='left')

In [None]:
pd.set_option('display.max_columns',None)

In [None]:
data.head()

Unnamed: 0,encounter,start,stop,patient_id,provider_id,encounterclass,base_encounter_cost,payer_coverage,duration,total_procedures,avg_procedure_cost,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,condition_start,condition_end,is_ongoing_condition,total_conditions,careplan_start,careplan_end,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization
0,d0c40d10-8d87-447e-836e-99d26ad52ea5,2010-01-23 17:45:28,2010-01-23 18:10:28,034e9e3b-2def-4559-bb2a-7850888ae060,e6283e46-fd81-3611-9459-0edb1c3da357,ambulatory,129.16,54.16,25.0,,,NaT,NaT,,,,,,,,,2010-01-23,2012-01-23,0.0,1.0,38,M,white,nonhispanic,M,F,3199
1,e88bc3a9-007c-405e-aabc-792a38f4aa2b,2012-01-23 17:45:28,2012-01-23 18:00:28,034e9e3b-2def-4559-bb2a-7850888ae060,6f1d59a7-a5bd-3cf9-9671-5bad2f351c28,wellness,129.16,129.16,15.0,,,NaT,NaT,,,,,,,,,,,,,38,M,white,nonhispanic,M,F,33
2,8f104aa7-4ca9-4473-885a-bba2437df588,2001-05-01 15:02:18,2001-05-01 15:17:18,1d604da9-9a81-4ba9-80c2-de3375d59b40,af01a385-31d3-3c77-8fdb-2867fe88df2f,ambulatory,129.16,0.0,15.0,,,NaT,NaT,,,,,2001-05-01,,1.0,1.0,,,,,32,M,white,hispanic,M,M,2585
3,b85c339a-6076-43ed-b9d0-9cf013dec49d,2011-07-28 15:02:18,2011-07-28 15:17:18,1d604da9-9a81-4ba9-80c2-de3375d59b40,bb17e691-262b-3546-93d5-d88e7de93246,wellness,129.16,0.0,15.0,,,NaT,NaT,,,,,,,,,,,,,32,M,white,hispanic,M,F,58
4,dae2b7cb-1316-4b78-954f-fa610a6c6d0e,2010-07-27 12:58:08,2010-07-27 13:28:08,10339b10-3cd1-4ac3-ac13-ec26728cb592,7ed6b84a-b847-3744-9d42-15c42297a0c2,wellness,129.16,129.16,30.0,1.0,726.51,NaT,NaT,,,,,,,,,,,,,29,M,white,nonhispanic,M,M,59


In [None]:
# 1. ADDING READMISSION COLUMN
# An encounter is flagged when the same patient has another encounter that
# starts strictly later and at most 30 days after this encounter's start.
# NOTE(review): the window is measured start-to-start and across every
# encounter class — confirm this is the intended readmission definition.
data['start'] = pd.to_datetime(data['start'])
data = data.sort_values(['patient_id', 'start'])

# 2. SELF-MERGE USING data
# Only the three columns the comparison needs are paired up. Joining the full
# frame to itself duplicated every feature column for every pair of visits of a
# patient, which multiplies memory use without changing the result.
pairs = data[['patient_id', 'encounter', 'start']]
dup = pairs.merge(
    pairs,
    on='patient_id',
    how='inner',
    suffixes=('_first', '_second')
)

# 3. FILTER A → B WITHIN 30 DAYS
dup = dup[
    (dup['start_second'] > dup['start_first']) &
    (dup['start_second'] <= dup['start_first'] + pd.Timedelta(days=30))
]

# 4. ADD COLUMN DIRECTLY TO data
data['caused_readmission'] = 0
data.loc[data['encounter'].isin(dup['encounter_first']), 'caused_readmission'] = 1


In [None]:
data.head()

Unnamed: 0,encounter,start,stop,patient_id,provider_id,encounterclass,base_encounter_cost,payer_coverage,duration,total_procedures,avg_procedure_cost,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,condition_start,condition_end,is_ongoing_condition,total_conditions,careplan_start,careplan_end,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission
34106,6b5bfe89-1c58-42e8-87c4-847b542d5f0b,2010-11-09 15:06:37,2010-11-09 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,wellness,129.16,129.16,30.0,1.0,648.01,NaT,NaT,,,,,,,,,,,,,18,,white,nonhispanic,M,M,25,0
34107,e0a65c0f-fa38-46aa-bd00-60f4473230e2,2011-11-15 15:06:37,2011-11-15 15:21:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,wellness,129.16,129.16,15.0,,,NaT,NaT,,,,,,,,,,,,,18,,white,nonhispanic,M,M,25,0
34108,187f0326-5342-4b78-8818-db5418f9300b,2012-08-03 15:06:37,2012-08-03 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,b08f34d7-3c08-35b9-a2ba-b823511ebc57,ambulatory,129.16,54.16,30.0,1.0,2230.4,NaT,NaT,,,,,2012-08-03,2012-08-11,0.0,1.0,,,,,18,,white,nonhispanic,M,M,1616,0
34109,ce150f69-e3a6-4793-95b6-243f754723c3,2012-10-14 15:06:37,2012-10-14 15:21:37,00185faa-2760-4218-9bf5-db301acf8274,b08f34d7-3c08-35b9-a2ba-b823511ebc57,ambulatory,129.16,54.16,15.0,,,2012-10-14 15:06:37,2012-10-25 15:06:37,1.0,20.48,20.48,1.0,2012-10-14,2012-10-25,0.0,1.0,,,,,18,,white,nonhispanic,M,M,1616,0
34110,e905ce81-d1da-46ac-a0fb-c16bce2c77b7,2012-11-20 15:06:37,2012-11-20 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,wellness,129.16,129.16,30.0,1.0,590.58,NaT,NaT,,,,,,,,,,,,,18,,white,nonhispanic,M,M,25,0


In [None]:
data.rename(columns={'start':'encounter_start','stop':'encounter_finish'},inplace=True)

In [None]:
data.isnull().sum()[data.isnull().sum()>0].reset_index(name = 'null_counts')

Unnamed: 0,index,null_counts
0,total_procedures,32274
1,avg_procedure_cost,32274
2,medication_start,26077
3,medication_end,27309
4,total_medicines,26077
5,avg_base_medicine_cost,26077
6,avg_total_medicine_cost,26077
7,reasons_for_medications,26077
8,condition_start,45696
9,condition_end,48884


In [None]:
data.columns

Index(['encounter', 'encounter_start', 'encounter_finish', 'patient_id',
       'provider_id', 'encounterclass', 'base_encounter_cost',
       'payer_coverage', 'duration', 'total_procedures', 'avg_procedure_cost',
       'medication_start', 'medication_end', 'total_medicines',
       'avg_base_medicine_cost', 'avg_total_medicine_cost',
       'reasons_for_medications', 'condition_start', 'condition_end',
       'is_ongoing_condition', 'total_conditions', 'careplan_start',
       'careplan_end', 'is_ongoing_careplan', 'total_careplans', 'age_as_2022',
       'marital', 'race', 'ethnicity', 'patient_gender', 'provider_gender',
       'utilization', 'caused_readmission'],
      dtype='object')

In [None]:
# Encounters with no matching procedure/medication/condition/careplan rows came
# out of the left joins as NaN; for these count and cost columns NaN becomes 0.
cols = [
    'total_procedures', 'avg_procedure_cost',
    'total_medicines', 'avg_base_medicine_cost', 'avg_total_medicine_cost',
    'reasons_for_medications', 'total_conditions', 'total_careplans',
]

data[cols] = data[cols].fillna(0)

In [None]:
data[['marital']] =data[['marital']].fillna('unknown')

In [None]:
# -1 is the sentinel for "no condition / no careplan attached to the encounter",
# keeping it distinct from the real flag values 0 and 1.
cols = ['is_ongoing_condition', 'is_ongoing_careplan']

data[cols] = data[cols].fillna(-1)

In [None]:
data.isnull().sum()[data.isnull().sum()>0].reset_index(name = 'null_counts')

Unnamed: 0,index,null_counts
0,medication_start,26077
1,medication_end,27309
2,condition_start,45696
3,condition_end,48884
4,careplan_start,49874
5,careplan_end,51395


In [None]:
# Spell out the marital codes; anything that is not 'M' or 'S' (including the
# earlier 'unknown' filler) ends up as 'UNKNOWN'.
marital_labels = {'M': 'MARRIED', 'S': 'SINGLE'}
data['marital'] = data['marital'].map(marital_labels).fillna('UNKNOWN')

In [None]:
# Upper-case the categorical text columns so they match the style of the
# other labels in the table.
cols = ['encounterclass','race','ethnicity']
for col in cols:
    # .str.upper() is vectorised and passes missing values through, whereas
    # calling x.upper() on each element raises AttributeError on NaN.
    data[col] = data[col].str.upper()

In [None]:
data.head()

Unnamed: 0,encounter,encounter_start,encounter_finish,patient_id,provider_id,encounterclass,base_encounter_cost,payer_coverage,duration,total_procedures,avg_procedure_cost,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,condition_start,condition_end,is_ongoing_condition,total_conditions,careplan_start,careplan_end,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission
34106,6b5bfe89-1c58-42e8-87c4-847b542d5f0b,2010-11-09 15:06:37,2010-11-09 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,WELLNESS,129.16,129.16,30.0,1.0,648.01,NaT,NaT,0.0,0.0,0.0,0.0,,,-1.0,0.0,,,-1.0,0.0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0
34107,e0a65c0f-fa38-46aa-bd00-60f4473230e2,2011-11-15 15:06:37,2011-11-15 15:21:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,WELLNESS,129.16,129.16,15.0,0.0,0.0,NaT,NaT,0.0,0.0,0.0,0.0,,,-1.0,0.0,,,-1.0,0.0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0
34108,187f0326-5342-4b78-8818-db5418f9300b,2012-08-03 15:06:37,2012-08-03 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,b08f34d7-3c08-35b9-a2ba-b823511ebc57,AMBULATORY,129.16,54.16,30.0,1.0,2230.4,NaT,NaT,0.0,0.0,0.0,0.0,2012-08-03,2012-08-11,0.0,1.0,,,-1.0,0.0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0
34109,ce150f69-e3a6-4793-95b6-243f754723c3,2012-10-14 15:06:37,2012-10-14 15:21:37,00185faa-2760-4218-9bf5-db301acf8274,b08f34d7-3c08-35b9-a2ba-b823511ebc57,AMBULATORY,129.16,54.16,15.0,0.0,0.0,2012-10-14 15:06:37,2012-10-25 15:06:37,1.0,20.48,20.48,1.0,2012-10-14,2012-10-25,0.0,1.0,,,-1.0,0.0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0
34110,e905ce81-d1da-46ac-a0fb-c16bce2c77b7,2012-11-20 15:06:37,2012-11-20 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,WELLNESS,129.16,129.16,30.0,1.0,590.58,NaT,NaT,0.0,0.0,0.0,0.0,,,-1.0,0.0,,,-1.0,0.0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0


In [None]:
data['condition_duration']=pd.to_datetime(data['condition_end']) - pd.to_datetime(data['condition_start'])

In [None]:
data['careplan_duration']=pd.to_datetime(data['careplan_end']) - pd.to_datetime(data['careplan_start'])

In [None]:
data.rename(columns={'duration':'encounter_duration'},inplace =True)

In [None]:
# The counts and flags became floats when the left joins introduced NaN; store
# them as pandas' nullable integer type now that the gaps are filled.
cols = [
    'total_procedures', 'total_medicines', 'reasons_for_medications',
    'is_ongoing_condition', 'total_conditions',
    'is_ongoing_careplan', 'total_careplans',
]

data = data.astype({col: 'Int64' for col in cols})

In [None]:
data.head()

Unnamed: 0,encounter,encounter_start,encounter_finish,patient_id,provider_id,encounterclass,base_encounter_cost,payer_coverage,encounter_duration,total_procedures,avg_procedure_cost,medication_start,medication_end,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,condition_start,condition_end,is_ongoing_condition,total_conditions,careplan_start,careplan_end,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission,condition_duration,careplan_duration
34106,6b5bfe89-1c58-42e8-87c4-847b542d5f0b,2010-11-09 15:06:37,2010-11-09 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,WELLNESS,129.16,129.16,30.0,1,648.01,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,NaT,NaT
34107,e0a65c0f-fa38-46aa-bd00-60f4473230e2,2011-11-15 15:06:37,2011-11-15 15:21:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,WELLNESS,129.16,129.16,15.0,0,0.0,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,NaT,NaT
34108,187f0326-5342-4b78-8818-db5418f9300b,2012-08-03 15:06:37,2012-08-03 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,b08f34d7-3c08-35b9-a2ba-b823511ebc57,AMBULATORY,129.16,54.16,30.0,1,2230.4,NaT,NaT,0,0.0,0.0,0,2012-08-03,2012-08-11,0,1,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,8 days,NaT
34109,ce150f69-e3a6-4793-95b6-243f754723c3,2012-10-14 15:06:37,2012-10-14 15:21:37,00185faa-2760-4218-9bf5-db301acf8274,b08f34d7-3c08-35b9-a2ba-b823511ebc57,AMBULATORY,129.16,54.16,15.0,0,0.0,2012-10-14 15:06:37,2012-10-25 15:06:37,1,20.48,20.48,1,2012-10-14,2012-10-25,0,1,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,11 days,NaT
34110,e905ce81-d1da-46ac-a0fb-c16bce2c77b7,2012-11-20 15:06:37,2012-11-20 15:36:37,00185faa-2760-4218-9bf5-db301acf8274,fe3d1004-b6e9-3fd8-bab5-8ba4cf7d7d95,WELLNESS,129.16,129.16,30.0,1,590.58,NaT,NaT,0,0.0,0.0,0,,,-1,0,,,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,NaT,NaT


In [None]:
def add_visit_number(df):
    """Number each patient's encounters chronologically, starting from 1.

    Returns a copy of ``df`` sorted by ``patient_id`` and then
    ``encounter_start``, with a ``visit_number`` column holding the 1-based
    position of each row within its patient. ``df`` itself is not modified.
    """
    ordered = df.sort_values(by=['patient_id', 'encounter_start'])
    position_in_patient = ordered.groupby('patient_id').cumcount()
    return ordered.assign(visit_number=position_in_patient + 1)

In [None]:
data = add_visit_number(data)

In [None]:
def add_coverage_ratio(df):
    """Add ``coverage_ratio``: the share of the encounter cost paid by the payer.

    The ratio is ``payer_coverage / base_encounter_cost``, capped at 1.0. Rows
    whose ``base_encounter_cost`` is zero, negative or missing get 0.0.

    Args:
        df: Frame with ``payer_coverage`` and ``base_encounter_cost`` columns.

    Returns:
        A copy of ``df`` with the extra ``coverage_ratio`` column; ``df`` itself
        is not modified.
    """
    data = df.copy()

    cost = data['base_encounter_cost']
    # Column-wise division replaces the row-by-row apply: same values, far less
    # Python overhead, and it also works on a frame with no rows.
    covered_share = data['payer_coverage'].div(cost)

    # Avoid division by zero: keep the quotient only where the cost is positive
    # (the comparison is False for NaN, so missing costs map to 0.0 as well).
    data['coverage_ratio'] = covered_share.where(cost > 0, 0.0).clip(upper=1.0)

    return data

In [None]:
data = add_coverage_ratio(data)

In [None]:
data['medication_duration'] = (data['medication_end'] - data['medication_start']).dt.days

In [None]:
def clean_timedelta_columns(df):
    """Express the duration columns as a number of whole days.

    ``condition_duration``, ``careplan_duration`` and ``medication_duration``
    are handled when present. Timedelta (or timedelta-like) columns are
    converted with ``.dt.days``; values that cannot be parsed become NaN.
    Columns that are already numeric are left untouched.

    Args:
        df: Frame that may contain any of the three duration columns.

    Returns:
        A copy of ``df`` with the converted columns; ``df`` is not modified.
    """
    data = df.copy()

    duration_cols = ['condition_duration', 'careplan_duration', 'medication_duration']

    for col in duration_cols:
        if col not in data.columns:
            continue
        # A numeric column already holds day counts. pd.to_timedelta would read
        # bare numbers as nanoseconds, so .dt.days would turn every value into 0.
        if pd.api.types.is_numeric_dtype(data[col]):
            continue
        data[col] = pd.to_timedelta(data[col], errors='coerce').dt.days

    print("✅ Converted durations to numeric days. NaT replaced with NaN.")
    return data

In [None]:
data = clean_timedelta_columns(data)

✅ Converted durations to numeric days. NaT replaced with NaN.


In [None]:
def add_smart_duration(df):
    """Recompute the medication/condition/careplan durations in inclusive days.

    For each start/end pair present in ``df`` the duration is
    ``(end - start).days + 1``, so an item that starts and ends on the same day
    counts as 1. Rows where the difference cannot be computed (missing start or
    end) get 0. Pairs whose columns are absent are skipped.

    Returns a copy of ``df``; the input frame is not modified.
    """
    result = df.copy()

    column_triples = (
        ('medication_start', 'medication_end', 'medication_duration'),
        ('condition_start', 'condition_end', 'condition_duration'),
        ('careplan_start', 'careplan_end', 'careplan_duration'),
    )

    for start_col, end_col, target_col in column_triples:
        if start_col not in result.columns or end_col not in result.columns:
            continue

        began = pd.to_datetime(result[start_col])
        ended = pd.to_datetime(result[end_col])

        # Inclusive day count: the +1 is applied before the fill, so real
        # same-day spans become 1 while missing spans stay distinguishable as 0.
        inclusive_days = (ended - began).dt.days + 1
        result[target_col] = inclusive_days.fillna(0)

    return result

In [None]:
data = add_smart_duration(data)

In [None]:
def add_procedures_per_hour(df):
    """Add ``procedures_per_hour``: procedures per hour of encounter time.

    ``encounter_duration`` is divided by 60 to get hours, and anything shorter
    than one hour is treated as a full hour so brief visits do not produce
    inflated rates. Returns a copy of ``df``; the input is not modified.
    """
    hours_floor_one = (df['encounter_duration'] / 60).clip(lower=1.0)
    rate = df['total_procedures'] / hours_floor_one
    return df.assign(procedures_per_hour=rate)

In [None]:
data = add_procedures_per_hour(data)

In [None]:
# Identifiers, raw timestamps and the two cost inputs have served their purpose
# (joins, durations, coverage_ratio) and are removed from the final table.
cols = [
    'encounter', 'encounter_start', 'encounter_finish',
    'patient_id', 'provider_id',
    'base_encounter_cost', 'payer_coverage',
    'medication_start', 'medication_end',
    'condition_start', 'condition_end',
    'careplan_start', 'careplan_end',
]

data = data.drop(columns=cols)

In [None]:
data.head()

Unnamed: 0,encounterclass,encounter_duration,total_procedures,avg_procedure_cost,total_medicines,avg_base_medicine_cost,avg_total_medicine_cost,reasons_for_medications,is_ongoing_condition,total_conditions,is_ongoing_careplan,total_careplans,age_as_2022,marital,race,ethnicity,patient_gender,provider_gender,utilization,caused_readmission,condition_duration,careplan_duration,visit_number,coverage_ratio,medication_duration,procedures_per_hour
34106,WELLNESS,30.0,1,648.01,0,0.0,0.0,0,-1,0,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,0.0,0.0,1,1.0,0.0,1.0
34107,WELLNESS,15.0,0,0.0,0,0.0,0.0,0,-1,0,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,0.0,0.0,2,1.0,0.0,0.0
34108,AMBULATORY,30.0,1,2230.4,0,0.0,0.0,0,0,1,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,9.0,0.0,3,0.419325,0.0,1.0
34109,AMBULATORY,15.0,0,0.0,1,20.48,20.48,1,0,1,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,1616,0,12.0,0.0,4,0.419325,12.0,0.0
34110,WELLNESS,30.0,1,590.58,0,0.0,0.0,0,-1,0,-1,0,18,UNKNOWN,WHITE,NONHISPANIC,M,M,25,0,0.0,0.0,5,1.0,0.0,1.0


In [None]:
# The durations are whole days; store them as nullable integers, not floats.
cols = ['condition_duration', 'careplan_duration', 'medication_duration']

data = data.astype({col: 'Int64' for col in cols})

In [None]:
data['coverage_ratio']= data['coverage_ratio'].round(2)

In [None]:
data.isnull().sum()[data.isnull().sum()>0].reset_index(name = 'null_counts')

Unnamed: 0,index,null_counts


In [None]:
# Turn the numeric flag into labels: -1 -> 'NO CONDITION', 0 -> 'NO', else 'YES'.
# Not re-runnable: once the column holds strings nothing equals -1 or 0, so a
# second run would label every row 'YES'.
data['is_ongoing_condition']=np.where(data['is_ongoing_condition']== -1,'NO CONDITION',np.where(data['is_ongoing_condition']==0,'NO','YES'))

In [None]:
# Turn the numeric flag into labels: -1 -> 'NO CAREPLAN', 0 -> 'NO', else 'YES'.
# Not re-runnable: once the column holds strings nothing equals -1 or 0, so a
# second run would label every row 'YES'.
data['is_ongoing_careplan']=np.where(data['is_ongoing_careplan']== -1,'NO CAREPLAN',np.where(data['is_ongoing_careplan']==0,'NO','YES'))

In [None]:
# Turn the 0/1 target into 'NO'/'YES' labels.
# Not re-runnable: once the column holds strings nothing equals 1, so a second
# run would label every row 'NO'.
data['caused_readmission']=np.where(data['caused_readmission'] == 1,'YES','NO')

In [None]:
data['procedures_per_hour']= data['procedures_per_hour'].round(2)

In [None]:
data_path

WindowsPath("C:/Users/Rano's PC/Machine/github_repo_cloned/ranojoy_data_analytics_projects/About Synthea - Why Patients Readmit More")

In [None]:
file_path = data_path / "dataset" / "processed_data" / "processed_synthea_data.csv"