In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import gc
from tqdm import tqdm

## Bases

In [2]:
BASES_HOSP = "/scratch/haniel.botelho/physionet.org/files/mimiciv/2.2/hosp/"
BASES_ICU = "/scratch/haniel.botelho/physionet.org/files/mimiciv/2.2/icu/"
labexams = "labevents.csv"
chartexams = "chartevents.csv"
omr = "omr.csv"
microbiologyevents = "microbiologyevents.csv"
pharmacy = "pharmacy.csv"
emar = "emar.csv"
prescriptions = "prescriptions.csv"
procedures = "procedures_icd.csv"
d_labitems = "d_labitems.csv"
d_items = "d_items.csv"
d_procedures= "d_icd_procedures.csv"

path_lab = BASES_HOSP + labexams
path_chart = BASES_ICU + chartexams
path_ids_labs = BASES_HOSP + d_labitems
path_ids_items = BASES_ICU + d_items
path_omr = BASES_HOSP + omr
path_microbiology = BASES_HOSP + microbiologyevents
path_pharmacy = BASES_HOSP + pharmacy
path_emar = BASES_HOSP + emar
path_prescriptions = BASES_HOSP + prescriptions
path_procedures = BASES_HOSP + procedures
path_d_procedures = BASES_HOSP + d_procedures


In [3]:
def create_summary_table(data: pd.DataFrame, include_ref_range: bool) -> pd.DataFrame:
    # Definir as agregações básicas
    aggregations = {
        'itemid': 'count',
        'valuenum': [ 'max', 'min'],
        'valueuom': 'first',
    }
    
    # Adicionar as colunas ref_range_lower e ref_range_upper se necessário
    if include_ref_range:
        aggregations['ref_range_lower'] = 'first'
        aggregations['ref_range_upper'] = 'first'

    summary_table = data.groupby('itemid').agg(aggregations).reset_index()

    # Renomear as colunas para facilitar a leitura
    col_names = ['itemid', 'count', 'max_valuenum', 'min_valuenum', 'valueuom']
    if include_ref_range:
        col_names += ['ref_range_lower', 'ref_range_upper']
    summary_table.columns = col_names

    return summary_table


In [3]:
# procedures = pd.read_csv(path_procedures)
prescriptions = pd.read_csv(path_prescriptions)
# pharmacy = pd.read_csv(path_pharmacy)
# emar = pd.read_csv(path_emar)
# microbiology = pd.read_csv(path_microbiology)
# omr = pd.read_csv(path_omr)
# d_procedures = pd.read_csv(path_d_procedures)


  prescriptions = pd.read_csv(path_prescriptions)


## Prescriptions

In [4]:
prescriptions

Unnamed: 0,subject_id,hadm_id,pharmacy_id,poe_id,poe_seq,order_provider_id,starttime,stoptime,drug_type,drug,...,gsn,ndc,prod_strength,form_rx,dose_val_rx,dose_unit_rx,form_val_disp,form_unit_disp,doses_per_24_hrs,route
0,10000032,22595853,11700683,10000032-34,34.0,P76JEQ,2180-05-07 01:00:00,2180-05-07 22:00:00,MAIN,Acetaminophen,...,004490,9.041989e+08,500mg Tablet,,500,mg,1,TAB,,PO/NG
1,10000032,22595853,14779570,10000032-22,22.0,P76JEQ,2180-05-07 00:00:00,2180-05-07 22:00:00,MAIN,Sodium Chloride 0.9% Flush,...,,0.000000e+00,10 mL Syringe,,3,mL,0.3,SYR,3.0,IV
2,10000032,22595853,19796602,10000032-50,50.0,P260SK,2180-05-08 08:00:00,2180-05-07 22:00:00,MAIN,Furosemide,...,008209,5.107901e+10,40mg Tablet,,40,mg,1,TAB,1.0,PO/NG
3,10000032,22595853,20256254,10000032-32,32.0,P76JEQ,2180-05-07 01:00:00,2180-05-07 22:00:00,MAIN,Raltegravir,...,063231,6.022761e+06,400 mg Tablet,,400,mg,1,TAB,2.0,PO
4,10000032,22595853,28781051,10000032-27,27.0,P76JEQ,2180-05-07 00:00:00,2180-05-07 22:00:00,MAIN,Heparin,...,006549,6.332303e+10,5000 Units / mL- 1mL Vial,,5000,UNIT,1,mL,3.0,SC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15416703,19999987,23865745,92839339,19999987-25,25.0,P5542X,2145-11-03 00:00:00,2145-11-03 18:00:00,MAIN,Influenza Virus Vaccine,...,69637.0,3.333200e+10,0.5 mL Syringe,,0.5,mL,1,SYR,0.0,IM
15416704,19999987,23865745,92913309,19999987-150,150.0,P63OX3,2145-11-04 10:00:00,2145-11-11 17:00:00,MAIN,Levothyroxine Sodium,...,6649.0,7.445521e+07,50mcg Tablet,,50,mcg,1,TAB,1.0,PO/NG
15416705,19999987,23865745,95753195,19999987-219,219.0,P77Z1Y,2145-11-10 10:00:00,2145-11-11 17:00:00,MAIN,Duloxetine,...,57892.0,2.324033e+06,30mg Capsule,,60,mg,2,CAP,1.0,PO
15416706,19999987,23865745,96343043,19999987-206,206.0,P77Z1Y,2145-11-09 10:00:00,2145-11-09 16:00:00,MAIN,Venlafaxine XR,...,46405.0,6.808405e+10,150mg XR Capsule,,150,mg,1,CAP,1.0,PO


In [11]:
len(prescriptions[prescriptions['route'].notna()]['route'])

15412808

In [10]:
prescriptions['route'].notna()

0           True
1           True
2           True
3           True
4           True
            ... 
15416703    True
15416704    True
15416705    True
15416706    True
15416707    True
Name: route, Length: 15416708, dtype: bool

In [10]:
summary_prescriptions = prescriptions.groupby('drug').agg({
        'subject_id': 'count',
        'dose_unit_rx': 'first',
        'ndc': 'first',
    }).reset_index()
summary_prescriptions.columns = ['Nome','Quantidade','Unidade','NomeFormal']
summary_prescriptions

Unnamed: 0,Nome,Quantidade,Unidade,NomeFormal
0,Adcirca (tadalafil),1,mg,
1,Autologus Serum Eye Drops 20%,1,DROP,
2,CoQ10 (300mg),2,mg\ 0 mg,1.465408e+10
3,Endocet *NF* (oxyCODONE-acetaminophen) 1/2 TAB,1,TAB,
4,Enzalutamide (Xtandi),1,mg,
...,...,...,...,...
9608,zypre,1,mg,2.445385e+06
9609,zyrtec-D,1,mg,
9610,zz,4,mcg,0.000000e+00
9611,zzVoxtalisibzz,1,mg,0.000000e+00


## Procedures

In [None]:
procedures = procedures.merge(d_procedures,on='icd_code')
summary_procedures = procedures.groupby('long_title')['subject_id'].count().reset_index()
summary_procedures.columns = ['Nome', 'Quantidade']
summary_procedures

## Pharmacy

In [None]:
summary_pharmacy = pharmacy.groupby('medication')['subject_id'].count().reset_index()
summary_pharmacy.columns = ['Nome', 'Quantidade']
summary_pharmacy

## Emar

In [33]:
summary_emar = emar.groupby('medication')['subject_id'].count().reset_index()
summary_emar.columns = ['Nome', 'Quantidade']
summary_emar

Unnamed: 0,Nome,Quantidade
0,CoQ10 (300mg) or placebo,13
1,INV-ABL001,22
2,"Niacinamide 1g, 3g, or placebo",1
3,ketamine 5%/ lidocaine 5% / amit 2%,12
4,*NF* Carbidopa-Levodopa (25-100),25
...,...,...
4181,zinc picolinate (bulk),4
4182,zinc sulfate,10
4183,zoledronic acid,5
4184,zolpidem,1


## Microbiology

In [16]:
summary_microbiology = microbiology.groupby('test_name').agg({
        'subject_id': 'count',
        'spec_type_desc': 'first'
    }).reset_index()
summary_microbiology.columns = ['Nome','Quantidade','Tipo']
summary_microbiology

Unnamed: 0,Nome,Quantidade,Tipo
0,ACID FAST CULTURE,29356,PERITONEAL FLUID
1,ACID FAST SMEAR,27142,PERITONEAL FLUID
2,ADDITIONAL CELLS COUNTED,101,BONE MARROW - CYTOGENETICS
3,AEROBIC BOTTLE,10174,BLOOD CULTURE
4,ANAEROBIC BOTTLE,9759,BLOOD CULTURE
...,...,...,...
166,VIRAL CULTURE: R/O CYTOMEGALOVIRUS,2213,BRONCHOALVEOLAR LAVAGE
167,VIRAL CULTURE: R/O HERPES SIMPLEX VIRUS,4526,SKIN SCRAPINGS
168,WOUND CULTURE,165671,SWAB
169,YEAST VAGINITIS CULTURE,19424,SWAB


## Omr

In [41]:
summary_omr = omr.groupby('result_name')['subject_id'].count().reset_index()
summary_omr.columns = ['Nome', 'Quantidade']
summary_omr

Unnamed: 0,Nome,Quantidade
0,BMI,554
1,BMI (kg/m2),1662112
2,Blood Pressure,2169549
3,Blood Pressure Lying,2764
4,Blood Pressure Sitting,3400
5,Blood Pressure Standing,523
6,Blood Pressure Standing (1 min),2560
7,Blood Pressure Standing (3 mins),626
8,Height,39
9,Height (Inches),706906


## Exams

In [3]:
labexams = pd.read_csv(path_lab,usecols=['subject_id','charttime','itemid','valuenum','valueuom','ref_range_lower','ref_range_upper'])
# chartexams = pd.read_csv(path_chart,usecols=['itemid','valuenum','valueuom'])


In [3]:
ids_labs = pd.read_csv(path_ids_labs)
ids_items = pd.read_csv(path_ids_items)

In [14]:
teste = labexams[(labexams['itemid']==50912) | (labexams['itemid']==52546)]
teste[teste.duplicated(subset=['charttime','subject_id'], keep=False)].sort_values(by = ['subject_id','charttime']).to_csv('teste',index=False)

In [6]:
summary_lab = create_summary_table(labexams,True)
# summary_chart = create_summary_table(chartexams,False)


In [8]:
summary_lab = summary_lab.merge(ids_labs, on='itemid') 
# summary_lab = summary_lab.drop('itemid', axis=1)
# summary_chart = summary_chart.merge(ids_items, on='itemid') 
# summary_chart = summary_chart.drop('itemid', axis=1)


In [None]:
teste = labexams.dropna(subset=['valuenum'])

Unnamed: 0,subject_id,itemid,charttime,valuenum,valueuom,ref_range_lower,ref_range_upper
0,10000032,51237,2180-03-23 11:51:00,1.40,,0.9,1.1
1,10000032,51274,2180-03-23 11:51:00,15.10,sec,9.4,12.5
2,10000032,50853,2180-03-23 11:51:00,15.00,ng/mL,30.0,60.0
3,10000032,50861,2180-03-23 11:51:00,102.00,IU/L,0.0,40.0
4,10000032,50862,2180-03-23 11:51:00,3.30,g/dL,3.5,5.2
...,...,...,...,...,...,...,...
118171362,19999987,51279,2145-11-09 05:30:00,3.52,m/uL,4.2,5.4
118171363,19999987,51301,2145-11-09 05:30:00,5.70,K/uL,4.0,11.0
118171364,19999987,50912,2146-02-07 11:13:00,1.10,mg/dL,0.4,1.1
118171365,19999987,50920,2146-02-07 11:13:00,,,,


In [16]:
teste = labexams.dropna(subset=['valuenum'])
teste

Unnamed: 0,subject_id,itemid,charttime,valuenum,valueuom,ref_range_lower,ref_range_upper
0,10000032,51237,2180-03-23 11:51:00,1.40,,0.9,1.1
1,10000032,51274,2180-03-23 11:51:00,15.10,sec,9.4,12.5
2,10000032,50853,2180-03-23 11:51:00,15.00,ng/mL,30.0,60.0
3,10000032,50861,2180-03-23 11:51:00,102.00,IU/L,0.0,40.0
4,10000032,50862,2180-03-23 11:51:00,3.30,g/dL,3.5,5.2
...,...,...,...,...,...,...,...
118171361,19999987,51277,2145-11-09 05:30:00,15.40,%,10.5,15.5
118171362,19999987,51279,2145-11-09 05:30:00,3.52,m/uL,4.2,5.4
118171363,19999987,51301,2145-11-09 05:30:00,5.70,K/uL,4.0,11.0
118171364,19999987,50912,2146-02-07 11:13:00,1.10,mg/dL,0.4,1.1


## Save

In [11]:
# summary_lab.to_csv('summary/summary_lab.csv',index = False)
# summary_chart.to_csv('summary/summary_chart.csv',index = False)
# summary_procedures.to_csv('summary/summary_procedures.csv',index = False,sep = ';')
summary_prescriptions.to_csv('summary/summary_prescriptions_.csv',index = False,sep = ';')
# summary_microbiology.to_csv('summary/summary_microbiology.csv',index = False,sep = ';')
# summary_omr.to_csv('summary/summary_omr.csv',index = False)