In [2]:
import os
import glob
import pandas as pd
from tqdm import tqdm

In [3]:
# Lift pandas' display truncation so wide and long frames render in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

### Baseline Analysis

In [4]:
# Load the baseline metrics and the fitted-model metrics workbooks.
baseline = pd.read_excel(io="../mimic-iv-3.1/base_results_2025_03_09.xlsx")
models = pd.read_excel(io="../mimic-iv-3.1/modelling_results_2025_03_02.xlsx")

In [5]:
# Preview the first five baseline rows.
baseline.iloc[:5]

Unnamed: 0,bacteria,antibiotic,accuracy,precision,recall,f1_score
0,ESCHERICHIA COLI,AMPICILLIN,0.503929,0.503929,1,0.67015
1,ESCHERICHIA COLI,AMPICILLIN/SULBACTAM,0.611413,0.611413,1,0.758853
2,ESCHERICHIA COLI,CEFAZOLIN,0.805609,0.805609,1,0.892341
3,ESCHERICHIA COLI,CEFEPIME,0.91547,0.91547,1,0.95587
4,ESCHERICHIA COLI,CEFTAZIDIME,0.921242,0.921242,1,0.959007


In [6]:
# Preview the first five model-result rows.
models.iloc[:5]

Unnamed: 0,bacteria,antibiotic,model_name,accuracy,precision,recall,f1_score,Total Count,Actual Susceptible Count,Predicted Susceptible Count,n_estimators,max_depth,min_samples_split
0,ESCHERICHIA COLI,AMPICILLIN,rf_ESCHERICHIA COLI_AMPICILLIN,0.566563,0.521127,0.506849,0.513889,323,146,142,50,10,10
1,ESCHERICHIA COLI,AMPICILLIN/SULBACTAM,rf_ESCHERICHIA COLI_AMPICILLIN/SULBACTAM,0.595679,0.613139,0.870466,0.719486,324,193,274,200,10,2
2,ESCHERICHIA COLI,CEFAZOLIN,rf_ESCHERICHIA COLI_CEFAZOLIN,0.817901,0.815625,1.0,0.898451,324,261,320,100,5,10
3,ESCHERICHIA COLI,CEFEPIME,rf_ESCHERICHIA COLI_CEFEPIME,0.907121,0.906542,1.0,0.95098,323,291,321,100,10,2
4,ESCHERICHIA COLI,CEFTAZIDIME,rf_ESCHERICHIA COLI_CEFTAZIDIME,0.904321,0.903125,1.0,0.949097,324,289,320,100,20,5


In [7]:
# Compare baseline precision with modeled precision per (bacteria, antibiotic) pair.
# The left join keeps every baseline row; pairs with no model row get NaN for
# precision_modeled, and a pair with several model rows repeats the baseline row.
baseline_metrics = baseline.drop(columns=['recall', 'f1_score'])
model_precision = models[['bacteria', 'antibiotic', 'precision']]

analysis = pd.merge(
    baseline_metrics,
    model_precision,
    how='left',
    on=['bacteria', 'antibiotic'],
    suffixes=('_baseline', '_modeled'),
).assign(
    # Positive delta means the model is more precise than the baseline.
    precision_delta=lambda merged: merged['precision_modeled'] - merged['precision_baseline']
)
analysis

Unnamed: 0,bacteria,antibiotic,accuracy,precision_baseline,precision_modeled,precision_delta
0,ESCHERICHIA COLI,AMPICILLIN,0.503929,0.503929,0.521127,0.017198
1,ESCHERICHIA COLI,AMPICILLIN/SULBACTAM,0.611413,0.611413,0.613139,0.001726
2,ESCHERICHIA COLI,CEFAZOLIN,0.805609,0.805609,0.815625,0.010016
3,ESCHERICHIA COLI,CEFEPIME,0.91547,0.91547,0.906542,-0.008928
4,ESCHERICHIA COLI,CEFTAZIDIME,0.921242,0.921242,0.903125,-0.018117
5,ESCHERICHIA COLI,CEFTRIAXONE,0.882644,0.882644,0.887147,0.004503
6,ESCHERICHIA COLI,CIPROFLOXACIN,0.732396,0.732396,0.698795,-0.033601
7,ESCHERICHIA COLI,GENTAMICIN,0.897404,0.897404,0.873457,-0.023947
8,ESCHERICHIA COLI,MEROPENEM,0.998653,0.998653,0.996914,-0.001739
9,ESCHERICHIA COLI,NITROFURANTOIN,0.956875,0.956875,0.946619,-0.010256


In [8]:
# Save the comparison table as a workbook, without the row index.
analysis.to_excel(excel_writer="../mimic-iv-3.1/baseline_analysis_25_03_09.xlsx", index=False)

### Modeling Population

In [18]:
# Load the pickled microbiology table.
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
final_microbiology_df = pd.read_pickle(filepath_or_buffer="../mimic-iv-3.1/final_microbiology_df.pkl")

In [26]:
# Count rows (non-null subject_id) per (organism, antibiotic) pair and keep only
# the pairs with more than 150 rows.
modeling_pop = (
    final_microbiology_df
    .groupby(['org_name', 'ab_name'])['subject_id']
    .count()
    .reset_index()
    .loc[lambda counts: counts['subject_id'] > 150]
    .rename(columns={'subject_id': 'row_count'})
)

In [30]:
# Antibiotic row counts for E. coli.
modeling_pop.loc[modeling_pop['org_name'].eq('ESCHERICHIA COLI')]

Unnamed: 0,org_name,ab_name,row_count
9,ESCHERICHIA COLI,AMPICILLIN,27359
10,ESCHERICHIA COLI,AMPICILLIN/SULBACTAM,27461
11,ESCHERICHIA COLI,CEFAZOLIN,27455
12,ESCHERICHIA COLI,CEFEPIME,27375
13,ESCHERICHIA COLI,CEFTAZIDIME,27464
14,ESCHERICHIA COLI,CEFTRIAXONE,27455
15,ESCHERICHIA COLI,CIPROFLOXACIN,27466
16,ESCHERICHIA COLI,GENTAMICIN,27467
17,ESCHERICHIA COLI,MEROPENEM,27460
18,ESCHERICHIA COLI,NITROFURANTOIN,23374


In [31]:
# Antibiotic row counts for Klebsiella pneumoniae.
modeling_pop.loc[modeling_pop['org_name'].eq('KLEBSIELLA PNEUMONIAE')]

Unnamed: 0,org_name,ab_name,row_count
24,KLEBSIELLA PNEUMONIAE,AMPICILLIN/SULBACTAM,8636
25,KLEBSIELLA PNEUMONIAE,CEFAZOLIN,8633
26,KLEBSIELLA PNEUMONIAE,CEFEPIME,8606
27,KLEBSIELLA PNEUMONIAE,CEFTAZIDIME,8636
28,KLEBSIELLA PNEUMONIAE,CEFTRIAXONE,8635
29,KLEBSIELLA PNEUMONIAE,CIPROFLOXACIN,8639
30,KLEBSIELLA PNEUMONIAE,GENTAMICIN,8642
31,KLEBSIELLA PNEUMONIAE,MEROPENEM,8639
32,KLEBSIELLA PNEUMONIAE,NITROFURANTOIN,6452
33,KLEBSIELLA PNEUMONIAE,PIPERACILLIN/TAZO,7676


In [32]:
# Antibiotic row counts for Pseudomonas aeruginosa.
modeling_pop.loc[modeling_pop['org_name'].eq('PSEUDOMONAS AERUGINOSA')]

Unnamed: 0,org_name,ab_name,row_count
40,PSEUDOMONAS AERUGINOSA,CEFEPIME,6694
41,PSEUDOMONAS AERUGINOSA,CEFTAZIDIME,6693
43,PSEUDOMONAS AERUGINOSA,CIPROFLOXACIN,6696
45,PSEUDOMONAS AERUGINOSA,GENTAMICIN,6693
46,PSEUDOMONAS AERUGINOSA,MEROPENEM,6693
48,PSEUDOMONAS AERUGINOSA,PIPERACILLIN/TAZO,6665
50,PSEUDOMONAS AERUGINOSA,TOBRAMYCIN,6692


### Data Imports

In [36]:
# Brian's data: susceptibility targets joined with lab features (v2 extract).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The saved output of this cell echoes the read_csv
# line, which looks like a mixed-dtype DtypeWarning -- TODO confirm and, ideally,
# pin the affected columns with an explicit `dtype=` mapping.
df = pd.read_csv(
    "../mimic-iv-3.1/targets_w_lab_features_v2.csv.gz",
    compression="gzip",
    low_memory=False,
)

  df = pd.read_csv("../mimic-iv-3.1/targets_w_lab_features_v2.csv.gz", compression="gzip")


In [4]:
# Kat's data: table that is later left-joined onto df by subject_id and charttime.
omr_df = pd.read_csv(filepath_or_buffer="../mimic-iv-3.1/df_omr_pt_ad_diag_pr.csv")

In [5]:
# (rows, columns) of the targets table, then of the OMR table.
print(df.shape, omr_df.shape, sep="\n")

(504738, 108)
(35529, 928)


### Labevents missing

### Blood
Glucose - use abnormal flag
Neutrophils - use force value
Immature Granulocytes - use force value
Lactate - value
PTT - value
Monocytes - force value
Basophils - force value
Lymphocytes - force value
Eosinophils - force value
Calcium, Total - force value
Alanine Aminotransferase (ALT) - force value
Asparate Aminotransferase (AST) - force value
Magnesium - force value
Phosphate - force value
INR(PT) - force value
PT - force value
Alkaline Phosphatase - force value
Bilirubin, Total - force value
Albumin - force value
RDW-SD - force value


### Urine
Urine Color - Use abnormal flag
Urine Appearance - Use abnormal flag
Ketone - use abnormal flag
Glucose - use flag
Protein - use flag
Epithelial Cells - use flag?
RBC - use flag?
WBC - use flag?


Specific Gravity - force value
pH - force value

Yeast - categories
Blood - make categories
Leukocytes - make categories
Nitrite - make categories
Bilirubin - categories

Urobilinogen - skip (don't know how to handle)
Bacteria - skip


In [38]:
# Missing-value count for every column of df.
df.isnull().sum()

subject_id                                      0
hadm_id                                    321967
org_name                                        0
ab_name                                         0
charttime                                       0
interpretation                                  0
susceptible_flag                                0
charttime_target                                0
Blood - Hematocrit                          48598
Blood - Platelet Count                      51996
Blood - MCV                                 47240
Blood - Red Blood Cells                     41891
Blood - RDW                                 41876
Blood - MCH                                 41828
Blood - MCHC                                41826
Blood - White Blood Cells                   50783
Blood - Creatinine                          50738
Blood - Urea Nitrogen                       47044
Blood - Potassium                          121961
Blood - Sodium                             110031


In [9]:
# Load the gzip-compressed lab events extract.
labevents_df = pd.read_csv(
    filepath_or_buffer="../mimic-iv-3.1/labevents_df_4b_wd.csv.gz",
    compression="gzip",
)

In [145]:
# Lab test under inspection; the value_counts cells below filter on this pair.
fluid, label = 'Blood', 'PTT'

In [146]:
# Distribution of raw `value` entries (NaN included) for the selected fluid/label.
labevents_df.loc[
    labevents_df['fluid'].eq(fluid) & labevents_df['label'].eq(label),
    'value',
].value_counts(dropna=False)

value
___                         17835
NaN                          2953
29.6                         1365
29.2                         1346
29.7                         1341
29.5                         1338
28.4                         1326
30.2                         1320
29.9                         1309
29.3                         1302
28.3                         1298
28.8                         1291
29.8                         1291
27.9                         1290
28.2                         1283
27.8                         1283
28.7                         1283
28.0                         1279
29.4                         1278
29.1                         1277
30.3                         1276
28.6                         1275
28.5                         1267
28.1                         1263
29.0                         1263
30.0                         1259
30.1                         1252
30.5                         1252
28.9                         1249
30.6    

In [147]:
# Distribution of free-text `comments` (NaN included) for the selected fluid/label.
labevents_df.loc[
    labevents_df['fluid'].eq(fluid) & labevents_df['label'].eq(label),
    'comments',
].value_counts(dropna=False)

comments
NaN                                                                                              221533
___                                                                                               12052
NOTE NEW REFERENCE RANGE AS OF ___.                                                                2676
VERIFIED BY REPLICATE ANALYSIS.                                                                    2172
UNABLE TO REPORT.                                                                                  1686
VERIFIED.                                                                                           406
HEMOLYZED SPECIMEN.                                                                                 261
HEMOLYZED, MODERATELY.                                                                              147
ICTERIC SPECIMEN.                                                                                   139
VERIFIED BY REPLICATE ANALYSIS.  CHECKED FOR CLOT.     

In [148]:
# How often the selected fluid/label is flagged abnormal versus unflagged (NaN).
labevents_df.loc[
    labevents_df['fluid'].eq(fluid) & labevents_df['label'].eq(label),
    'flag',
].value_counts(dropna=False)

flag
NaN         128859
abnormal    113477
Name: count, dtype: int64

In [116]:
# `value` distribution restricted to rows flagged 'abnormal' for the selected fluid/label.
# NOTE(review): the saved output (FEW/MANY/RARE...) appears to come from an earlier
# run with a different fluid/label selection -- re-run before relying on it.
abnormal_rows = (
    labevents_df['fluid'].eq(fluid)
    & labevents_df['label'].eq(label)
    & labevents_df['flag'].eq('abnormal')
)
labevents_df.loc[abnormal_rows, 'value'].value_counts(dropna=False)

value
NaN     704
FEW     183
MANY    180
RARE    137
OCC     136
MOD     123
Name: count, dtype: int64

### Merge for OMR Missings

In [126]:
# Attach the OMR columns to each target row by (subject_id, charttime).
# Left join: target rows with no OMR match are kept with NaN in the new columns.
# NOTE: this rebinds `df`, so the cell is not idempotent -- running it again
# without reloading `df` merges a second time and yields _x/_y suffixed columns.
merged_with_omr = df.merge(
    omr_df,
    on=["subject_id", "charttime"],
    how="left",
)
# Guard against duplicate keys in omr_df silently multiplying target rows.
assert len(merged_with_omr) == len(df), (
    "OMR merge changed the row count; omr_df has duplicate (subject_id, charttime) keys"
)
df = merged_with_omr

In [127]:
# (rows, columns) after the OMR merge.
(len(df.index), len(df.columns))

(504738, 1034)

In [128]:
# Per-column missing-value counts over every column of the merged frame,
# followed by the total row count for reference.
nan_counts = df.isnull().sum()

print(nan_counts, len(df), sep="\n")

subject_id                                      0
hadm_id                                    321967
org_name                                        0
ab_name                                         0
charttime                                       0
interpretation                                  0
susceptible_flag                                0
charttime_target                                0
Blood - Hematocrit                          48598
Blood - Platelet Count                      51996
Blood - MCV                                 47240
Blood - Red Blood Cells                     41891
Blood - RDW                                 41876
Blood - MCH                                 41828
Blood - MCHC                                41826
Blood - White Blood Cells                   50783
Blood - Creatinine                          50738
Blood - Urea Nitrogen                       47044
Blood - Potassium                          121961
Blood - Sodium                             110031


### OMR Missing Sets

In [129]:
# Non-one-hot columns taken from the OMR table (used to find rows missing any of them).
omr_features = [
    # age, body measurements and blood pressure
    'anchor_age', 'Height (Inches)', 'Weight (Lbs)', 'bp_systolic', 'bp_diastolic',
    # patient / admission descriptors
    'gender', 'insurance', 'language', 'marital_status', 'race',
]

# First group of indicator columns selected from the merged frame `df`.
# NOTE(review): the codes look like CCSR diagnosis categories (BLD, CIR, DIG, ...)
# and the "OHE" name suggests one-hot encoding -- TODO confirm against the table's source.
# Gaps in the numbering (e.g. FAC007, INJ051-INJ053) are codes not included in this list.
OHE_features1 = [
    '99',
'BLD001','BLD002','BLD003','BLD004','BLD005','BLD006','BLD007','BLD008','BLD009','BLD010',
'CIR001','CIR002','CIR003','CIR004','CIR005','CIR006','CIR007','CIR008','CIR009','CIR010',
'CIR011','CIR012','CIR013','CIR014','CIR015','CIR016','CIR017','CIR018','CIR019','CIR020',
'CIR021','CIR022','CIR023','CIR024','CIR025','CIR026','CIR027','CIR028','CIR029','CIR030',
'CIR031','CIR032','CIR033','CIR034','CIR035','CIR036','CIR037','CIR038','CIR039',
'DEN001','DEN002','DEN003',
'DIG001','DIG002','DIG003','DIG004','DIG005','DIG006','DIG007','DIG008','DIG009','DIG010',
'DIG011','DIG012','DIG013','DIG014','DIG015','DIG016','DIG017','DIG018','DIG019','DIG020',
'DIG021','DIG022','DIG023','DIG024','DIG025',
'EAR001','EAR002','EAR003','EAR004','EAR005','EAR006',
'END001','END002','END003','END004','END005','END006','END007','END008','END009','END010',
'END011','END012','END013','END014','END015','END016','END017',
'EXT001','EXT002','EXT003','EXT004','EXT005','EXT006','EXT007','EXT008','EXT009','EXT010',
'EXT011','EXT012','EXT013','EXT014','EXT015','EXT016','EXT017','EXT018','EXT019','EXT020',
'EXT021','EXT022','EXT023','EXT024','EXT025','EXT026','EXT027','EXT028','EXT029','EXT030',
'EYE001','EYE002','EYE003','EYE004','EYE005','EYE006','EYE007','EYE008','EYE009','EYE010',
'EYE011','EYE012',
'FAC001','FAC002','FAC003','FAC004','FAC005','FAC006','FAC008','FAC009','FAC010', # FAC007 missing
'FAC012','FAC013','FAC014','FAC015','FAC016','FAC017','FAC019','FAC020', # FAC011 and FAC018 missing
'FAC022','FAC023','FAC024','FAC025','FAC026','FAC027','FAC028','FAC029','FAC030', # FAC021 missing
'GEN001','GEN002','GEN003','GEN004','GEN005','GEN006','GEN007','GEN008','GEN009','GEN010',
'GEN011','GEN012','GEN013','GEN014','GEN015','GEN016','GEN017','GEN018','GEN019','GEN020',
'GEN021','GEN022','GEN023','GEN024','GEN025','GEN026',
'INF001','INF002','INF003','INF004','INF005','INF006','INF007','INF008','INF009','INF010',
'INF011','INF012',
'INJ001','INJ002','INJ003','INJ004','INJ005','INJ006','INJ007','INJ008','INJ009','INJ010',
'INJ011','INJ012','INJ013','INJ014','INJ015','INJ016','INJ017','INJ018','INJ019','INJ020',
'INJ021','INJ022','INJ023','INJ024','INJ025','INJ026','INJ027','INJ028','INJ029','INJ030',
'INJ031','INJ032','INJ033','INJ034','INJ035','INJ036','INJ037','INJ038','INJ039','INJ040',
'INJ041','INJ042','INJ043','INJ044','INJ045','INJ046','INJ047','INJ048','INJ049','INJ050',
'INJ054','INJ055','INJ056','INJ057','INJ059','INJ060','INJ061','INJ062','INJ063','INJ064',
'INJ065','INJ066','INJ067','INJ069','INJ070','INJ071','INJ072','INJ073','INJ074','INJ075','INJ076',
'MAL001','MAL002','MAL003','MAL004','MAL005','MAL006','MAL007','MAL008','MAL009','MAL010',
'MBD001','MBD002','MBD003','MBD004','MBD005','MBD006','MBD007','MBD008','MBD009','MBD010',
'MBD011','MBD012','MBD013','MBD014','MBD017','MBD018','MBD019','MBD020','MBD021','MBD022',
'MBD023','MBD024','MBD025','MBD026','MBD027','MBD028','MBD032','MBD034',
'MUS001','MUS002','MUS003','MUS004','MUS005','MUS006','MUS007','MUS008','MUS009','MUS010',
'MUS011','MUS012','MUS013','MUS014','MUS015','MUS016','MUS017','MUS020','MUS021','MUS022',
'MUS023','MUS024','MUS025','MUS026','MUS027','MUS028','MUS029','MUS030','MUS031','MUS032',
'MUS033','MUS034','MUS035','MUS036','MUS037','MUS038',
'NEO001','NEO002','NEO003','NEO004','NEO005','NEO006','NEO007','NEO008','NEO009','NEO010',
'NEO011','NEO012','NEO013','NEO014','NEO015','NEO016','NEO017','NEO018','NEO019','NEO020',
'NEO021','NEO022','NEO023','NEO024','NEO025','NEO026','NEO027','NEO028','NEO029','NEO030',
'NEO031','NEO032','NEO033','NEO034','NEO035','NEO036','NEO037','NEO038','NEO039','NEO040',
'NEO041','NEO042','NEO043','NEO044','NEO045','NEO046','NEO047','NEO048','NEO049','NEO050',
'NEO051','NEO052','NEO054','NEO056','NEO057','NEO058','NEO059','NEO060','NEO061','NEO062',
'NEO063','NEO064','NEO065','NEO066','NEO067','NEO068','NEO069','NEO070','NEO071','NEO072',
'NEO073','NEO074','NEO075','NEO078',
'NVS001','NVS002','NVS003','NVS004','NVS005','NVS006','NVS007','NVS008','NVS009','NVS010',
'NVS011','NVS012','NVS013','NVS014','NVS015','NVS016','NVS017','NVS018','NVS019','NVS020',
'NVS021','NVS022',
'PNL004','PNL006','PNL009','PNL010','PNL013','PNL015',
'PRG002','PRG003','PRG004','PRG005','PRG006','PRG007','PRG008','PRG009','PRG010','PRG011',
'PRG012','PRG013','PRG014','PRG015','PRG016','PRG017','PRG018','PRG019','PRG020','PRG021',
'PRG022','PRG023','PRG024','PRG025','PRG026','PRG027','PRG028','PRG029','PRG030',
'RSP001','RSP002','RSP003','RSP004','RSP005','RSP006','RSP007','RSP008','RSP009','RSP010',
'RSP011','RSP012','RSP013','RSP014','RSP015','RSP016','RSP017',
'SKN001','SKN002','SKN003','SKN004','SKN005','SKN006','SKN007',
'SYM001','SYM002','SYM003','SYM004','SYM005','SYM006','SYM007','SYM008','SYM009','SYM010',
'SYM011','SYM012','SYM013','SYM014','SYM015','SYM016','SYM017','SYM018']

# Second group of indicator columns selected from the merged frame `df`.
# NOTE(review): the codes look like ATC level-4 drug classes (e.g. A01AA), and
# '99.1' is presumably pandas' de-duplicated name for a second '99' column
# in the source CSV -- TODO confirm both.
OHE_features2 = [
'99.1','A01AA','A01AB','A01AC','A01AD','A02AA','A02AB','A02AC','A02AF','A02BA','A02BB','A02BC',
'A02BX','A03AA','A03AB','A03BA','A03CB','A03FA','A04AA','A04AD','A05AA','A06AA','A06AB',
'A06AD','A06AG','A06AH','A06AX','A07AA','A07AC','A07BA','A07DA','A07EA','A07EB','A07EC',
'A09AA','A09AB','A10AB','A10AC','A10AD','A10AE','A10AF','A10BA','A10BB','A10BF','A10BH',
'A10BJ','A10BK','A10BX','A11CC','A12AA','A12BA','A12CA','A12CC','A12CD','A16AX',
'B01AA','B01AB','B01AC','B01AD','B01AE','B01AF','B02AA','B02BA','B02BC','B02BD','B02BX',
'B03AB','B03AC','B03BA','B03XA','B05AA','B05BA','B05BB','B05BC','B05CA','B05CB','B05CX',
'B05XA','B05XB','B05XX','B06AA','C01AA','C01BA','C01BB','C01BC','C01BD','C01CA','C01CE',
'C01DA','C01EA','C01EB','C02AC','C02CA','C02DB','C02DC','C03AA','C03BA','C03CA','C03DA',
'C03DB','C05AA','C05AD','C05AE','C05AX','C05BA','C05BB','C07AA','C07AB','C07AG','C08CA',
'C08DA','C08DB','C09AA','C09CA','C09DX','C10AA','C10AC','D01AA','D01AC','D01AE','D01BA',
'D02AA','D04AA','D04AB','D05AX','D06AX','D06BB','D06BX','D07AA','D07AB','D07AC','D07AD',
'D07XA','D07XB','D07XC','D08AE','D10AA','D10AB','D10AD','D10AE','D10AF','D11AA','D11AC',
'D11AH','D11AX','G01AA','G01AF','G02AB','G02AD','G02CB','G02CC','G03AA','G03AB','G03AC',
'G03BA','G03CA','G03DA','G03DB','G03DC','G03FA','G03FB','G03XA','G03XC','G04BA','G04BC',
'G04BD','G04BE','G04BX','G04CA','G04CB','H01CB','H02AA','H02AB','H03AA','H04AA','H05AA',
'H05BX','J01AA','J01CA','J01CE','J01CF','J01CG','J01CR','J01DB','J01DD','J01DE','J01DH',
'J01DI','J01EA','J01EC','J01EE','J01FA','J01FF','J01GB','J01MA','J01XA','J01XB','J01XD',
'J01XX','J02AA','J02AB','J02AC','J02AX','J04AB','J04AC','J04AD','J05AB','J05AC','J05AE',
'J05AF','J05AG','J05AH','J05AJ','J05AR','J05AX','J06BA','J07AH','J07AL','J07BC','J07BD',
'J07BK','L01AA','L01AC','L01AX','L01BA','L01BB','L01BC','L01CB','L01CD','L01CE','L01DB',
'L01EA','L01EC','L01EF','L01EG','L01EL','L01EX','L01FA','L01FF','L01FX','L01XA','L01XF',
'L01XK','L01XX','L02AA','L02AB','L02AE','L02BA','L02BB','L02BG','L03AB','L03AX','L04AA',
'L04AB','L04AC','L04AD','L04AF','L04AH','L04AK','L04AX','M01AB','M01AC','M01AE','M01AH',
'M01CC','M02AA','M02AB','M03AB','M03AC','M03AX','M03BX','M04AA','M04AB','M05BA','N01AH',
'N01AX','N01BA','N01BB','N01BX','N02AA','N02AB','N02AE','N02AF','N02AJ','N02BA','N02BE',
'N02BF','N02CA','N02CC','N02CX','N03AA','N03AB','N03AF','N03AG','N03AX','N04AC','N04BA',
'N04BB','N04BC','N04BX','N05AA','N05AB','N05AD','N05AE','N05AH','N05AN','N05AX','N05BA',
'N05BB','N05CD','N05CF','N05CH','N05CM','N06AA','N06AB','N06AF','N06AX','N06BA','N06BC',
'N06DA','N06DX','N07AA','N07AB','N07AX','N07BA','N07BB','N07BC','P01AB','P01AX','P01BA',
'P01BB','P02CF','P03AC','R01AA','R01AB','R01AC','R01AD','R01AX','R01BA','R02AA','R02AB',
'R02AD','R02AX','R03AA','R03AC','R03AK','R03AL','R03BA','R03BB','R03BC','R03CC','R03DC',
'R03DX','R05CA','R05CB','R05DA','R06AA','R06AB','R06AD','R06AE','S01AA','S01AD','S01AE',
'S01BA','S01BC','S01CB','S01EA','S01EB','S01EC','S01EE','S01FA','S01FB','S01GA','S01GX',
'S01HA','S01XA','S02AA','S02BA','S02DA','S03AA','S03BA','V03AB','V03AC','V03AE','V03AF',
'V04CA','V04CC','V04CG','V04CX','V06DC','V07AB'
]

In [135]:
# Identifier and target columns carried alongside every feature subset.
base_columns = [
    'subject_id', 'hadm_id', 'org_name', 'ab_name',
    'charttime', 'interpretation', 'susceptible_flag', 'charttime_target',
]

# Keep only the rows where at least one OHE_features1 column is missing,
# then report the per-column missing counts of that subset.
df_OHE1 = (
    df[base_columns + OHE_features1]
    .loc[lambda subset: subset[OHE_features1].isna().any(axis=1)]
)
df_OHE1.isna().sum()

subject_id               0
hadm_id             107389
org_name                 0
ab_name                  0
charttime                0
interpretation           0
susceptible_flag         0
charttime_target         0
99                  175269
BLD001              175269
BLD002              175269
BLD003              175269
BLD004              175269
BLD005              175269
BLD006              175269
BLD007              175269
BLD008              175269
BLD009              175269
BLD010              175269
CIR001              175269
CIR002              175269
CIR003              175269
CIR004              175269
CIR005              175269
CIR006              175269
CIR007              175269
CIR008              175269
CIR009              175269
CIR010              175269
CIR011              175269
CIR012              175269
CIR013              175269
CIR014              175269
CIR015              175269
CIR016              175269
CIR017              175269
CIR018              175269
C

In [134]:
# Rows where at least one OHE_features2 column is missing, with per-column missing counts.
df_OHE2 = (
    df[base_columns + OHE_features2]
    .loc[lambda subset: subset[OHE_features2].isna().any(axis=1)]
)
df_OHE2.isna().sum()

subject_id               0
hadm_id              97473
org_name                 0
ab_name                  0
charttime                0
interpretation           0
susceptible_flag         0
charttime_target         0
99.1                110706
A01AA               110706
A01AB               110706
A01AC               110706
A01AD               110706
A02AA               110706
A02AB               110706
A02AC               110706
A02AF               110706
A02BA               110706
A02BB               110706
A02BC               110706
A02BX               110706
A03AA               110706
A03AB               110706
A03BA               110706
A03CB               110706
A03FA               110706
A04AA               110706
A04AD               110706
A05AA               110706
A06AA               110706
A06AB               110706
A06AD               110706
A06AG               110706
A06AH               110706
A06AX               110706
A07AA               110706
A07AC               110706
A

In [136]:
# Rows where at least one omr_features column is missing, with per-column missing counts.
# Despite the "OHE" in its name, this subset holds the omr_features columns.
df_OHE3 = (
    df[base_columns + omr_features]
    .loc[lambda subset: subset[omr_features].isna().any(axis=1)]
)
df_OHE3.isna().sum()

subject_id               0
hadm_id             148382
org_name                 0
ab_name                  0
charttime                0
interpretation           0
susceptible_flag         0
charttime_target         0
anchor_age          110706
Height (Inches)     110732
Weight (Lbs)        110732
bp_systolic         110736
bp_diastolic        110736
gender              110706
insurance           160960
language            159304
marital_status      168273
race                158824
dtype: int64

In [137]:
# Write the three missing-row subsets as gzip-compressed CSVs, without the row index.
missing_exports = (
    ("../mimic-iv-3.1/OMR_missing_1.csv.gz", df_OHE1),
    ("../mimic-iv-3.1/OMR_missing_2.csv.gz", df_OHE2),
    ("../mimic-iv-3.1/OMR_missing_3.csv.gz", df_OHE3),
)
for out_path, missing_frame in missing_exports:
    missing_frame.to_csv(out_path, compression="gzip", index=False)