In [None]:
!pip install polars

In [1]:
import pandas as pd
import lightgbm as lgb
from lightgbm import LGBMClassifier, Dataset, cv, train, early_stopping
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import __version__ as sklearn_version
from packaging import version
from multiprocessing import cpu_count
import numpy as np
import shap
import matplotlib.pyplot as plt
import polars as pl
import itertools
import seaborn as sns

# Feature preprocessing 

In [None]:
### data preparation

# Directory that holds every CSV extract read below. Change this single
# constant to run the notebook against another copy of the data.
DATA_DIR = '/Users/fa/Documents/antimicrobial-susceptibility'


def _read_table(filename, missing_message):
    """Read one CSV extract from DATA_DIR into a DataFrame.

    Parameters
    ----------
    filename : str
        File name relative to DATA_DIR.
    missing_message : str
        Human-readable hint included in the error when the file is absent.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    FileNotFoundError
        If the file does not exist. Failing here, with the hint and the full
        path, replaces the earlier print-and-continue behaviour that only
        surfaced later as a NameError (or silently reused a stale frame left
        in the kernel).
    """
    path = f"{DATA_DIR}/{filename}"
    try:
        return pd.read_csv(path)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"{missing_message} Expected file: {path}") from err


comorbidities = _read_table(
    'microbiology_cultures_comorbidity.csv',
    "Incorrect path to comorbidities file or comorbidities dataframe manipulation section not run first.",
)
adi_scores = _read_table('microbiology_cultures_adi_scores.csv', "Incorrect path to ADI scores file.")
labs = _read_table('microbiology_cultures_labs.csv', "Incorrect path to labs file.")
vitals = _read_table('microbiology_cultures_vitals.csv', "Incorrect path to vitals file.")
# NOTE: this file name uses the singular "culture", unlike the other extracts.
prior_infecting_organisms = _read_table(
    'microbiology_culture_prior_infecting_organism.csv',
    "Incorrect path to prior infecting organisms file.",
)
subtype_exposure = _read_table(
    'microbiology_cultures_antibiotic_subtype_exposure.csv',
    "Incorrect path to subtype exposure file.",
)
demographics = _read_table('microbiology_cultures_demographics.csv', "Incorrect path to demographics file.")

prior_med = _read_table('microbiology_cultures_prior_med.csv', "Incorrect path to prior medications file.")
prior_procedures = _read_table('microbiology_cultures_priorprocedures.csv', "Incorrect path to prior procedures file.")
ward_info = _read_table('microbiology_cultures_ward_info.csv', "Incorrect path to ward info file.")
microbial_resistance = _read_table(
    'microbiology_cultures_microbial_resistance.csv',
    "Incorrect path to microbial resistance file.",
)
cohort = _read_table('microbiology_cultures_cohort.csv', "Incorrect path to cohort file.")
antibiotic_class_exposure = _read_table(
    'microbiology_cultures_antibiotic_class_exposure.csv',
    "Incorrect path to antibiotic class exposure file.",
)
implied_susceptibility = _read_table(
    'microbiology_cultures_implied_susceptibility.csv',
    "Incorrect path to implied susceptibility file.",
)
nursing_home_visits = _read_table(
    'microbiology_cultures_nursing_home_visits.csv',
    "Incorrect path to nursing home visits file.",
)


# Inpatients only

In [7]:
# Keep positive cultures that were ordered in the inpatient setting, derive the
# order year from the first four characters of the timestamp string, and reduce
# the table to one row per distinct culture order.
cohort = (
    cohort.loc[(cohort['was_positive'] == 1) & (cohort['ordering_mode'] == 'Inpatient')]
    .drop(columns=['ordering_mode', 'was_positive'])
    .assign(year=lambda frame: [stamp[:4] for stamp in frame['order_time_jittered_utc']])
    [['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded',
      'order_time_jittered_utc', 'culture_description', 'year']]
    .drop_duplicates()
)

In [8]:
# The literal string 'Null' marks a missing value in this extract; rows without
# an organism or without an antibiotic cannot be used.
implied_susceptibility = implied_susceptibility.loc[
    implied_susceptibility['organism'].ne('Null')
    & implied_susceptibility['antibiotic'].ne('Null')
]
implied_susceptibility.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_proc_id_coded,organism,antibiotic,susceptibility,implied_susceptibility
0,JC2673551,131337997983,831009441,ENTEROBACTER CLOACAE COMPLEX,imipenem,Null,Susceptible
1,JC616267,131007285560,358749950,ENTEROBACTER CLOACAE,imipenem,Null,Null
2,JC517471,131024758159,420255585,ENTEROBACTER ASBURIAE,doripenem,Null,Null
3,JC1931399,131200817434,512398445,MYCOBACTERIUM AVIUM COMPLEX,Clarithromycin,Susceptible,Null
4,JC605042,131280233261,642349435,KLEBSIELLA OXYTOCA,Ceftolozane/Tazobactam,Susceptible,Null


In [None]:

# Keep a row when at least one of the two result columns holds a value
# (i.e. is not the 'Null' placeholder).
implied_susceptibility = implied_susceptibility.loc[
    implied_susceptibility['susceptibility'].ne('Null')
    | implied_susceptibility['implied_susceptibility'].ne('Null')
]


In [10]:
# Where the implied result is the 'Null' placeholder, fall back to the value in
# the 'susceptibility' column of the same row.
implied_susceptibility = implied_susceptibility.assign(
    implied_susceptibility=implied_susceptibility['implied_susceptibility'].mask(
        implied_susceptibility['implied_susceptibility'] == 'Null',
        implied_susceptibility['susceptibility'],
    )
)
implied_susceptibility = implied_susceptibility[
    ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded',
     'organism', 'antibiotic', 'implied_susceptibility']
]

# Only these three result labels are retained; every other label is dropped.
implied_susceptibility = implied_susceptibility[
    implied_susceptibility['implied_susceptibility'].isin(['Susceptible', 'Resistant', 'Non Susceptible'])
]

# Collapse 'Non Susceptible' into 'Resistant' so the label is binary.
implied_susceptibility = implied_susceptibility.assign(
    implied_susceptibility=implied_susceptibility['implied_susceptibility'].where(
        implied_susceptibility['implied_susceptibility'] != 'Non Susceptible',
        'Resistant',
    )
)
implied_susceptibility['implied_susceptibility'].unique()

array(['Susceptible', 'Resistant'], dtype=object)

In [11]:
# Attach every organism/antibiotic result to its culture order. The left join
# leaves cultures without a result as NaN, which the filter below removes.
df = pd.merge(
    cohort,
    implied_susceptibility,
    how='left',
    on=['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded'],
)
df = df.loc[df['implied_susceptibility'].isin(['Susceptible', 'Resistant'])]
df.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_proc_id_coded,order_time_jittered_utc,culture_description,year,organism,antibiotic,implied_susceptibility
0,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Amikacin,Susceptible
1,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Cefepime,Susceptible
2,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Ertapenem,Susceptible
3,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Meropenem,Susceptible
4,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Ampicillin,Resistant


In [12]:
df = pd.merge(
    df,
    demographics,
    how='left',
    on=['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded'],
)

# 'Null' is the missing-value placeholder in both demographic columns.
for demographic_column in ('age', 'gender'):
    df.loc[df[demographic_column] == 'Null', demographic_column] = None

# Gender arrives as the strings '0' / '1'; store the integer instead.
for raw_gender, gender_code in (('0', 0), ('1', 1)):
    df.loc[df['gender'] == raw_gender, 'gender'] = gender_code

# Ordinal encoding of the age bands: position in this tuple (starting at 1) is
# the code. NOTE(review): the first three labels contain an en dash and the
# rest a plain hyphen; they are reproduced exactly as they were, presumably to
# match the raw data -- confirm against the demographics file.
age_bands = (
    '18–24 years',
    '25–34 years',
    '35–44 years',
    '45-54 years',
    '55-64 years',
    '65-74 years',
    '75-84 years',
    '85-89 years',
    'above 90',
)
for age_code, band_label in enumerate(age_bands, start=1):
    df.loc[df['age'] == band_label, 'age'] = age_code

# Raises if any age is still missing or unmapped at this point.
df['age'] = df['age'].astype(int)
df['age'].unique()

array([7, 3, 6, 5, 4, 9, 8, 2, 1])

In [13]:
# Reduce the ward table to the culture keys plus the four ward flags, one row
# per distinct combination.
ward_info = ward_info[
    ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded',
     'hosp_ward_IP', 'hosp_ward_OP', 'hosp_ward_ER', 'hosp_ward_ICU']
].drop_duplicates()

# Replace the 'Null' placeholder with a real missing value in every column.
for ward_column in list(ward_info.columns):
    ward_info.loc[ward_info[ward_column] == 'Null', ward_column] = None
ward_info['hosp_ward_IP'].unique()

array([0., 1.])

In [None]:
# Keep the culture keys and the ADI score, one row per distinct combination.
adi_scores = adi_scores[
    ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'adi_score']
].drop_duplicates()

# Replace the 'Null' placeholder with a real missing value in every column.
for adi_column in list(adi_scores.columns):
    adi_scores.loc[adi_scores[adi_column] == 'Null', adi_column] = None

In [15]:
# Left-join the ward flags, then the ADI score, onto the modelling frame using
# the three culture-order keys.
for extra_table in (ward_info, adi_scores):
    df = pd.merge(
        df,
        extra_table,
        how='left',
        on=['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded'],
    )
df.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_proc_id_coded,order_time_jittered_utc,culture_description,year,organism,antibiotic,implied_susceptibility,age,gender,hosp_ward_IP,hosp_ward_OP,hosp_ward_ER,hosp_ward_ICU,adi_score
0,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Amikacin,Susceptible,7,0,1.0,0.0,1.0,0.0,2
1,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Cefepime,Susceptible,7,0,1.0,0.0,1.0,0.0,2
2,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Ertapenem,Susceptible,7,0,1.0,0.0,1.0,0.0,2
3,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Meropenem,Susceptible,7,0,1.0,0.0,1.0,0.0,2
4,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Ampicillin,Resistant,7,0,1.0,0.0,1.0,0.0,2


In [16]:
# Every column except the three culture keys is a vital-sign measurement:
# turn the 'Null' placeholder into a missing value and cast the column to float.
vitals_key_columns = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded']
measurement_columns = [name for name in vitals.columns.values if name not in vitals_key_columns]
for measurement in measurement_columns:
    vitals.loc[vitals[measurement] == 'Null', measurement] = None
    vitals[measurement] = vitals[measurement].astype('float')
vitals.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_proc_id_coded,Q25_heartrate,Q75_heartrate,median_heartrate,Q25_resprate,Q75_resprate,median_resprate,Q25_temp,...,first_diasbp,last_diasbp,last_sysbp,first_sysbp,last_temp,first_temp,last_resprate,first_resprate,last_heartrate,first_heartrate
0,JC2361817,131308278530,718222807,86.0,86.0,86.0,,,,,...,,80.0,109.0,,,,,,,86.0
1,JC2219930,131318072067,748261410,,,,,,,,...,75.0,75.0,121.0,121.0,,,,,,
2,JC2455425,131256420868,574712812,70.0,70.0,70.0,,,,,...,69.0,,,106.0,,,,,70.0,
3,JC1826078,131213257732,514463599,,,,,,,,...,90.0,90.0,120.0,120.0,,,,,,
4,JC1541734,131021676805,410407692,39.0,78.0,73.0,,,,,...,,42.0,62.0,,,,,,,80.0


In [17]:
# Left-join the vital-sign summary columns onto the modelling frame.
df = pd.merge(
    df,
    vitals,
    how='left',
    on=['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded'],
)

In [18]:
# Keep visits with a strictly positive offset to the culture.
# NOTE(review): the offset is presumably in days (180 is used as "6 months") -- confirm.
# .copy() gives an independent frame so adding a column below does not write
# into a slice of the original table (avoids SettingWithCopyWarning).
nursing_home_visits = nursing_home_visits[nursing_home_visits['nursing_home_visit_culture'] > 0].copy()

# 1 when the visit falls within the 180-unit window, else 0 (vectorised
# replacement for the earlier row-by-row lambda; same values).
nursing_home_visits['nursing_home_visits_within6month'] = (
    nursing_home_visits['nursing_home_visit_culture'].le(180).astype(int)
)
nursing_home_visits = nursing_home_visits[
    ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'nursing_home_visits_within6month']
]

# Count the qualifying visits per culture order. The aggregation is named by
# the string 'sum': passing the Python builtin `sum` is deprecated in pandas
# and emits a FutureWarning.
nursing_home_visits = (
    nursing_home_visits
    .groupby(['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded'])
    .agg(nursing_visits_within_6mo=('nursing_home_visits_within6month', 'sum'))
    .reset_index()
)
nursing_home_visits['nursing_visits_within_6mo'] = nursing_home_visits['nursing_visits_within_6mo'].astype(int)

# Cultures without any recorded visit keep NaN in the new column after this left join.
df = df.merge(nursing_home_visits, on=['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded'], how='left')
df.head()

  nursing_home_visits = nursing_home_visits.groupby(['anon_id','pat_enc_csn_id_coded','order_proc_id_coded']).agg(


Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_proc_id_coded,order_time_jittered_utc,culture_description,year,organism,antibiotic,implied_susceptibility,age,...,last_diasbp,last_sysbp,first_sysbp,last_temp,first_temp,last_resprate,first_resprate,last_heartrate,first_heartrate,nursing_visits_within_6mo
0,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Amikacin,Susceptible,7,...,61.0,121.0,,,,,18.0,,,
1,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Cefepime,Susceptible,7,...,61.0,121.0,,,,,18.0,,,
2,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Ertapenem,Susceptible,7,...,61.0,121.0,,,,,18.0,,,
3,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Meropenem,Susceptible,7,...,61.0,121.0,,,,,18.0,,,
4,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Ampicillin,Resistant,7,...,61.0,121.0,,,,,18.0,,,


In [None]:
# Drop rows without a usable procedure name or time offset ('Null' is the
# missing-value placeholder). The filters stay sequential on purpose: the
# numeric window comparison must only see rows that survived the 'Null' check.
prior_procedures = prior_procedures[
    prior_procedures['procedure_description'].notna()
    & (prior_procedures['procedure_description'] != 'Null')
]
prior_procedures = prior_procedures[
    prior_procedures['procedure_time_to_culturetime'].notna()
    & (prior_procedures['procedure_time_to_culturetime'] != 'Null')
]
# Keep procedures with an offset in (0, 180].
# NOTE(review): presumably days before the culture -- confirm the unit.
prior_procedures = prior_procedures[
    (prior_procedures['procedure_time_to_culturetime'] > 0)
    & (prior_procedures['procedure_time_to_culturetime'] <= 180)
]
procedures = prior_procedures.procedure_description.unique()

# The six procedure types that become model features. Building the indicator
# columns from this fixed list (instead of from whatever values survive the
# filters) guarantees every feature column exists, so the aggregation below
# cannot fail with a KeyError when one type is absent from the data.
tracked_procedures = [
    'urethral_catheter',
    'surgical_procedure',
    'mechvent',
    'cvc',
    'parenteral_nutrition',
    'dialysis',
]
columns_to_sum = [f"{name}_within_6mo" for name in tracked_procedures]

# One 0/1 indicator per tracked procedure. Every remaining row is already
# inside the window, so the indicator is just "is this row that procedure".
# assign() returns a new frame, so nothing is written into a filtered slice.
prior_procedures = prior_procedures.assign(**{
    f"{name}_within_6mo": (prior_procedures['procedure_description'] == name).astype(int)
    for name in tracked_procedures
})

# Count each procedure type per culture order. groupby + reset_index already
# yields one row per key in the order keys + columns_to_sum, so no further
# column selection or de-duplication is needed.
agg_dict = {col: (col, 'sum') for col in columns_to_sum}
prior_procedures = (
    prior_procedures
    .groupby(['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded'])
    .agg(**agg_dict)
    .reset_index()
)
prior_procedures[columns_to_sum] = prior_procedures[columns_to_sum].astype(int)

In [20]:
# A left merge appends the right-hand table's non-key columns after all
# existing columns, so the current width is the position of the first
# procedure column. Computing it here replaces a hard-coded offset (42) that
# silently breaks when an upstream table gains or loses a column.
first_new_column = df.shape[1]
df = df.merge(prior_procedures, on=['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded'], how='left')

# Cultures with no qualifying procedure have NaN after the left join; for
# these count columns that means zero. Only the newly added columns are filled.
df.iloc[:, first_new_column:] = df.iloc[:, first_new_column:].fillna(0)
df.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_proc_id_coded,order_time_jittered_utc,culture_description,year,organism,antibiotic,implied_susceptibility,age,...,first_resprate,last_heartrate,first_heartrate,nursing_visits_within_6mo,urethral_catheter_within_6mo,surgical_procedure_within_6mo,mechvent_within_6mo,cvc_within_6mo,parenteral_nutrition_within_6mo,dialysis_within_6mo
0,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Amikacin,Susceptible,7,...,18.0,,,,0.0,0.0,0.0,0.0,0.0,0.0
1,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Cefepime,Susceptible,7,...,18.0,,,,0.0,0.0,0.0,0.0,0.0,0.0
2,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Ertapenem,Susceptible,7,...,18.0,,,,0.0,0.0,0.0,0.0,0.0,0.0
3,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Meropenem,Susceptible,7,...,18.0,,,,0.0,0.0,0.0,0.0,0.0,0.0
4,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Ampicillin,Resistant,7,...,18.0,,,,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Remove rows with a missing organism or antibiotic ('Null' is the placeholder).
microbial_resistance = microbial_resistance.loc[
    microbial_resistance['organism'].notna() & microbial_resistance['organism'].ne('Null')
]
microbial_resistance = microbial_resistance.loc[
    microbial_resistance['antibiotic'].notna() & microbial_resistance['antibiotic'].ne('Null')
]
# Keep records whose offset to the culture lies in (0, 180].
microbial_resistance = microbial_resistance.loc[
    microbial_resistance['resistant_time_to_culturetime'].gt(0)
    & microbial_resistance['resistant_time_to_culturetime'].le(180)
]

# filter for top 10 most common microbes
common_organisms = [
    'ESCHERICHIA COLI',
    'PSEUDOMONAS AERUGINOSA',
    'MUCOID PSEUDOMONAS AERUGINOSA',
    'KLEBSIELLA PNEUMONIAE',
    'ACHROMOBACTER XYLOSOXIDANS',
    'STAPHYLOCOCCUS AUREUS',
    'PSEUDOMONAS AERUGINOSA (NON-MUCOID CF)',
    'ENTEROCOCCUS SPECIES',
    'ENTEROBACTER CLOACAE COMPLEX',
    'PROTEUS MIRABILIS',
]
microbial_resistance = microbial_resistance[
    ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'organism', 'antibiotic']
]
microbial_resistance = microbial_resistance.loc[
    microbial_resistance['organism'].isin(common_organisms)
].drop_duplicates()

# One combined "<organism>_<antibiotic>" label per row; it becomes a feature
# column name once the table is pivoted.
microbial_resistance = microbial_resistance.assign(
    organism_resistance_antibiotic=microbial_resistance['organism'] + '_' + microbial_resistance['antibiotic']
)
microbial_resistance = microbial_resistance[
    ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'organism_resistance_antibiotic']
].drop_duplicates()

In [None]:
# Pivot to one row per culture order and one column per organism/antibiotic
# label; each cell is the number of matching rows (0 where there is none).
resistance_counts = microbial_resistance.groupby(
    ["anon_id", "pat_enc_csn_id_coded", "order_proc_id_coded", "organism_resistance_antibiotic"]
).size()
microbial_resistance = (
    resistance_counts
    .unstack(fill_value=0)
    .reset_index()
    .rename_axis(columns=None)  # drop the leftover columns-axis name from unstack
)

In [23]:
# A left merge appends the right-hand table's non-key columns after all
# existing columns, so the current width is the position of the first
# resistance column. Computing it here replaces a hard-coded offset (48) that
# silently breaks when an upstream table gains or loses a column.
first_new_column = df.shape[1]
df = df.merge(microbial_resistance, on=['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded'], how='left')

# Cultures with no prior resistance record have NaN after the left join; for
# these count columns that means zero. Only the newly added columns are filled.
df.iloc[:, first_new_column:] = df.iloc[:, first_new_column:].fillna(0)
df.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_proc_id_coded,order_time_jittered_utc,culture_description,year,organism,antibiotic,implied_susceptibility,age,...,STAPHYLOCOCCUS AUREUS_Cefazolin,STAPHYLOCOCCUS AUREUS_Ciprofloxacin,STAPHYLOCOCCUS AUREUS_Erythromycin,STAPHYLOCOCCUS AUREUS_Gentamicin,STAPHYLOCOCCUS AUREUS_Levofloxacin,STAPHYLOCOCCUS AUREUS_Linezolid,STAPHYLOCOCCUS AUREUS_Moxifloxacin,STAPHYLOCOCCUS AUREUS_Penicillin,STAPHYLOCOCCUS AUREUS_Rifampin,STAPHYLOCOCCUS AUREUS_Vancomycin
0,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Amikacin,Susceptible,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Cefepime,Susceptible,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Ertapenem,Susceptible,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Meropenem,Susceptible,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Ampicillin,Resistant,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Keep prior organisms whose offset to the culture lies in (0, 180], then reduce
# to the distinct (culture order, organism) pairs.
organism_offset = prior_infecting_organisms['prior_infecting_organism_days_to_culutre']
prior_infecting_organisms = prior_infecting_organisms.loc[
    organism_offset.le(180) & organism_offset.gt(0),
    ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'prior_organism'],
].drop_duplicates()

In [None]:
# Pivot to one row per culture order and one column per prior organism; each
# cell is the number of matching rows (0 where there is none).
organism_key_columns = ["anon_id", "pat_enc_csn_id_coded", "order_proc_id_coded"]
prior_infecting_organisms = (
    prior_infecting_organisms
    .groupby(organism_key_columns + ["prior_organism"])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)
prior_infecting_organisms.columns.name = None

# After reset_index the key columns come first and every later column is an
# organism. Deriving the boundary from the key count (rather than a fixed
# 3..19 window) prefixes and checks every organism column, however many
# distinct organisms the data contains.
first_organism_column = len(organism_key_columns)
prior_infecting_organisms.columns = [
    f"prior_infected_{col}" if position >= first_organism_column else col
    for position, col in enumerate(prior_infecting_organisms.columns)
]

# Keep only culture orders with at least one prior organism recorded.
prior_infecting_organisms = prior_infecting_organisms[
    prior_infecting_organisms.iloc[:, first_organism_column:].gt(0).any(axis=1)
]

In [26]:
# A left merge appends the right-hand table's non-key columns after all
# existing columns, so the current width is the position of the first
# prior-organism column. Computing it here replaces a hard-coded offset (79)
# that silently breaks when an upstream table gains or loses a column.
first_new_column = df.shape[1]
df = df.merge(prior_infecting_organisms, on=['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded'], how='left')

# Cultures with no prior organism have NaN after the left join; for these
# count columns that means zero. Only the newly added columns are filled.
df.iloc[:, first_new_column:] = df.iloc[:, first_new_column:].fillna(0)
df = df.drop_duplicates()
df.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_proc_id_coded,order_time_jittered_utc,culture_description,year,organism,antibiotic,implied_susceptibility,age,...,prior_infected_Escherichia,prior_infected_Klebsiella,prior_infected_Morganella,prior_infected_Proteus,prior_infected_Providencia,prior_infected_Pseudomonas,prior_infected_Serratia,prior_infected_Staphylococcus,prior_infected_Stenotrophomonas,prior_infected_Streptococcus
0,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Amikacin,Susceptible,7,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Cefepime,Susceptible,7,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Ertapenem,Susceptible,7,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Meropenem,Susceptible,7,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Ampicillin,Resistant,7,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Keep medications whose offset to the culture lies in (0, 180], then reduce to
# the distinct (culture order, medication) pairs.
medication_offset = prior_med['medication_time_to_culturetime']
prior_med = prior_med.loc[
    medication_offset.gt(0) & medication_offset.le(180),
    ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'medication_name'],
].drop_duplicates()

In [None]:
# Pivot to one row per culture order and one column per medication; each cell
# is the number of matching rows (0 where there is none).
medication_key_columns = ["anon_id", "pat_enc_csn_id_coded", "order_proc_id_coded"]
prior_med = (
    prior_med
    .groupby(medication_key_columns + ["medication_name"])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)
prior_med.columns.name = None

# After reset_index the key columns come first and every later column is a
# medication. The boundary is derived from the key count so that the prefix
# and the row filter always cover the same columns: the earlier fixed windows
# disagreed by one (positions 3..23 were prefixed, but only 3..22 were checked).
first_medication_column = len(medication_key_columns)
prior_med.columns = [
    f"prior_med_{col}" if position >= first_medication_column else col
    for position, col in enumerate(prior_med.columns)
]

# Keep only culture orders with at least one prior medication recorded.
prior_med = prior_med[prior_med.iloc[:, first_medication_column:].gt(0).any(axis=1)]

In [32]:
# A left merge appends the right-hand table's non-key columns after all
# existing columns, so the current width is the position of the first
# prior-medication column. Computing it here replaces a hard-coded offset (95)
# that silently breaks when an upstream table gains or loses a column.
first_new_column = df.shape[1]
df = df.merge(prior_med, on=['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded'], how='left')

# Cultures with no prior medication have NaN after the left join; for these
# count columns that means zero. Only the newly added columns are filled.
df.iloc[:, first_new_column:] = df.iloc[:, first_new_column:].fillna(0)
df.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_proc_id_coded,order_time_jittered_utc,culture_description,year,organism,antibiotic,implied_susceptibility,age,...,prior_med_Colistin,prior_med_Ertapenem,prior_med_Gentamicin,prior_med_Isoniazid,prior_med_Levaquin,prior_med_Levofloxacin,prior_med_Linezolid,prior_med_Metronidazole,prior_med_Penicillin,prior_med_Vancomycin
0,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Amikacin,Susceptible,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Cefepime,Susceptible,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Ertapenem,Susceptible,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Meropenem,Susceptible,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Ampicillin,Resistant,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Keep exposures whose offset to the culture lies in (0, 180], then reduce to
# the distinct (culture order, antibiotic class) pairs.
exposure_offset = antibiotic_class_exposure['time_to_culturetime']
antibiotic_class_exposure = antibiotic_class_exposure.loc[
    exposure_offset.gt(0) & exposure_offset.le(180),
    ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'antibiotic_class'],
].drop_duplicates()

In [None]:
# Pivot to one row per culture order and one column per antibiotic class; each
# cell is the number of matching rows (0 where there is none).
exposure_counts = antibiotic_class_exposure.groupby(
    ["anon_id", "pat_enc_csn_id_coded", "order_proc_id_coded", "antibiotic_class"]
).size()
antibiotic_class_exposure = (
    exposure_counts
    .unstack(fill_value=0)
    .reset_index()
    .rename_axis(columns=None)  # drop the leftover columns-axis name from unstack
)

# The first three columns are the culture keys; every later column is an
# antibiotic class and receives the feature prefix.
antibiotic_class_exposure.columns = [
    col if position < 3 else f"prior_abx_class_{col}"
    for position, col in enumerate(antibiotic_class_exposure.columns)
]

In [35]:
# A left merge appends the right-hand table's non-key columns after all
# existing columns, so the current width is the position of the first
# antibiotic-class column. Computing it here replaces a hard-coded offset (115)
# that silently breaks when an upstream table gains or loses a column.
first_new_column = df.shape[1]
df = df.merge(antibiotic_class_exposure, on=['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded'], how='left')

# Cultures with no prior exposure have NaN after the left join; for these
# count columns that means zero. Only the newly added columns are filled.
df.iloc[:, first_new_column:] = df.iloc[:, first_new_column:].fillna(0)
df.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_proc_id_coded,order_time_jittered_utc,culture_description,year,organism,antibiotic,implied_susceptibility,age,...,prior_abx_class_Glycopeptide,prior_abx_class_Macrolide Lincosamide,prior_abx_class_Monobactam,prior_abx_class_Nitrofuran,prior_abx_class_Nitroimidazole,prior_abx_class_Oxazolidinone,"prior_abx_class_Polymyxin, Lipopeptide",prior_abx_class_Sulfonamide,prior_abx_class_Tetracycline,prior_abx_class_Urinary Antiseptic
0,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Amikacin,Susceptible,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Cefepime,Susceptible,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Ertapenem,Susceptible,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Meropenem,Susceptible,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,JC2744063,131368600230,928257722,2023-12-23 22:29:00+00:00,URINE,2023,KLEBSIELLA PNEUMONIAE,Ampicillin,Resistant,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Left-join the lab columns onto the modelling frame; missing labs stay NaN.
df = pd.merge(
    df,
    labs,
    how='left',
    on=['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded'],
)
df.head()

In [36]:
# Persist the feature table built so far (without the comorbidity columns that
# are added further down) to the working directory, omitting the row index.
df.to_csv(path_or_buf='Model1_AllABX_fixedTime0.csv', index=False)

In [None]:
# Keep comorbidity components whose start offset lies in [0, 180] and whose
# end offset is either negative or missing.
# NOTE(review): the sign convention of the two offsets is not visible here --
# confirm what a negative end offset means before relying on this filter.
component_start = comorbidities['comorbidity_component_start_days_culture']
component_end = comorbidities['comorbidity_component_end_days_culture']
start_in_window = (component_start >= 0) & (component_start <= 180)
end_negative_or_missing = component_end.isna() | (component_end < 0)
comorbidities = comorbidities[start_in_window & end_negative_or_missing]

# Drop rows without a component name ('Null' is the placeholder) and reduce to
# the distinct (culture order, component) pairs.
comorbidities = comorbidities[
    comorbidities['comorbidity_component'].notna()
    & comorbidities['comorbidity_component'].ne('Null')
]
comorbidities = comorbidities[
    ["anon_id", "pat_enc_csn_id_coded", "order_proc_id_coded", "comorbidity_component"]
].drop_duplicates()

# Pivot to one row per culture order and one column per comorbidity component;
# each cell is the number of matching rows (0 where there is none).
component_counts = comorbidities.groupby(
    ["anon_id", "pat_enc_csn_id_coded", "order_proc_id_coded", "comorbidity_component"]
).size()
comorbidities = (
    component_counts
    .unstack(fill_value=0)
    .reset_index()
    .rename_axis(columns=None)  # drop the leftover columns-axis name from unstack
)
print(comorbidities)

In [None]:
# A left merge appends the right-hand table's non-key columns after all
# existing columns, so the current width is the position of the first
# comorbidity column. Computing it here replaces a hard-coded offset (189)
# that silently breaks when an upstream table gains or loses a column.
first_new_column = df.shape[1]
df = df.merge(comorbidities, on=['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded'], how='left')

# Cultures with no qualifying comorbidity have NaN after the left join; for
# these count columns that means zero. Only the newly added columns are filled.
df.iloc[:, first_new_column:] = df.iloc[:, first_new_column:].fillna(0)


In [None]:
# Remove rows that are exact duplicates across every column.
df = df.drop_duplicates()

In [None]:
# Persist the final feature table, including the comorbidity component
# columns, to the working directory, omitting the row index.
df.to_csv(path_or_buf='Model1_with_Comorbidity_components.csv', index=False)