In [None]:
import pandas as pd
import lightgbm as lgb
from lightgbm import LGBMClassifier, Dataset, cv, train, early_stopping
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import __version__ as sklearn_version
from packaging import version
from multiprocessing import cpu_count
import numpy as np
import shap
import matplotlib.pyplot as plt
import polars as pl
import itertools
import seaborn as sns

# Feature preprocessing 

In [None]:
### data preparation

# Folder that holds the exported microbiology-culture CSV tables.
# Change this single constant to point the notebook at another copy of the data.
DATA_DIR = '/Users/fa/Documents/antimicrobial-susceptibility'


def load_table(file_name, missing_message, data_dir=None):
    """Read one CSV table from the data folder.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside the data folder.
    missing_message : str
        Text printed when the file does not exist.
    data_dir : str, optional
        Folder to read from; defaults to ``DATA_DIR``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or ``None`` when the file is missing. In that case
        the message is printed and loading of the other tables continues.
    """
    folder = DATA_DIR if data_dir is None else data_dir
    try:
        return pd.read_csv(f"{folder}/{file_name}")
    except FileNotFoundError:
        print(missing_message)
        return None


# Every table is loaded here. Previously all reads except the comorbidity one
# were wrapped in a triple-quoted string and therefore never executed, so the
# cells below only worked with variables left over from an earlier session.
comorbidities = load_table(
    'microbiology_cultures_comorbidity.csv',
    "Incorrect path to comorbidities file or comorbidities dataframe manipulation section not run first.",
)
adi_scores = load_table('microbiology_cultures_adi_scores.csv', "Incorrect path to ADI scores file.")
labs = load_table('microbiology_cultures_labs.csv', "Incorrect path to labs file.")
vitals = load_table('microbiology_cultures_vitals.csv', "Incorrect path to vitals file.")
prior_infecting_organisms = load_table(
    'microbiology_culture_prior_infecting_organism.csv',
    "Incorrect path to prior infecting organisms file.",
)
# subtype_exposure is loaded but not referenced by the cells visible in this section.
subtype_exposure = load_table(
    'microbiology_cultures_antibiotic_subtype_exposure.csv',
    "Incorrect path to subtype exposure file.",
)
demographics = load_table('microbiology_cultures_demographics.csv', "Incorrect path to demographics file.")
prior_med = load_table('microbiology_cultures_prior_med.csv', "Incorrect path to prior medications file.")
prior_procedures = load_table('microbiology_cultures_priorprocedures.csv', "Incorrect path to prior procedures file.")
ward_info = load_table('microbiology_cultures_ward_info.csv', "Incorrect path to ward info file.")
microbial_resistance = load_table(
    'microbiology_cultures_microbial_resistance.csv',
    "Incorrect path to microbial resistance file.",
)
cohort = load_table('microbiology_cultures_cohort.csv', "Incorrect path to cohort file.")
antibiotic_class_exposure = load_table(
    'microbiology_cultures_antibiotic_class_exposure.csv',
    "Incorrect path to antibiotic class exposure file.",
)
implied_susceptibility = load_table(
    'microbiology_cultures_implied_susceptibility.csv',
    "Incorrect path to implied susceptibility file.",
)
nursing_home_visits = load_table(
    'microbiology_cultures_nursing_home_visits.csv',
    "Incorrect path to nursing home visits file.",
)

# Positive inpatient cultures only

In [None]:
# Keep cultures flagged positive that were ordered in the 'Inpatient' mode.
is_positive_inpatient = (cohort['was_positive'] == 1) & (cohort['ordering_mode'] == 'Inpatient')

cohort = (
    cohort.loc[is_positive_inpatient]
    # .assign builds a new frame, so nothing is written into a filtered slice.
    # The year is the first four characters of the order timestamp
    # (assumes the column holds strings such as 'YYYY-...' — TODO confirm).
    .assign(year=lambda frame: frame['order_time_jittered_utc'].str[:4])
    # Selecting the columns also discards 'ordering_mode' and 'was_positive'.
    .loc[:, ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded',
             'order_time_jittered_utc', 'culture_description', 'year']]
    .drop_duplicates()
)
cohort.columns.values

In [None]:
# Discard rows whose organism or antibiotic is the literal 'Null' marker.
organism_given = implied_susceptibility['organism'].ne('Null')
antibiotic_given = implied_susceptibility['antibiotic'].ne('Null')
implied_susceptibility = implied_susceptibility.loc[organism_given & antibiotic_given]
implied_susceptibility.head()

In [None]:

# A row survives when at least one of the two result columns holds something
# other than the literal 'Null' marker.
measured_is_null = implied_susceptibility['susceptibility'].eq('Null')
implied_is_null = implied_susceptibility['implied_susceptibility'].eq('Null')
implied_susceptibility = implied_susceptibility.loc[~(measured_is_null & implied_is_null)]
implied_susceptibility

In [None]:
# Antibiotics kept in the table.
target_antibiotics = ['Cefazolin', 'Ceftriaxone', 'Cefepime', 'Piperacillin/Tazobactam', 'Ciprofloxacin']
implied_susceptibility = implied_susceptibility[implied_susceptibility['antibiotic'].isin(target_antibiotics)]

# Where the implied result is the 'Null' marker, use the value of the
# 'susceptibility' column from the same row instead.
implied_labels = implied_susceptibility['implied_susceptibility']
backfilled_labels = implied_labels.mask(implied_labels == 'Null', implied_susceptibility['susceptibility'])
implied_susceptibility = implied_susceptibility.assign(implied_susceptibility=backfilled_labels)

implied_susceptibility = implied_susceptibility[
    ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'organism', 'antibiotic', 'implied_susceptibility']
]

# Keep only the three recognised labels, then fold 'Non Susceptible' into 'Resistant'.
recognised = implied_susceptibility['implied_susceptibility'].isin(['Susceptible', 'Resistant', 'Non Susceptible'])
implied_susceptibility = implied_susceptibility[recognised]
implied_susceptibility = implied_susceptibility.assign(
    implied_susceptibility=implied_susceptibility['implied_susceptibility'].replace('Non Susceptible', 'Resistant')
)
implied_susceptibility['implied_susceptibility'].unique()

In [None]:
# Attach susceptibility results to each cohort culture, then drop cultures
# that ended up without a 'Susceptible'/'Resistant' label.
culture_keys = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded']
df = pd.merge(cohort, implied_susceptibility, how='left', on=culture_keys)
has_label = df['implied_susceptibility'].isin(['Susceptible', 'Resistant'])
df = df.loc[has_label]
df.head()

In [None]:
df = df.merge(demographics, on = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded'], how = 'left')

# Gender: the literal 'Null' marker becomes missing; '0'/'1' strings become integers.
df.loc[df.gender=='Null','gender']=None
df.loc[df.gender=='0','gender']=0
df.loc[df.gender=='1','gender']=1

# Ordinal code for every age bucket, keyed by the label written with a plain hyphen.
AGE_BUCKET_CODES = {
    '18-24 years': 1,
    '25-34 years': 2,
    '35-44 years': 3,
    '45-54 years': 4,
    '55-64 years': 5,
    '65-74 years': 6,
    '75-84 years': 7,
    '85-89 years': 8,
    'above 90': 9,
}


def _encode_age_bucket(label):
    """Translate one raw age label into its ordinal code.

    The 'Null' marker becomes ``None``. En dashes are normalised to hyphens
    so both spellings of a bucket match. Anything that is not a known label
    is returned unchanged so the numeric conversion below can reject it.
    """
    if not isinstance(label, str):
        return label
    if label == 'Null':
        return None
    return AGE_BUCKET_CODES.get(label.replace('\u2013', '-'), label)


# to_numeric raises ValueError on an unrecognised text label, so unexpected
# buckets still fail loudly instead of silently turning into missing values.
age_codes = pd.to_numeric(df['age'].map(_encode_age_bucket), errors='raise')

# Plain int when every culture has an age (as before). Otherwise use the
# nullable Int64 dtype, because a plain int cast raises on missing values
# ('Null' ages, or cultures without a demographics row after the left merge).
df['age'] = age_codes.astype(int) if age_codes.notna().all() else age_codes.astype('Int64')
df.age.unique()



In [None]:
# Reduce ward_info to the culture keys plus the four ward flags, one row per
# distinct combination.
ward_columns = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded',
                'hosp_ward_IP', 'hosp_ward_OP', 'hosp_ward_ER', 'hosp_ward_ICU']
ward_info = ward_info[ward_columns].drop_duplicates()

# Turn the literal 'Null' marker into a missing value in every column.
for column_name in ward_info.columns:
    is_null_marker = ward_info[column_name] == 'Null'
    ward_info.loc[is_null_marker, column_name] = None
ward_info['hosp_ward_IP'].unique()

In [None]:
# Reduce adi_scores to the culture keys plus the score, one row per distinct
# combination.
adi_columns = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'adi_score']
adi_scores = adi_scores[adi_columns].drop_duplicates()

# Turn the literal 'Null' marker into a missing value in every column.
for column_name in adi_scores.columns:
    is_null_marker = adi_scores[column_name] == 'Null'
    adi_scores.loc[is_null_marker, column_name] = None
adi_scores['adi_score'].unique()

In [None]:
# Left-join the ward flags and then the ADI score onto the modelling frame.
culture_keys = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded']
for extra_table in (ward_info, adi_scores):
    df = pd.merge(df, extra_table, how='left', on=culture_keys)
df.head()

In [None]:
# Every column except the three culture keys is cleaned: the literal 'Null'
# marker becomes missing and the column is cast to float.
identifier_columns = ('anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded')
measurement_columns = [name for name in vitals.columns if name not in identifier_columns]
for name in measurement_columns:
    is_null_marker = vitals[name] == 'Null'
    vitals.loc[is_null_marker, name] = None
    vitals[name] = vitals[name].astype('float')
vitals.head()


In [25]:
# Left-join the cleaned vitals onto the modelling frame by culture keys.
df = pd.merge(df, vitals, how='left', on=['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded'])

In [None]:
culture_keys = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded']

# Keep visits whose offset to the culture is non-negative
# (presumably days before the culture — TODO confirm the unit and sign).
nursing_home_visits = nursing_home_visits[nursing_home_visits.nursing_home_visit_culture >= 0]

nursing_home_visits = (
    nursing_home_visits
    # 1 when the offset is at most 180, else 0. Vectorised, and written via
    # .assign so no column is set on a filtered slice.
    .assign(
        nursing_home_visits_within6month=lambda visits: visits['nursing_home_visit_culture'].le(180).astype(int)
    )
    .groupby(culture_keys)
    # The string 'sum' replaces the builtin `sum`, whose use as a groupby
    # aggregation callable is deprecated in recent pandas.
    .agg(nursing_visits_within_6mo=('nursing_home_visits_within6month', 'sum'))
    .reset_index()
)
nursing_home_visits['nursing_visits_within_6mo'] = nursing_home_visits['nursing_visits_within_6mo'].astype(int)

# Cultures without a qualifying visit keep NaN in the new column after this left join.
df = df.merge(nursing_home_visits, on=culture_keys, how='left')
df.head()

In [None]:
culture_keys = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded']

# Drop rows without a procedure description or without a time offset.
prior_procedures=prior_procedures[(prior_procedures.procedure_description!='Null')&(prior_procedures.procedure_description.notna())]
prior_procedures=prior_procedures[(prior_procedures.procedure_time_to_culturetime.notna())&(prior_procedures.procedure_time_to_culturetime!='Null')]
# Keep offsets in [0, 180]. .copy() gives an independent frame, so the
# indicator columns added below are not written into a filtered slice.
prior_procedures=prior_procedures[(prior_procedures.procedure_time_to_culturetime>=0)&(prior_procedures.procedure_time_to_culturetime<=180)].copy()
procedures = prior_procedures.procedure_description.unique()

# One indicator column per expected procedure. The columns are created from
# this fixed list, not from the values found in the data, so the aggregation
# below cannot hit a KeyError when a procedure is absent from the window.
# The 0..180 filter above already guarantees every remaining row is "within 6mo".
columns_to_sum = ['urethral_catheter_within_6mo','surgical_procedure_within_6mo','mechvent_within_6mo','cvc_within_6mo','parenteral_nutrition_within_6mo','dialysis_within_6mo']
for column in columns_to_sum:
    procedure_name = column[:-len('_within_6mo')]
    prior_procedures[column] = (prior_procedures['procedure_description'] == procedure_name).astype(int)

# Count the procedures of each kind per culture. The result has exactly the
# key columns followed by columns_to_sum, one row per culture.
agg_dict = {col: (col, 'sum') for col in columns_to_sum}
prior_procedures = prior_procedures.groupby(culture_keys).agg(**agg_dict).reset_index()
for col in columns_to_sum:
    prior_procedures[col]=prior_procedures[col].astype(int)
prior_procedures

In [None]:
# merge() appends the right-hand table's non-key columns after the existing
# ones, so the current width is the position of the first procedure column.
first_new_column = df.shape[1]
df = df.merge(prior_procedures, on =['anon_id','pat_enc_csn_id_coded','order_proc_id_coded'], how = 'left')

# Cultures with no matching procedure row get 0 instead of NaN.
# NOTE(review): this replaces a hard-coded offset of 42, which breaks silently
# when an upstream table gains or loses a column. Confirm 42 was meant to be
# the first procedure column.
df.iloc[:, first_new_column:] = df.iloc[:, first_new_column:].fillna(0)
df.head()

In [29]:
culture_keys = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded']
# Antibiotics kept in the resistance table.
target_antibiotics = ['Cefazolin', 'Ceftriaxone', 'Cefepime', 'Piperacillin/Tazobactam', 'Ciprofloxacin']
# Organisms kept (described by the author as the ten most common microbes).
common_organisms = [
    'ESCHERICHIA COLI', 'PSEUDOMONAS AERUGINOSA', 'MUCOID PSEUDOMONAS AERUGINOSA',
    'KLEBSIELLA PNEUMONIAE', 'ACHROMOBACTER XYLOSOXIDANS', 'STAPHYLOCOCCUS AUREUS',
    'PSEUDOMONAS AERUGINOSA (NON-MUCOID CF)', 'ENTEROCOCCUS SPECIES',
    'ENTEROBACTER CLOACAE COMPLEX', 'PROTEUS MIRABILIS',
]

# Rows need a real organism and antibiotic (neither 'Null' nor missing) ...
organism_given = microbial_resistance['organism'].ne('Null') & microbial_resistance['organism'].notna()
microbial_resistance = microbial_resistance[organism_given]
antibiotic_given = microbial_resistance['antibiotic'].ne('Null') & microbial_resistance['antibiotic'].notna()
microbial_resistance = microbial_resistance[antibiotic_given]
# ... and a resistance offset between 0 and 180 inclusive.
inside_window = microbial_resistance['resistant_time_to_culturetime'].between(0, 180)
microbial_resistance = microbial_resistance[inside_window]
microbial_resistance = microbial_resistance[culture_keys + ['organism', 'antibiotic']]

# Restrict to the antibiotics and organisms listed above.
microbial_resistance = microbial_resistance[microbial_resistance['antibiotic'].isin(target_antibiotics)]
microbial_resistance = microbial_resistance[microbial_resistance['organism'].isin(common_organisms)]

# One row per (culture, organism, antibiotic), labelled '<organism>_<antibiotic>'.
microbial_resistance = microbial_resistance.drop_duplicates()
microbial_resistance = microbial_resistance.assign(
    organism_resistance_antibiotic=microbial_resistance['organism'] + '_' + microbial_resistance['antibiotic']
)
microbial_resistance = microbial_resistance[culture_keys + ['organism_resistance_antibiotic']].drop_duplicates()


In [None]:
# Count rows per (culture, organism/antibiotic label), then spread the labels
# into columns; combinations that never occur become 0.
resistance_counts = microbial_resistance.groupby(
    ["anon_id", "pat_enc_csn_id_coded", "order_proc_id_coded", "organism_resistance_antibiotic"]
).size()
microbial_resistance = resistance_counts.unstack(fill_value=0).reset_index()
# Clear the leftover name of the column axis.
microbial_resistance.columns.name = None
print(microbial_resistance)


In [None]:
# merge() appends the right-hand table's non-key columns after the existing
# ones, so the current width is the position of the first resistance column.
first_new_column = df.shape[1]
df = df.merge(microbial_resistance, on = ['anon_id','pat_enc_csn_id_coded','order_proc_id_coded'], how = 'left')

# Cultures with no matching resistance row get 0 instead of NaN.
# NOTE(review): this replaces a hard-coded offset of 48, which breaks silently
# when an upstream table changes width. Confirm 48 was not meant to reach
# columns that existed before this merge.
df.iloc[:, first_new_column:] = df.iloc[:, first_new_column:].fillna(0)
df.head()

In [None]:
# Keep prior organisms whose offset to the culture lies between 0 and 180
# inclusive (the column name's spelling comes from the data file).
inside_window = prior_infecting_organisms['prior_infecting_organism_days_to_culutre'].between(0, 180)
prior_infecting_organisms = prior_infecting_organisms[inside_window]

# One row per distinct (culture, prior organism).
prior_infecting_organisms = prior_infecting_organisms[
    ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'prior_organism']
].drop_duplicates()
prior_infecting_organisms

In [None]:
culture_keys = ["anon_id", "pat_enc_csn_id_coded", "order_proc_id_coded"]
n_keys = len(culture_keys)

# Count rows per (culture, prior organism) and spread the organisms into
# columns; organisms a culture never had become 0.
prior_infecting_organisms = prior_infecting_organisms.groupby(culture_keys + ["prior_organism"])\
              .size()\
              .unstack(fill_value=0)\
              .reset_index()
prior_infecting_organisms.columns.name = None

# Prefix every organism column (everything after the key columns). The old
# bounds of 3..19 left any further organism column unprefixed, unlike the
# open-ended rename used for the antibiotic classes.
prior_infecting_organisms.columns = [
    f"prior_infected_{col}" if i >= n_keys else col
    for i, col in enumerate(prior_infecting_organisms.columns)
]
# Keep cultures with at least one organism count, checked over ALL organism
# columns. The old fixed slice 3:20 would drop cultures whose only organism
# sat in a later column.
has_any_organism = prior_infecting_organisms.iloc[:, n_keys:].gt(0).any(axis=1)
prior_infecting_organisms = prior_infecting_organisms[has_any_organism]
print(prior_infecting_organisms)


In [None]:
# merge() appends the right-hand table's non-key columns after the existing
# ones, so the current width is the position of the first prior-organism column.
first_new_column = df.shape[1]
df = df.merge(prior_infecting_organisms, on = ['anon_id','pat_enc_csn_id_coded','order_proc_id_coded'], how = 'left')

# Cultures with no matching prior-organism row get 0 instead of NaN.
# NOTE(review): this replaces a hard-coded offset of 79, which breaks silently
# when an upstream table changes width. Confirm 79 was not meant to reach
# columns that existed before this merge.
df.iloc[:, first_new_column:] = df.iloc[:, first_new_column:].fillna(0)
df = df.drop_duplicates()
df.head()

In [None]:
# Keep medications whose offset to the culture lies between 0 and 180 inclusive.
inside_window = prior_med['medication_time_to_culturetime'].between(0, 180)
prior_med = prior_med[inside_window]

# One row per distinct (culture, medication name).
prior_med = prior_med[
    ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'medication_name']
].drop_duplicates()
prior_med

In [None]:
culture_keys = ["anon_id", "pat_enc_csn_id_coded", "order_proc_id_coded"]
n_keys = len(culture_keys)

# Count rows per (culture, medication) and spread the medications into
# columns; medications a culture never had become 0.
prior_med = prior_med.groupby(culture_keys + ["medication_name"])\
              .size()\
              .unstack(fill_value=0)\
              .reset_index()
prior_med.columns.name = None

# Prefix every medication column (everything after the key columns). The old
# code renamed positions 3..23 but filtered on 3:23, one column fewer, so the
# rename and the row filter disagreed about how many medication columns exist.
prior_med.columns = [
    f"prior_med_{col}" if i >= n_keys else col
    for i, col in enumerate(prior_med.columns)
]
# Keep cultures with at least one medication count, checked over ALL
# medication columns.
has_any_medication = prior_med.iloc[:, n_keys:].gt(0).any(axis=1)
prior_med = prior_med[has_any_medication]
print(prior_med)

In [None]:
# merge() appends the right-hand table's non-key columns after the existing
# ones, so the current width is the position of the first medication column.
first_new_column = df.shape[1]
df = df.merge(prior_med, on = ['anon_id','pat_enc_csn_id_coded','order_proc_id_coded'],how='left')

# Cultures with no matching medication row get 0 instead of NaN.
# NOTE(review): this replaces a hard-coded offset of 95, which breaks silently
# when an upstream table changes width. Confirm 95 was not meant to reach
# columns that existed before this merge.
df.iloc[:, first_new_column:] = df.iloc[:, first_new_column:].fillna(0)
df.head()

In [None]:
# Keep exposures whose offset to the culture lies between 0 and 180 inclusive.
inside_window = antibiotic_class_exposure['time_to_culturetime'].between(0, 180)
antibiotic_class_exposure = antibiotic_class_exposure[inside_window]

# One row per distinct (culture, antibiotic class).
antibiotic_class_exposure = antibiotic_class_exposure[
    ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'antibiotic_class']
].drop_duplicates()
antibiotic_class_exposure

In [None]:
# Count rows per (culture, antibiotic class), then spread the classes into
# columns; classes a culture was never exposed to become 0.
class_counts = antibiotic_class_exposure.groupby(
    ["anon_id", "pat_enc_csn_id_coded", "order_proc_id_coded", "antibiotic_class"]
).size()
antibiotic_class_exposure = class_counts.unstack(fill_value=0).reset_index()
antibiotic_class_exposure.columns.name = None

# The first three columns are the culture keys; every later column is a class
# and receives the 'prior_abx_class_' prefix.
prefixed_names = []
for position, label in enumerate(antibiotic_class_exposure.columns):
    prefixed_names.append(label if position < 3 else f"prior_abx_class_{label}")
antibiotic_class_exposure.columns = prefixed_names
print(antibiotic_class_exposure)

In [None]:
# merge() appends the right-hand table's non-key columns after the existing
# ones, so the current width is the position of the first antibiotic-class column.
first_new_column = df.shape[1]
df = df.merge(antibiotic_class_exposure, on = ['anon_id','pat_enc_csn_id_coded','order_proc_id_coded'], how = 'left')

# Cultures with no matching exposure row get 0 instead of NaN.
# NOTE(review): this replaces a hard-coded offset of 115, which breaks silently
# when an upstream table changes width. Confirm 115 was not meant to reach
# columns that existed before this merge.
df.iloc[:, first_new_column:] = df.iloc[:, first_new_column:].fillna(0)
df.head()

In [None]:
# Left-join the lab table by culture keys; missing values are left as they are.
df = pd.merge(df, labs, how='left', on=['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded'])
df.head()

In [None]:
culture_keys = ["anon_id", "pat_enc_csn_id_coded", "order_proc_id_coded"]

# Row filter: start offset between 0 and 180 inclusive, and end offset either
# negative or missing.
start_offset = comorbidities['comorbidity_component_start_days_culture']
end_offset = comorbidities['comorbidity_component_end_days_culture']
started_inside_window = start_offset.between(0, 180)
end_negative_or_missing = end_offset.lt(0) | end_offset.isna()
comorbidities = comorbidities[started_inside_window & end_negative_or_missing]

# The component must be a real value (neither 'Null' nor missing).
component_given = comorbidities['comorbidity_component'].ne('Null') & comorbidities['comorbidity_component'].notna()
comorbidities = comorbidities[component_given]
comorbidities = comorbidities[culture_keys + ["comorbidity_component"]].drop_duplicates()

# Count rows per (culture, component), then spread the components into
# columns; components a culture does not have become 0.
component_counts = comorbidities.groupby(culture_keys + ["comorbidity_component"]).size()
comorbidities = component_counts.unstack(fill_value=0).reset_index()
comorbidities.columns.name = None
print(comorbidities)

In [None]:
# merge() appends the right-hand table's non-key columns after the existing
# ones, so the current width is the position of the first comorbidity column.
first_new_column = df.shape[1]
df = df.merge(comorbidities, on = ['anon_id','pat_enc_csn_id_coded','order_proc_id_coded'], how = 'left')

# Cultures with no matching comorbidity row get 0 instead of NaN. Only the
# columns added by this merge are filled, so earlier columns keep their NaNs.
# NOTE(review): this replaces a hard-coded offset of 189, which breaks silently
# when an upstream table changes width. Confirm 189 was meant to be the first
# comorbidity column.
df.iloc[:, first_new_column:] = df.iloc[:, first_new_column:].fillna(0)
df.head()

In [None]:
# Remove fully identical rows from the assembled frame and report its size.
df = df.drop_duplicates()
df.shape

In [33]:
# Write the assembled feature table to CSV in the working directory, without the index.
df.to_csv(path_or_buf='Model1_with_Comorbidity_components.csv', index=False)