In [None]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patheffects
import seaborn as sns
import numpy as np
import itertools
import scipy.stats
import statsmodels.stats.multitest
import statannotations.Annotator
import math
import os
from scipy.stats import mannwhitneyu
import sys

sys.path.append('./../src/')

pd.options.display.max_columns = 200
pd.options.display.max_rows = 200
mpl.rcParams["figure.figsize"] = (10, 8)
mpl.rcParams['pdf.fonttype'] = 42  # edit-able in illustrator


import IPython.display
IPython.display.display(IPython.display.HTML("<style>.container { width:90% !important; }</style>"))


# Load data

In [None]:
cd = pd.read_csv("04_internal.csv.gz", index_col=0)


In [None]:
cd.shape

# Pull down new data/updates


In [None]:
#pull down new categories  

# Load the REDCap pneumonia-episode assessments and keep the rows with category_num == 1.
red = pd.read_csv("redcap_4339_pneumonia_episode_category_assessment.csv")
first_ep = red.loc[red.category_num == 1, :].fillna('')

# Missing values were filled with '' above, so concatenating the CAP/HAP/VAP
# fields collapses them into one string per row.
# NOTE(review): presumably at most one of the three is populated per episode - confirm.
first_ep['type'] = first_ep['clin_cap'] + first_ep['clin_hap'] + first_ep['clin_vap']
first_ep['virus'] = (
    first_ep['clin_cap_viral_npop']
    + first_ep['clin_hap_viral_npop']
    + first_ep['clin_vap_viral_npop']
)

# .copy() gives new_dem its own data, so the column assignments below (and in
# the rest of this cell) do not write into a slice of first_ep.
new_dem = first_ep[['pt_study_id', 'ir_id', 'pt_category', 'virus', 'type']].copy()
new_dem['virus'] = new_dem['virus'].astype(str)
new_dem['type'] = new_dem['type'].astype(str)

def category(row):
    """Return the cohort label for one episode row.

    Reads the string fields 'pt_category', 'virus' and 'type' from ``row``.
    Checks are ordered: non-pneumonia control first, then a SARS-Cov-2 match,
    then an Influenza match, then the two viral etiology types; anything else
    is 'Other Pneumonia'. Substring matches are case-sensitive.
    """
    if row['pt_category'].strip() == 'Non-pneumonia control':
        return 'Non-Pneumonia Control'

    viral_result = row['virus']
    if 'SARS-Cov-2' in viral_result:
        return 'COVID-19'
    if 'Influenza' in viral_result:
        return 'Influenza'

    viral_types = ('Viral/Etiology defined', 'Bacterial/viral co-infection')
    if row['type'].strip() in viral_types:
        return 'Other Viral Pneumonia'
    return 'Other Pneumonia'
# Label every first-episode row, then derive the two boolean status flags from the label.
new_dem = new_dem.assign(Patient_category=new_dem.apply(category, axis=1))
new_dem = new_dem.assign(
    COVID_status=new_dem['Patient_category'] == 'COVID-19',
    Influenza_status=new_dem['Patient_category'] == 'Influenza',
)
new_dem = new_dem[['pt_study_id', 'Patient_category', 'COVID_status', 'Influenza_status']]

# Remove any earlier versions of these columns from cd BEFORE merging, so the
# REDCap values replace them directly. This avoids the _x/_y suffix clean-up,
# keeps the cell re-runnable, and also covers a pre-existing Influenza_status.
stale_columns = ['Patient_category', 'COVID_status', 'Influenza_status']
cd = cd.drop(columns=stale_columns, errors='ignore')

# NOTE(review): the merge assumes one row per pt_study_id in new_dem; duplicates
# would multiply cd rows - confirm.
cd = pd.merge(cd, new_dem, how='left', left_on=['patient'], right_on='pt_study_id')

# Immunocompromised data/breakout

In [None]:
#pull down immunocompromised flag 

# Demographics export: keep the immunocompromise fields and rename them to
# the column names used in cd.
dem = (
    pd.read_csv("redcap_4339_demographics.csv")
    .loc[:, ['pt_study_id', 'pt_immunocomp', 'type_immunocomp', 'organ_transplant']]
    .rename(columns={'pt_study_id': 'patient'})
    .rename(columns={'pt_immunocomp': 'Immunocompromised_flag'})
)

def create_additional_columns(df):
    """Add one boolean column per immunocompromise subtype.

    Each new column is True where the free-text 'type_immunocomp' field
    contains the matching phrase (case-insensitive). Rows with a missing
    'type_immunocomp' are False rather than NaN.

    The frame is modified in place and also returned.
    """
    phrase_by_flag = {
        'solid_organ_transplant': 'Solid organ transplant',
        'stem_cell_transplant': 'Stem cell transplant',
        'acute_leukemia': 'Acute leukemia',
        'chemotherapy': 'Myelosuppressive chemotherapy',
    }
    for flag_column, phrase in phrase_by_flag.items():
        # na=False: a missing description means "not this subtype", keeping
        # the column strictly boolean.
        df[flag_column] = df['type_immunocomp'].str.contains(phrase, case=False, na=False)

    return df

# Apply the function to the 'dem' DataFrame
dem = create_additional_columns(dem)

In [None]:
dem.acute_leukemia.value_counts()

In [None]:
# Join on 'patient' only. Without an explicit key, pd.merge joins on every
# column the two frames share, so a column such as Immunocompromised_flag that
# already exists in cd would silently become part of the join key.
# Columns cd already has are kept from cd and dropped from dem.
# NOTE(review): if the REDCap values should win instead, drop them from cd - confirm.
columns_already_in_cd = [col for col in dem.columns if col != 'patient' and col in cd.columns]
cd = pd.merge(cd, dem.drop(columns=columns_already_in_cd), how='left', on='patient')

# Patients without a demographics row (or without a description) are treated as False.
immunocomp_flags = ['chemotherapy', 'acute_leukemia', 'stem_cell_transplant', 'solid_organ_transplant']
cd[immunocomp_flags] = cd[immunocomp_flags].fillna(False).astype(bool)

In [None]:
cd.drop_duplicates(subset='patient').Immunocompromised_flag.value_counts() #same as current, good

In [None]:
# A patient is flagged neutropenic when their lowest recorded Neutrophils value
# is below 1; patients with no Neutrophils values compare as False.
wbc_over_admission = (
    cd.groupby('patient')['Neutrophils']
    .min()
    .lt(1)
    .rename('neutropenic_during_admission')
    .reset_index()
)
cd = cd.merge(wbc_over_admission, how='left', on='patient')

# Per-patient medians of WBC and neutrophil counts across all rows.
median_wbc_over_admission = (
    cd.groupby('patient')
    .agg(
        median_WBC_over_admission=('WBC_count', 'median'),
        median_PMNs_over_admission=('Neutrophils', 'median'),
    )
    .reset_index()
)
cd = cd.merge(median_wbc_over_admission, how='left', on='patient')

In [None]:
cd.shape

# Serum galactomannan

In [None]:
serumgm = pd.read_excel("SCRIPT blood Aspegillus Galactomannan.xlsx")
# .copy() so the column assignments later in this cell work on an independent
# frame rather than a slice of the raw sheet.
serumgm = serumgm[['pt study id', 'specimen taken datetime', 'result txt']].copy()
def clean_galactomannan(column):
    """Strip comparison signs, spaces and Negative/Positive words from a string Series.

    Returns a new Series; what is left is meant to be parsed with
    pd.to_numeric. A leading '<' is NOT removed by this function.
    """
    # '>=' has to be removed before '>' so no stray '=' is left behind.
    for token in ('>=', '>', ' ', 'Negative', 'Positive'):
        column = column.str.replace(token, '', regex=False)
    return column

# Parse the cleaned result text to a number and bucket each specimen by calendar day.
cleaned_result = clean_galactomannan(serumgm['result txt'])
specimen_taken = pd.to_datetime(serumgm['specimen taken datetime'])
serumgm = serumgm.assign(**{
    'result txt': cleaned_result,
    'serumgm': pd.to_numeric(cleaned_result, errors='coerce'),
    'specimen taken datetime': specimen_taken,
    'day_bucket_starts': pd.to_datetime(specimen_taken.dt.date),
})

# Drop rows missing any field, keep patient / day / value, and keep the first
# result per patient-day.
serumgm = (
    serumgm.dropna()
    .drop(columns=['specimen taken datetime', 'result txt'])
    .rename(columns={'pt study id': 'patient'})
    .drop_duplicates(subset=['patient', 'day_bucket_starts'])
)
# NOTE(review): no visible cell merges serumgm into cd, yet a later cell reads
# cd['serumgm'] - confirm the column already exists in cd or add the merge.
serumgm.sample(3)

In [None]:
fungitell = pd.read_csv("SCRIPT Aspegillus Galactomannan.csv")
# Only keep Fungitell results. .copy() so the column assignments later in this
# cell work on an independent frame rather than a filtered slice.
fungitell = fungitell[fungitell.procedure_name == 'FUNGITELL(1-3)-BETA-D-GLUCAN ASSAY'].copy()

def clean_column(column):
    """Strip comparison signs, spaces and negative/positive words from a string Series.

    Used on the Fungitell result text. Returns a new Series whose remaining
    text is meant to be parsed with pd.to_numeric.
    """
    # '>=' has to be removed before '>' so no stray '=' is left behind.
    removable_tokens = ('>=', '>', ' ', '<', 'Negative', 'Positive', 'NEGATIVE', 'POSITIVE')
    for token in removable_tokens:
        column = column.str.replace(token, '', regex=False)
    return column

# Parse the cleaned result text to a number and bucket each specimen by calendar day.
cleaned_result = clean_column(fungitell['result_txt'])
numeric_result = pd.to_numeric(cleaned_result, errors='coerce')
fungitell = fungitell.assign(
    result_cleaned=cleaned_result,
    result_numeric=numeric_result,
    fungitell=numeric_result,
    day_bucket_starts=pd.to_datetime(pd.to_datetime(fungitell.specimen_taken_datetime).dt.date),
)

fungitell = (
    fungitell[['pt_study_id', 'day_bucket_starts', 'fungitell']]
    .dropna()
    .rename(columns={'pt_study_id': 'patient'})
    # One row per patient-day (first result kept, as for serum galactomannan),
    # so the left merge into cd cannot duplicate patient-day rows.
    .drop_duplicates(subset=['patient', 'day_bucket_starts'])
)

fungitell.sample(3)

In [None]:
# fungitell.day_bucket_starts is datetime64, so the key in cd must be datetime
# too. The conversion is idempotent; it is done here so this cell does not
# depend on a later cell having been run first.
cd['day_bucket_starts'] = pd.to_datetime(cd['day_bucket_starts'])
cd = pd.merge(cd, fungitell, how='left', on=['patient', 'day_bucket_starts'])
cd.shape

# Medication data

In [None]:
#read in meds results
meds = pd.read_csv('medication_administration.csv.gz', encoding='unicode_escape')
antifungal = meds[meds['catalog_type'] == 'Antifungal']

# Only yes/no per day matters: reduce to one row per (patient, administration date).
# This replaces a pivot_table call whose default aggregation (mean) was applied
# to the medication-name strings.
antifungal = (
    antifungal[['patient_ir_id', 'administration_date']]
    .dropna()
    .drop_duplicates()
    .sort_values(['patient_ir_id', 'administration_date'])
    .reset_index(drop=True)
)
antifungal['received_antifungal_thisday'] = 1


#link study ID to ir_id
patient = pd.read_csv('patient.csv.gz')
patient = patient.rename(columns={'case_number': 'patient'})
# Exact duplicate mapping rows would multiply cd rows in the merge below.
patient = patient[['patient_ir_id', 'patient']].drop_duplicates()
cd = pd.merge(cd, patient, how='left', on='patient')

#joinback: align both sides on a datetime day key
cd['day_bucket_starts'] = pd.to_datetime(cd['day_bucket_starts'])
antifungal = antifungal.assign(
    administration_date=pd.to_datetime(antifungal.administration_date)
).rename(columns={'administration_date': 'day_bucket_starts'})

cd = pd.merge(cd, antifungal, how='left', on=['patient_ir_id', 'day_bucket_starts'])

#summarize over each admission: 1 if any day had an antifungal, else missing (filled below)
received_af_during_admission = (
    cd.groupby('patient')
    .agg(received_antifungal_thisadmission=('received_antifungal_thisday', 'max'))
    .reset_index()
)
cd = pd.merge(cd, received_af_during_admission, how='left', on='patient')

# Days/admissions without a matching antifungal row count as 0.
cd['received_antifungal_thisday'] = cd['received_antifungal_thisday'].fillna(0)
cd['received_antifungal_thisadmission'] = cd['received_antifungal_thisadmission'].fillna(0)

# Running totals per patient, in the current row order of cd.
# NOTE(review): assumes rows are in chronological order within each patient - confirm.
cd['days_of_antifungal_bytoday'] = cd.groupby('patient')['received_antifungal_thisday'].cumsum()
cd['sum_steroids_bytoday'] = cd.groupby('patient')['Steroid_dose'].cumsum()

In [None]:
# Count days elapsed since each patient's earliest row in cd.

cd['day_bucket_starts'] = pd.to_datetime(cd['day_bucket_starts'])

# Earliest and latest day per patient, indexed by patient.
days_by_patient = cd.groupby(['patient'])['day_bucket_starts']
first_icu_stay_day = days_by_patient.min().to_frame('first_icu_date')
last_icu_stay_day = days_by_patient.max().to_frame('last_icu_date')

cd = (
    cd.merge(first_icu_stay_day, how='left', on='patient')
    .merge(last_icu_stay_day, how='left', on='patient')
)

# Whole days between the row's day and the patient's first day.
cd['day_after_first_icu_day'] = (cd['day_bucket_starts'] - cd['first_icu_date']).dt.days

# Fungal culture 

In [None]:
bal = pd.read_excel("SCRIPT BAL Results.xlsx")

In [None]:
bal.culture_fungal_w_smear_bal_organism_id_1.value_counts().head()

In [None]:
# BAL rows that have a value in the first fungal-culture organism field.
fungal_sent = bal[['ir_id', 'BAL_collection_date', 'culture_fungal_w_smear_bal_organism_id_1']].dropna().copy()
fungal_sent['fungal_culture_done'] = 1

# Parse and truncate to midnight BEFORE de-duplicating. The merge key in cd is
# a day bucket, so two BALs on the same day must collapse to one row and any
# time-of-day component must not prevent the match.
fungal_sent['BAL_collection_date'] = pd.to_datetime(fungal_sent['BAL_collection_date']).dt.normalize()
fungal_sent = fungal_sent.drop_duplicates(subset=['ir_id', 'BAL_collection_date'])

cd = pd.merge(
    cd,
    fungal_sent,
    how='left',
    left_on=['patient_ir_id', 'day_bucket_starts'],
    right_on=['ir_id', 'BAL_collection_date'],
)

cd.fungal_culture_done.value_counts()

# New columns of interest

In [None]:
# Running per-patient totals, in the current row order of cd.
rows_by_patient = cd.groupby('patient')
cd['days_on_ventilator'] = rows_by_patient['Intubation_flag'].cumsum().values
cd['summed_nat_score_to_today'] = rows_by_patient['NAT_score'].cumsum().values

# Every row whose NAT_score is not -2 counts as an antibiotic day; rows with a
# missing NAT_score therefore count as True as well.
cd['received_abx_thisday'] = (cd['NAT_score'] != -2).to_numpy()
cd['summed_days_of_abx_to_today'] = cd.groupby('patient')['received_abx_thisday'].cumsum().values


In [None]:
def mean_nat_score(patient_df):
    """Return the running mean NAT score for one patient's rows.

    Divides 'summed_nat_score_to_today' by the 1-based row position within
    ``patient_df``. The result keeps the index of ``patient_df``; the input
    frame is left unmodified (no helper column is added to it).
    """
    day_number = np.arange(1, len(patient_df) + 1)
    return patient_df.summed_nat_score_to_today / day_number
# Running mean = cumulative NAT score / number of rows seen so far for that patient.
# This is the same quantity mean_nat_score computes per patient, but it is
# index-aligned: assigning groupby(...).apply(...).values positionally would
# misalign silently whenever cd's rows are not already grouped by patient.
cd['mean_nat_score_to_today'] = cd['summed_nat_score_to_today'] / (cd.groupby('patient').cumcount() + 1)

In [None]:
def flag_first_row_above_05(dataframe, threshold=0.5):
    """Mark the first row whose BAL_galactomannan exceeds ``threshold``.

    Adds the column 'first_elevated_BAL_GM_05' to ``dataframe`` (in place):
    'Yes' on the first row, in row order, with BAL_galactomannan > threshold
    and 'No' everywhere else. Missing values never match. The same frame is
    returned.

    The row is chosen by position, so duplicate index labels cannot flag more
    than one row.
    """
    elevated = (dataframe['BAL_galactomannan'] > threshold).fillna(False).to_numpy(dtype=bool)

    flags = np.full(len(dataframe), 'No', dtype=object)
    elevated_positions = np.flatnonzero(elevated)
    if elevated_positions.size:
        flags[elevated_positions[0]] = 'Yes'

    dataframe['first_elevated_BAL_GM_05'] = flags
    return dataframe

# group_keys=False keeps cd's original row index. Without it, newer pandas
# versions prepend 'patient' as an index level, which makes every later
# groupby('patient') / merge on 'patient' ambiguous.
cd = cd.groupby('patient', group_keys=False).apply(flag_first_row_above_05)

In [None]:
cd[cd.first_elevated_BAL_GM_05=='Yes'].shape

In [None]:
def flag_first_row_aspergillus_growth(dataframe):
    """Mark the first row where Pathogen_aspergillus_detected is True.

    Adds the column 'first_row_aspergillus_growth' to ``dataframe`` (in
    place): 'Yes' on the first row, in row order, whose
    Pathogen_aspergillus_detected equals True and 'No' everywhere else.
    Missing values never match. The same frame is returned.

    The row is chosen by position, so duplicate index labels cannot flag more
    than one row.
    """
    grew = (dataframe['Pathogen_aspergillus_detected'] == True).fillna(False).to_numpy(dtype=bool)  # noqa: E712

    flags = np.full(len(dataframe), 'No', dtype=object)
    growth_positions = np.flatnonzero(grew)
    if growth_positions.size:
        flags[growth_positions[0]] = 'Yes'

    dataframe['first_row_aspergillus_growth'] = flags
    return dataframe

# group_keys=False keeps cd's original row index. Without it, newer pandas
# versions prepend 'patient' as an index level, which makes every later
# groupby('patient') / merge on 'patient' ambiguous.
cd = cd.groupby('patient', group_keys=False).apply(flag_first_row_aspergillus_growth)

In [None]:
cd.BAL_galactomannan.describe()

In [None]:
cd.Pathogen_aspergillus_detected.value_counts()

In [None]:
cd.BAL_performed.value_counts()

In [None]:
cd.groupby('Pathogen_aspergillus_detected')['BAL_galactomannan'].describe()

In [None]:
# Per-patient flag: the highest Pathogen_aspergillus_detected value across the
# patient's rows. Sorting descending puts that value first (missing values
# sort last), and drop_duplicates then keeps one row per patient.
grew_asp_during_admission = (
    cd.sort_values(by=['Pathogen_aspergillus_detected'], ascending=False)
    .drop_duplicates(subset=['patient'], keep='first')
    .loc[:, ['patient', 'Pathogen_aspergillus_detected']]
    .rename(columns={'Pathogen_aspergillus_detected': 'grew_asp_during_admission'})
)

# Broadcast the per-patient flag back onto every row of cd.
cd = cd.merge(grew_asp_during_admission, how='left', on='patient')

In [None]:
# Row-level 0/1 indicators for BAL galactomannan above each cut-off; rows
# without a BAL galactomannan value get 0.
for flag_name, cutoff in (
    ('gm_greaterthan_1', 1.0),
    ('gm_greaterthan_08', 0.8),
    ('gm_greaterthan_05', 0.5),
):
    cd[flag_name] = (cd['BAL_galactomannan'] > cutoff).astype(int)


In [None]:
cd['gm_greaterthan_1'].value_counts()

In [None]:
cd['gm_greaterthan_08'].value_counts()

In [None]:
cd['gm_greaterthan_05'].value_counts()

In [None]:
cd[(cd['gm_greaterthan_1']==1)&(cd['Pathogen_aspergillus_detected']==1)].shape

In [None]:
cd[(cd['gm_greaterthan_08']==1)&(cd['Pathogen_aspergillus_detected']==1)].shape

In [None]:
cd[(cd['gm_greaterthan_05']==1)&(cd['Pathogen_aspergillus_detected']==1)].shape

In [None]:
gm_grew = cd[(cd['grew_asp_during_admission']==1)][['patient', 'BAL_galactomannan','day_bucket_starts']].sort_values(by='patient', ascending=True).dropna()

In [None]:
gm_grew[gm_grew.BAL_galactomannan>1].patient.nunique()

In [None]:
gm_grew[gm_grew.BAL_galactomannan>0.8].patient.nunique()

In [None]:
gm_grew[gm_grew.BAL_galactomannan>0.5].patient.nunique()

In [None]:
# Per-patient flag: 1 if any of the patient's rows has gm_greaterthan_05 == 1.
# Sorting descending puts a 1 first, and drop_duplicates keeps one row per patient.
gm_greaterthan_05 = (
    cd.sort_values(by=['gm_greaterthan_05'], ascending=False)
    .drop_duplicates(subset=['patient'], keep='first')
    .loc[:, ['patient', 'gm_greaterthan_05']]
    .rename(columns={'gm_greaterthan_05': 'atleastonebal_gm_greaterthan_05'})
)

# Broadcast the per-patient flag back onto every row of cd.
cd = cd.merge(gm_greaterthan_05, how='left', on='patient')

In [None]:
gm_greaterthan_05.atleastonebal_gm_greaterthan_05.value_counts()

In [None]:
# Per-patient flag: 1 if any of the patient's rows has gm_greaterthan_08 == 1.
# Sorting descending puts a 1 first, and drop_duplicates keeps one row per patient.
gm_greaterthan_08 = (
    cd.sort_values(by=['gm_greaterthan_08'], ascending=False)
    .drop_duplicates(subset=['patient'], keep='first')
    .loc[:, ['patient', 'gm_greaterthan_08']]
    .rename(columns={'gm_greaterthan_08': 'atleastonebal_gm_greaterthan_08'})
)

# Broadcast the per-patient flag back onto every row of cd.
cd = cd.merge(gm_greaterthan_08, how='left', on='patient')

In [None]:
gm_greaterthan_08.atleastonebal_gm_greaterthan_08.value_counts()

In [None]:
# Per-patient flag: 1 if any of the patient's rows has gm_greaterthan_1 == 1.
# Sorting descending puts a 1 first, and drop_duplicates keeps one row per patient.
gm_greaterthan_1 = (
    cd.sort_values(by=['gm_greaterthan_1'], ascending=False)
    .drop_duplicates(subset=['patient'], keep='first')
    .loc[:, ['patient', 'gm_greaterthan_1']]
    .rename(columns={'gm_greaterthan_1': 'atleastonebal_gm_greaterthan_1'})
)

# Broadcast the per-patient flag back onto every row of cd.
cd = cd.merge(gm_greaterthan_1, how='left', on='patient')

In [None]:
gm_greaterthan_1.atleastonebal_gm_greaterthan_1.value_counts()

In [None]:
cd.groupby('grew_asp_during_admission')['serumgm'].describe()

In [None]:
cd.groupby('grew_asp_during_admission')['fungitell'].describe()

# Descriptors 

In [None]:
cd.drop_duplicates(subset='patient').COVID_status.value_counts()

In [None]:
cd.drop_duplicates(subset='patient').Immunocompromised_flag.value_counts()

In [None]:
# numbers

In [None]:
len(cd.drop_duplicates(subset='patient'))

In [None]:
#had a bal 
len(cd[cd.BAL_performed.notna()].drop_duplicates(subset='patient'))

In [None]:
#had a bal with GM sent
len(cd[cd.BAL_galactomannan.notna()].drop_duplicates(subset='patient'))

In [None]:
#grew aspergillus
len(cd[cd.grew_asp_during_admission==1].drop_duplicates(subset='patient'))

In [None]:
len(cd[cd.atleastonebal_gm_greaterthan_05==1].drop_duplicates(subset='patient'))

In [None]:
len(cd[cd.atleastonebal_gm_greaterthan_08==1].drop_duplicates(subset='patient'))

In [None]:
len(cd[cd.atleastonebal_gm_greaterthan_1==1].drop_duplicates(subset='patient'))

In [None]:
len(cd[(cd.atleastonebal_gm_greaterthan_05==1)&(cd['Pathogen_aspergillus_detected']==1)].drop_duplicates(subset='patient'))

In [None]:
len(cd[(cd.atleastonebal_gm_greaterthan_08==1)&(cd['Pathogen_aspergillus_detected']==1)].drop_duplicates(subset='patient'))

In [None]:
len(cd[(cd.atleastonebal_gm_greaterthan_1==1)&(cd['Pathogen_aspergillus_detected']==1)].drop_duplicates(subset='patient'))

In [None]:
cd['icu_stay_start_datetime']=pd.to_datetime(cd['icu_stay_start_datetime'])

In [None]:
cd.icu_stay_start_datetime.describe()

In [None]:
cd[cd.grew_asp_during_admission==1].drop_duplicates(subset='patient')['COVID_status'].value_counts()

In [None]:
cd[cd.grew_asp_during_admission==1].drop_duplicates(subset='patient')['Immunocompromised_flag'].value_counts()

In [None]:
cd[cd.grew_asp_during_admission==1].drop_duplicates(subset='patient')['type_immunocomp'].value_counts()

In [None]:
cd[cd.grew_asp_during_admission==1].drop_duplicates(subset='patient')[['patient','COVID_status','Immunocompromised_flag',]]

# BALs per patient graphs

In [None]:
cd['gm_sent'] = np.where(cd.BAL_galactomannan.notna(),1,0)

In [None]:
# Per-patient totals of BALs, GM tests and fungal cultures, plus the median BAL GM value.
bals_per_patient = (
    cd.groupby('patient')
    .agg(
        number_of_bals_sent=('BAL_performed', 'sum'),
        number_of_gms_sent=('gm_sent', 'sum'),
        number_of_fungal_cultures=('fungal_culture_done', 'sum'),
        median_BALGM_over_admission=('BAL_galactomannan', 'median'),
    )
    .reset_index()
)

In [None]:
bals_per_patient.describe()

In [None]:
cd = pd.merge(cd, bals_per_patient, how='left', on='patient')

In [None]:
cd.groupby('grew_asp_during_admission')['BAL_galactomannan'].describe()


In [None]:
# Histogram of how many BAL galactomannan tests each patient had.
fig, ax = plt.subplots(figsize=(12, 4))

data = bals_per_patient
x = 'number_of_gms_sent'

# Draw on the axes created above and keep `fig` bound to the Figure
# (histplot returns an Axes, which previously overwrote `fig`).
sns.histplot(data=data, x=x, bins=17, discrete=True, ax=ax)

ax.set_xticks(list(range(18)))
ax.set_xlabel("Number of BAL GMs sent per patient", size=16)

# Make sure the output folder exists before saving.
os.makedirs('Aspergillus', exist_ok=True)
fig.savefig('Aspergillus/1_GM_per_patient.pdf')

In [None]:
# Histogram of how many BALs each patient had.
fig, ax = plt.subplots(figsize=(12, 4))

data = bals_per_patient
x = 'number_of_bals_sent'

# Draw on the axes created above and keep `fig` bound to the Figure
# (histplot returns an Axes, which previously overwrote `fig`).
sns.histplot(data=data, x=x, bins=19, discrete=True, ax=ax)

ax.set_xticks(list(range(1, 21)))
ax.set_xlabel("Number of BALs sent per patient", size=16)

# Make sure the output folder exists before saving.
os.makedirs('Aspergillus', exist_ok=True)
fig.savefig('Aspergillus/1_BALs_per_patient.pdf')

In [None]:
# Histogram of how many fungal cultures each patient had.
fig, ax = plt.subplots(figsize=(12, 4))

data = bals_per_patient
x = 'number_of_fungal_cultures'

# Draw on the axes created above and keep `fig` bound to the Figure
# (histplot returns an Axes, which previously overwrote `fig`).
sns.histplot(data=data, x=x, bins=19, discrete=True, ax=ax)

ax.set_xticks(list(range(1, 21)))
ax.set_xlabel("Number of Fungal Cultures sent per patient", size=16)

# This figure is intentionally not saved. The disabled savefig that used to sit
# here pointed at 'Aspergillus/1_BALs_per_patient.pdf', i.e. the BALs-per-patient
# figure's file - choose a distinct filename if saving is re-enabled.

# BAL GM by growth graphs

In [None]:
cd.groupby('grew_asp_during_admission').BAL_galactomannan.describe()

In [None]:
# Box plot of BAL galactomannan, split by whether the patient ever grew Aspergillus.
fig, ax = plt.subplots(figsize=(6, 6))

data = cd
x = 'grew_asp_during_admission'
y = 'BAL_galactomannan'

# Mann-Whitney U test between every pair of x groups that has y values.
stats_results = []
for d1, d2 in itertools.combinations(data[x].unique(), 2):
    days1 = data[y][data[x] == d1].dropna()
    days2 = data[y][data[x] == d2].dropna()
    if days1.size == 0 or days2.size == 0:
        continue
    pval = scipy.stats.mannwhitneyu(days1, days2).pvalue
    stats_results.append([d1, d2, days1.size, days2.size, pval])

stats_results = pd.DataFrame(
    stats_results,
    columns=["group1", "group2", "group1_size", "group2_size", "pval"],
)
# FDR-adjust the p-values and keep only the comparisons below 0.05.
stats_results["pval_adj"] = statsmodels.stats.multitest.fdrcorrection(stats_results.pval, alpha=0.05)[1]
stat_results_sign = stats_results.loc[stats_results.pval_adj < 0.05, :]
pairs = [(r.group1, r.group2) for _, r in stat_results_sign.iterrows()]

# Draw on the axes created above rather than relying on the implicit current axes.
sns.boxplot(data=data, x=x, y=y, showfliers=True, ax=ax)

ax.set_ylabel("BAL galactomannan (ODI)", size=18)
ax.set_xlabel(" ", size=16)
ax.set_xticklabels(["Patient did not \n grow Aspergillus", "Patient grew Aspergillus \n at least once"], size=16)

# Only annotate when at least one comparison is significant.
if pairs:
    annotator = statannotations.Annotator.Annotator(
        ax,
        pairs,
        data=data,
        x=x,
        y=y,
        verbose=False
    )
    annotator._verbose = False
    annotator.configure(line_width=1)
    annotator.set_custom_annotations([f"p={p_adj:.2e}" for p_adj in stat_results_sign.pval_adj])
    annotator.annotate()

# Make sure the output folder exists before saving.
os.makedirs('Aspergillus', exist_ok=True)
fig.savefig('Aspergillus/2_BALGM_bypatient.pdf')

In [None]:
cd.groupby('Pathogen_aspergillus_detected').BAL_galactomannan.describe()

In [None]:
cd[cd.first_row_aspergillus_growth=='Yes'][['day_after_first_icu_day','days_on_ventilator','summed_days_of_abx_to_today','summed_nat_score_to_today','days_of_antifungal_bytoday','sum_steroids_bytoday']].describe().T

In [None]:
cd[cd.first_row_aspergillus_growth=='Yes'][['days_of_antifungal_bytoday',]]

In [None]:
cd[cd.first_elevated_BAL_GM_05=='Yes'][['day_after_first_icu_day','days_on_ventilator','summed_days_of_abx_to_today','summed_nat_score_to_today','days_of_antifungal_bytoday','sum_steroids_bytoday']].describe().T

In [None]:
cd[(cd.first_elevated_BAL_GM_05=='Yes') & (cd.days_of_antifungal_bytoday!=0)].shape

In [None]:
cd.sample()

In [None]:
# Box plot of BAL galactomannan, split by whether that row's BAL grew Aspergillus.
fig, ax = plt.subplots(figsize=(6, 6))

data = cd
x = 'Pathogen_aspergillus_detected'
y = 'BAL_galactomannan'

# Mann-Whitney U test between every pair of x groups that has y values.
stats_results = []
for d1, d2 in itertools.combinations(data[x].unique(), 2):
    days1 = data[y][data[x] == d1].dropna()
    days2 = data[y][data[x] == d2].dropna()
    if days1.size == 0 or days2.size == 0:
        continue
    pval = scipy.stats.mannwhitneyu(days1, days2).pvalue
    stats_results.append([d1, d2, days1.size, days2.size, pval])

stats_results = pd.DataFrame(
    stats_results,
    columns=["group1", "group2", "group1_size", "group2_size", "pval"],
)
# FDR-adjust the p-values and keep only the comparisons below 0.05.
stats_results["pval_adj"] = statsmodels.stats.multitest.fdrcorrection(stats_results.pval, alpha=0.05)[1]
stat_results_sign = stats_results.loc[stats_results.pval_adj < 0.05, :]
pairs = [(r.group1, r.group2) for _, r in stat_results_sign.iterrows()]

# Draw on the axes created above rather than relying on the implicit current axes.
sns.boxplot(data=data, x=x, y=y, showfliers=True, ax=ax)

ax.set_ylabel("BAL galactomannan (ODI)", size=18)
ax.set_xlabel(" ", size=16)
ax.set_xticklabels(["BAL did not \n grow Aspergillus", "BAL grew Aspergillus"], size=16)

# Only annotate when at least one comparison is significant.
if pairs:
    annotator = statannotations.Annotator.Annotator(
        ax,
        pairs,
        data=data,
        x=x,
        y=y,
        verbose=False
    )
    annotator._verbose = False
    annotator.configure(line_width=1)
    annotator.set_custom_annotations([f"p={p_adj:.2e}" for p_adj in stat_results_sign.pval_adj])
    annotator.annotate()

# Make sure the output folder exists before saving.
os.makedirs('Aspergillus', exist_ok=True)
fig.savefig('Aspergillus/2_BALGM_byBAL.pdf')

In [None]:
cd[cd.Age.isna()] #good, everyone has admission age 

# Categories

In [None]:
#define categories

#elevated GM but didn't grow aspergillus
cd['elevated_gm08_nogrowth_by_admission'] = np.where((cd.atleastonebal_gm_greaterthan_08==1) & (cd.grew_asp_during_admission==0),1,0)

def category_08(row):
    """Return the 0.8-threshold group label for one row.

    'elevated_gm08_nogrowth_by_admission' takes priority, then
    'grew_asp_during_admission'; every other row is 'no_elevated_gm'.
    """
    if row['elevated_gm08_nogrowth_by_admission'] == 1:
        return 'elevated_gm08_nogrowth_by_admission'
    if row['grew_asp_during_admission'] == 1:
        return 'grew_asp_during_admission'
    return 'no_elevated_gm'
cd['category_08'] = cd.apply(category_08, axis=1)

In [None]:
#define categories

#elevated GM 05 but didn't grow aspergillus
cd['elevated_gm05_nogrowth_by_admission'] = np.where((cd.atleastonebal_gm_greaterthan_05==1) & (cd.grew_asp_during_admission==0),1,0)

def category_05(row):
    """Return the 0.5-threshold group label for one row.

    'elevated_gm05_nogrowth_by_admission' takes priority, then
    'grew_asp_during_admission'; every other row is 'no_elevated_gm'.
    """
    if row['elevated_gm05_nogrowth_by_admission'] == 1:
        return 'elevated_gm05_nogrowth_by_admission'
    if row['grew_asp_during_admission'] == 1:
        return 'grew_asp_during_admission'
    return 'no_elevated_gm'
cd['category_05'] = cd.apply(category_05, axis=1)

# Tables

In [None]:
from tableone import TableOne

In [None]:
tail = cd.groupby('patient').tail(1)

In [None]:
# Fill missing values with False only in flag-like (object / bool) columns.
# A blanket fillna(False) would also turn missing numeric values into False
# (i.e. 0), which biases the summary statistics and makes the tables'
# missing-value counts meaningless.
# NOTE(review): numeric 0/1 flag columns that still contain NaN are now reported
# as missing rather than False - confirm this is the desired table behaviour.
tail = tail.copy()
flag_like_columns = tail.select_dtypes(include=['object', 'bool']).columns
tail[flag_like_columns] = tail[flag_like_columns].fillna(False)

In [None]:
# Basic demographics for the entire cohort (no grouping), computed from `tail`.
# The table is written to CSV and displayed as the cell output.
mytable = TableOne(
    tail,
    columns=[
        'Age',
        'Gender',
        'Race',
        'Ethnicity',
        'COVID_status',
        'Influenza_status',
        'Immunocompromised_flag',
        'solid_organ_transplant',
        'stem_cell_transplant',
        'acute_leukemia',
        'chemotherapy',
        'neutropenic_during_admission',
    ],
    categorical=[
        'Ethnicity',
        'Gender',
        'Race',
        'COVID_status',
        'Influenza_status',
        'Immunocompromised_flag',
        'solid_organ_transplant',
        'stem_cell_transplant',
        'acute_leukemia',
        'chemotherapy',
        'neutropenic_during_admission',
    ],
    nonnormal=['Age'],
    # Display labels for the table rows.
    rename={
        'Age': 'Age',
        'Ethnicity': 'Ethnicity',
        'Gender': 'Gender',
        'Race': 'Race',
        'COVID_status': 'Admitted with COVID-19',
        'Influenza_status': 'Admitted with Influenza',
        'Immunocompromised_flag': 'Immunocompromised',
        'solid_organ_transplant': 'Solid Organ Transplant',
        'stem_cell_transplant': 'Stem Cell Transplant',
        'acute_leukemia': 'Leukemia',
        'chemotherapy': 'Chemotherapy',
        'neutropenic_during_admission': 'Neutropenic During Admission',
    },
    missing=True,
)
mytable.to_csv("Aspergillus/cohort_demographics.csv")
mytable


In [None]:
# Patient characteristics and outcomes from `tail`, grouped by
# 'grew_asp_during_admission' with p-values and no overall column.
# The table is written to CSV and displayed as the cell output.
mytable = TableOne(
    tail,
    columns=[
        'Age',
        'COVID_status',
        'Influenza_status',
        'Immunocompromised_flag',
        'solid_organ_transplant',
        'stem_cell_transplant',
        'acute_leukemia',
        'chemotherapy',
        'neutropenic_during_admission',
        'received_tocilizumab_during_admission',
        'median_PMNs_over_admission',
        'Cumulative_ICU_days',
        'Cumulative_intubation_days',
        'Tracheostomy_flag',
        'hydrocortisone_equiv_over_admission',
        'received_antifungal_thisadmission',
        'days_of_antifungal_bytoday',
        'summed_nat_score_to_today',
        'summed_days_of_abx_to_today',
        'Discharge_disposition',
        'Binary_outcome',
    ],
    categorical=[
        'COVID_status',
        'Influenza_status',
        'Immunocompromised_flag',
        'solid_organ_transplant',
        'stem_cell_transplant',
        'acute_leukemia',
        'chemotherapy',
        'neutropenic_during_admission',
        'received_tocilizumab_during_admission',
        'Tracheostomy_flag',
        'received_antifungal_thisadmission',
        'Discharge_disposition',
        'Binary_outcome',
    ],
    nonnormal=[
        'Age',
        'Cumulative_ICU_days',
        'median_PMNs_over_admission',
        'Cumulative_intubation_days',
        'hydrocortisone_equiv_over_admission',
        'days_of_antifungal_bytoday',
        'summed_nat_score_to_today',
        'summed_days_of_abx_to_today',
    ],
    # Display labels for the table rows.
    rename={
        'COVID_status': 'Admitted with COVID-19',
        'Influenza_status': 'Admitted with Influenza',
        'Immunocompromised_flag': 'Immunocompromised',
        'solid_organ_transplant': 'Solid Organ Transplant',
        'stem_cell_transplant': 'Stem Cell Transplant',
        'acute_leukemia': 'Leukemia',
        'chemotherapy': 'Chemotherapy',
        'neutropenic_during_admission': 'Neutropenic During Admission',
        'received_tocilizumab_during_admission': 'Received Tocilizumab',
        'median_PMNs_over_admission': 'Median PMNs over Admission',
        'Cumulative_ICU_days': 'Cumulative ICU Days',
        'Cumulative_intubation_days': 'Cumulative Intubation Days',
        'Tracheostomy_flag': 'Received Tracheostomy',
        'hydrocortisone_equiv_over_admission': 'Steroid dose over ICU admission (in Hydrocortisone Equivalents)',
        'received_antifungal_thisadmission': 'Treated with Antifungals',
        'days_of_antifungal_bytoday': 'Days of Antifungal Therapy',
        'summed_nat_score_to_today': 'Summed NAT Score',
        'summed_days_of_abx_to_today': 'Days of Antibiotics Therapy',
        'Discharge_disposition': 'Discharge Disposition',
        'Binary_outcome': 'Unfavorable Outcome',
    },
    groupby='grew_asp_during_admission',
    pval=True,
    overall=False,
    missing=False,
)
mytable.to_csv("Aspergillus/grew_asp_during_admission.csv")
mytable


In [None]:
# risk_factor is True when at least one of the four listed flags is truthy in that row.
tail['risk_factor'] = tail[
    ['Immunocompromised_flag', 'neutropenic_during_admission', 'COVID_status', 'Influenza_status']
].any(axis='columns')


In [None]:
# Patient characteristics and outcomes from `tail`, grouped by 'category_05'
# with p-values and no overall column. The table is written to CSV and
# displayed as the cell output.
# NOTE(review): `rename` has an entry for 'median_PMNs_over_admission', which is
# not in `columns`, and has no entries for 'BMI', 'Admit_APS_score',
# 'Admit_SOFA_score' or 'mean_nat_score_to_today' — confirm the labels are as intended.
mytable = TableOne(
    tail,
    columns=[
        'Age',
        'COVID_status',
        'Influenza_status',
        'Immunocompromised_flag',
        'solid_organ_transplant',
        'stem_cell_transplant',
        'acute_leukemia',
        'chemotherapy',
        'neutropenic_during_admission',
        'received_tocilizumab_during_admission',
        'BMI',
        'Admit_APS_score',
        'Admit_SOFA_score',
        'Cumulative_ICU_days',
        'Cumulative_intubation_days',
        'Tracheostomy_flag',
        'hydrocortisone_equiv_over_admission',
        'received_antifungal_thisadmission',
        'days_of_antifungal_bytoday',
        'summed_nat_score_to_today',
        'mean_nat_score_to_today',
        'summed_days_of_abx_to_today',
        'Discharge_disposition',
        'Binary_outcome',
    ],
    categorical=[
        'COVID_status',
        'Influenza_status',
        'Immunocompromised_flag',
        'solid_organ_transplant',
        'stem_cell_transplant',
        'acute_leukemia',
        'chemotherapy',
        'neutropenic_during_admission',
        'received_tocilizumab_during_admission',
        'Tracheostomy_flag',
        'received_antifungal_thisadmission',
        'Discharge_disposition',
        'Binary_outcome',
    ],
    nonnormal=[
        'Age',
        'BMI',
        'Admit_APS_score',
        'Admit_SOFA_score',
        'Cumulative_ICU_days',
        'Cumulative_intubation_days',
        'hydrocortisone_equiv_over_admission',
        'days_of_antifungal_bytoday',
        'summed_nat_score_to_today',
        'mean_nat_score_to_today',
        'summed_days_of_abx_to_today',
    ],
    # Display labels for the table rows.
    rename={
        'COVID_status': 'Admitted with COVID-19',
        'Influenza_status': 'Admitted with Influenza',
        'Immunocompromised_flag': 'Immunocompromised',
        'solid_organ_transplant': 'Solid Organ Transplant',
        'stem_cell_transplant': 'Stem Cell Transplant',
        'acute_leukemia': 'Leukemia',
        'chemotherapy': 'Chemotherapy',
        'neutropenic_during_admission': 'Neutropenic During Admission',
        'received_tocilizumab_during_admission': 'Received Tocilizumab',
        'median_PMNs_over_admission': 'Median PMNs over Admission',
        'Cumulative_ICU_days': 'Cumulative ICU Days',
        'Cumulative_intubation_days': 'Cumulative Intubation Days',
        'Tracheostomy_flag': 'Received Tracheostomy',
        'hydrocortisone_equiv_over_admission': 'Steroid dose over ICU admission (in Hydrocortisone Equivalents)',
        'received_antifungal_thisadmission': 'Treated with Antifungals',
        'days_of_antifungal_bytoday': 'Days of Antifungal Therapy',
        'summed_nat_score_to_today': 'Summed NAT Score',
        'summed_days_of_abx_to_today': 'Days of Antibiotics Therapy',
        'Discharge_disposition': 'Discharge Disposition',
        'Binary_outcome': 'Unfavorable Outcome',
    },
    groupby='category_05',
    pval=True,
    overall=False,
    missing=False,
)
mytable.to_csv("Aspergillus/category_05.csv")
mytable


In [None]:
# Patient characteristics and outcomes from `tail`, grouped by 'category_08'
# with p-values and no overall column. The table is written to CSV and
# displayed as the cell output.
# NOTE(review): `rename` has an entry for 'median_PMNs_over_admission', which is
# not in `columns`, and has no entries for 'BMI', 'Admit_APS_score',
# 'Admit_SOFA_score' or 'mean_nat_score_to_today' — confirm the labels are as intended.
mytable = TableOne(
    tail,
    columns=[
        'Age',
        'COVID_status',
        'Influenza_status',
        'Immunocompromised_flag',
        'solid_organ_transplant',
        'stem_cell_transplant',
        'acute_leukemia',
        'chemotherapy',
        'neutropenic_during_admission',
        'received_tocilizumab_during_admission',
        'BMI',
        'Admit_APS_score',
        'Admit_SOFA_score',
        'Cumulative_ICU_days',
        'Cumulative_intubation_days',
        'Tracheostomy_flag',
        'hydrocortisone_equiv_over_admission',
        'received_antifungal_thisadmission',
        'days_of_antifungal_bytoday',
        'summed_nat_score_to_today',
        'mean_nat_score_to_today',
        'summed_days_of_abx_to_today',
        'Discharge_disposition',
        'Binary_outcome',
    ],
    categorical=[
        'COVID_status',
        'Influenza_status',
        'Immunocompromised_flag',
        'solid_organ_transplant',
        'stem_cell_transplant',
        'acute_leukemia',
        'chemotherapy',
        'neutropenic_during_admission',
        'received_tocilizumab_during_admission',
        'Tracheostomy_flag',
        'received_antifungal_thisadmission',
        'Discharge_disposition',
        'Binary_outcome',
    ],
    nonnormal=[
        'Age',
        'BMI',
        'Admit_APS_score',
        'Admit_SOFA_score',
        'Cumulative_ICU_days',
        'Cumulative_intubation_days',
        'hydrocortisone_equiv_over_admission',
        'days_of_antifungal_bytoday',
        'summed_nat_score_to_today',
        'mean_nat_score_to_today',
        'summed_days_of_abx_to_today',
    ],
    # Display labels for the table rows.
    rename={
        'COVID_status': 'Admitted with COVID-19',
        'Influenza_status': 'Admitted with Influenza',
        'Immunocompromised_flag': 'Immunocompromised',
        'solid_organ_transplant': 'Solid Organ Transplant',
        'stem_cell_transplant': 'Stem Cell Transplant',
        'acute_leukemia': 'Leukemia',
        'chemotherapy': 'Chemotherapy',
        'neutropenic_during_admission': 'Neutropenic During Admission',
        'received_tocilizumab_during_admission': 'Received Tocilizumab',
        'median_PMNs_over_admission': 'Median PMNs over Admission',
        'Cumulative_ICU_days': 'Cumulative ICU Days',
        'Cumulative_intubation_days': 'Cumulative Intubation Days',
        'Tracheostomy_flag': 'Received Tracheostomy',
        'hydrocortisone_equiv_over_admission': 'Steroid dose over ICU admission (in Hydrocortisone Equivalents)',
        'received_antifungal_thisadmission': 'Treated with Antifungals',
        'days_of_antifungal_bytoday': 'Days of Antifungal Therapy',
        'summed_nat_score_to_today': 'Summed NAT Score',
        'summed_days_of_abx_to_today': 'Days of Antibiotics Therapy',
        'Discharge_disposition': 'Discharge Disposition',
        'Binary_outcome': 'Unfavorable Outcome',
    },
    groupby='category_08',
    pval=True,
    overall=False,
    missing=False,
)
mytable.to_csv("Aspergillus/category_08.csv")
mytable


# Timeline graphs

In [None]:
# Timeline for a single patient: BAL galactomannan by ICU day (line plus points
# coloured by 'Pathogen_aspergillus_detected'), with a second row of points at
# y = -0.5 coloured by 'received_antifungal_thisday'.
fig, ax = plt.subplots(figsize=(8, 4))

# NOTE(review): `x` is not assigned in this cell; it is whatever the notebook-level
# variable `x` holds when the cell runs — confirm it is the intended patient id.
patient = x
data = cd[cd.patient == patient]

# Galactomannan trajectory.
sns.lineplot(data=data, x='ICU_day', y='BAL_galactomannan', ax=ax)

# Same measurements as points, coloured by the Aspergillus-detected column.
sns.scatterplot(
    data=data,
    x='ICU_day',
    y='BAL_galactomannan',
    hue='Pathogen_aspergillus_detected',
    palette=['tab:blue', 'tab:red'],
    ax=ax,
)

# Marker row below zero, coloured by the antifungal-that-day column.
sns.scatterplot(
    data=data,
    x='ICU_day',
    y=-0.5,
    hue='received_antifungal_thisday',
    palette=['tab:grey', 'tab:green'],
    ax=ax,
)

ax.set_ylabel("BAL galactomannan (ODI)", size=18)
ax.set_xlabel("ICU day ", size=16)
ax.set_yticks(list(range(9)))  # ticks at 0 through 8
ax.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left', title='Grew Aspergillus')


# plt.savefig('Aspergillus/3_ex.pdf')