In [None]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patheffects
import seaborn as sns
import numpy as np
import itertools
import scipy.stats
import statsmodels.stats.multitest
import statannotations.Annotator
import math
import os
from scipy.stats import mannwhitneyu
import sys

sys.path.append('./../src/')

pd.options.display.max_columns = 200
pd.options.display.max_rows = 200
mpl.rcParams["figure.figsize"] = (10, 8)
mpl.rcParams['pdf.fonttype'] = 42  # edit-able in illustrator


import IPython.display
IPython.display.display(IPython.display.HTML("<style>.container { width:90% !important; }</style>"))


# load data

In [None]:
cd = pd.read_csv("carpediem.csv", index_col=0)
cd.sample()

In [None]:
cd.shape

# pulldown new data/updates


In [None]:
#pull down categories  

red = pd.read_csv("redcap_4339_pneumonia_episode_category_assessment.csv")

# Keep the rows with category_num == 1 (presumably each patient's first
# pneumonia episode - TODO confirm) and blank out missing values so the text
# columns below can be concatenated with "+".
first_ep = red.loc[red.category_num == 1, :].fillna('')

# Collapse the per-type columns (CAP / HAP / VAP) into one string each.
first_ep['type'] = first_ep['clin_cap'] + first_ep['clin_hap'] + first_ep['clin_vap']
first_ep['virus'] = (
    first_ep['clin_cap_viral_npop']
    + first_ep['clin_hap_viral_npop']
    + first_ep['clin_vap_viral_npop']
)

# .copy() makes new_dem an independent frame: columns are assigned onto it here
# and in the following statements, which on a plain column slice of first_ep
# triggers pandas' chained-assignment warning and ambiguous view/copy semantics.
new_dem = first_ep[['pt_study_id', 'ir_id', 'pt_category', 'virus', 'type']].copy()
new_dem['virus'] = new_dem['virus'].astype(str)
new_dem['type'] = new_dem['type'].astype(str)

def category(row):
    """Map one first-episode row to its analysis category label.

    Reads ``row['pt_category']``, ``row['virus']`` and ``row['type']`` (all
    strings). Rules are checked in order and the first match wins:
    non-pneumonia control, then SARS-Cov-2, then Influenza, then the two
    viral ``type`` values, otherwise 'Other Pneumonia'. The virus checks are
    case-sensitive substring tests.
    """
    if row['pt_category'].strip() == 'Non-pneumonia control':
        return 'Non-Pneumonia Control'

    virus_text = row['virus']
    if 'SARS-Cov-2' in virus_text:
        return 'COVID-19'
    if 'Influenza' in virus_text:
        return 'Influenza'

    # Both of these episode types are reported as "other viral".
    if row['type'].strip() in ('Viral/Etiology defined', 'Bacterial/viral co-infection'):
        return 'Other Viral Pneumonia'
    return 'Other Pneumonia'
# Label every first-episode row with its analysis category.
new_dem['Patient_category'] = new_dem.apply(category, axis=1)

# Boolean convenience flags for the two named viral categories.
new_dem['COVID_status'] = (new_dem['Patient_category'] == 'COVID-19').to_numpy()
new_dem['Influenza_status'] = (new_dem['Patient_category'] == 'Influenza').to_numpy()

new_dem = new_dem.loc[:, ['pt_study_id', 'Patient_category', 'COVID_status', 'Influenza_status']]
cd = cd.merge(new_dem, how='left', left_on=['patient'], right_on='pt_study_id')

# cd already had Patient_category and COVID_status, so the merge suffixed both
# versions. Drop the old (_x) columns and keep the refreshed (_y) values under
# the original names.
cd = (
    cd.drop(columns=['Patient_category_x', 'COVID_status_x'])
    .rename(columns={'Patient_category_y': 'Patient_category',
                     'COVID_status_y': 'COVID_status'})
)

# new immunocomp data/breakout

In [None]:
#pull down immunocompromised flag 

# Load the REDCap demographics export and keep only the immunocompromise
# columns, renamed to the names used in cd.
dem = (
    pd.read_csv("redcap_4339_demographics.csv")
    .loc[:, ['pt_study_id', 'pt_immunocomp', 'type_immunocomp', 'organ_transplant']]
    .rename(columns={'pt_study_id': 'patient',
                     'pt_immunocomp': 'Immunocompromised_flag'})
)

def create_additional_columns(df):
    """Add boolean immunocompromise sub-type flags derived from ``type_immunocomp``.

    Each new column is True when the row's ``type_immunocomp`` text contains
    the matching phrase (case-insensitive) and False otherwise. Rows whose
    ``type_immunocomp`` is missing get False in every flag rather than NaN,
    so the columns are always plain booleans.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``type_immunocomp`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns ``solid_organ_transplant``,
        ``stem_cell_transplant``, ``acute_leukemia`` and ``chemotherapy``
        added or overwritten.
    """
    phrase_by_column = {
        'solid_organ_transplant': 'Solid organ transplant',
        'stem_cell_transplant': 'Stem cell transplant',
        'acute_leukemia': 'Acute leukemia',
        'chemotherapy': 'Myelosuppressive chemotherapy',
    }

    # Treat missing text as empty. This also keeps .str usable when the whole
    # column is NaN (float dtype), where the accessor would otherwise raise.
    immunocomp_text = df['type_immunocomp'].fillna('').astype(str)

    for column, phrase in phrase_by_column.items():
        df[column] = immunocomp_text.str.contains(phrase, case=False, regex=False)

    return df

# Apply the function to the 'dem' DataFrame
dem = create_additional_columns(dem)

In [None]:
# Left-join the demographics / immunocompromise columns onto cd.
# NOTE(review): no `on=` is given, so pandas joins on every column name the two
# frames have in common. 'patient' is shared; if cd also already carries e.g.
# 'Immunocompromised_flag', that column becomes part of the join key and rows
# whose values disagree will not match - confirm the shared columns.
cd = pd.merge(cd, dem, how='left')
# Rows without a match in dem come back as NaN in the derived flags; treat those as False.
cd[['chemotherapy','acute_leukemia','stem_cell_transplant','solid_organ_transplant']]=cd[['chemotherapy','acute_leukemia','stem_cell_transplant','solid_organ_transplant']].fillna(False)

In [None]:
# Lowest neutrophil and WBC value recorded for each patient over the admission.
admission_minima = (
    cd.groupby('patient')
    .agg({'Neutrophils': 'min', 'WBC_count': 'min'})
    .reset_index()
)
# Flag patients whose minimum neutrophil value fell below 1. Patients with no
# neutrophil value at all compare False and are therefore not flagged.
admission_minima['neutropenic_during_admission'] = (admission_minima['Neutrophils'] < 1).to_numpy()
wbc_over_admission = admission_minima[['patient', 'neutropenic_during_admission']]
cd = cd.merge(wbc_over_admission, how='left', on='patient')

# Per-patient medians over the admission, broadcast back onto every row.
median_wbc_over_admission = (
    cd.groupby('patient')
    .agg({'WBC_count': 'median', 'Neutrophils': 'median'})
    .rename(columns={'WBC_count': "median_WBC_over_admission",
                     'Neutrophils': "median_PMNs_over_admission"})
    .reset_index()
)
cd = cd.merge(median_wbc_over_admission, how='left', on='patient')

In [None]:
cd.shape

# serum galactomannan

In [None]:
serumgm = pd.read_excel("SCRIPT blood Aspegillus Galactomannan.xlsx")
serumgm = serumgm[['pt study id','specimen taken datetime','result txt']]
# Define a function to strip comparison signs, spaces and Negative/Positive text from the serum galactomannan 'result txt' column
def clean_galactomannan(column):
    """Strip non-numeric decoration from galactomannan result text.

    Removes, in order, '>=', '>', spaces, 'Negative' and 'Positive' from every
    string in ``column`` so that what remains can be parsed as a number.
    Missing entries stay missing. Returns a new Series; the input is not
    modified.
    """
    cleaned = column
    # Order matters: '>=' has to be removed before the bare '>'.
    for token in ('>=', '>', ' ', 'Negative', 'Positive', ' '):
        cleaned = cleaned.str.replace(token, '')
    return cleaned

# Clean the result text and parse it; anything still non-numeric becomes NaN.
serumgm['result txt'] = clean_galactomannan(serumgm['result txt'])
serumgm['serumgm'] = pd.to_numeric(serumgm['result txt'], errors='coerce')

# Recode errors based on Epic checks: these exact values are blanked out.
epic_flagged_values = [2.60, 1.09, 10.07]
serumgm.loc[serumgm['serumgm'].isin(epic_flagged_values), 'serumgm'] = np.nan

# Bucket every specimen by the calendar day it was taken (midnight timestamp).
specimen_taken = pd.to_datetime(serumgm['specimen taken datetime'])
serumgm['specimen taken datetime'] = specimen_taken
serumgm['day_bucket_starts'] = pd.to_datetime(specimen_taken.dt.date)

# Drop rows with any missing field, keep one result per patient-day (the
# first encountered) and leave only patient / day_bucket_starts / serumgm.
serumgm = (
    serumgm.dropna()
    .drop(columns=['specimen taken datetime', 'result txt'])
    .rename(columns={'pt study id': 'patient'})
    .drop_duplicates(subset=['patient', 'day_bucket_starts'])
)
serumgm.sample(3)

In [None]:
cd['day_bucket_starts']=pd.to_datetime(cd['day_bucket_starts'])

In [None]:
cd = pd.merge(cd, serumgm, how='left', on=['patient', 'day_bucket_starts'])
cd.shape

In [None]:
cd.serumgm.describe()

In [None]:
fungitell = pd.read_csv("SCRIPT Aspegillus Galactomannan.csv")
#only select fungitell results
fungitell = fungitell[fungitell.procedure_name=='FUNGITELL(1-3)-BETA-D-GLUCAN ASSAY']

# Define a function to strip comparison signs, spaces and Negative/Positive text from the Fungitell 'result_txt' column
def clean_column(column):
    """Strip non-numeric decoration from Fungitell result text.

    Removes, in order, '>=', '>', spaces, '<' and the words Negative/Positive
    (title-case and upper-case) from every string in ``column`` so that what
    remains can be parsed as a number. Missing entries stay missing. Returns a
    new Series; the input is not modified.
    """
    cleaned = column
    # Order matters: '>=' has to be removed before the bare '>'.
    for token in ('>=', '>', ' ', '<', 'Negative', 'Positive', 'NEGATIVE', 'POSITIVE', ' '):
        cleaned = cleaned.str.replace(token, '')
    return cleaned

# Build the tidy Fungitell table as a fresh frame. Assigning new columns onto
# the filtered slice of the raw export triggers pandas' chained-assignment
# warning and leaves view/copy semantics ambiguous.
fungitell = pd.DataFrame({
    'patient': fungitell['pt_study_id'],
    # Calendar day on which the specimen was taken (midnight timestamp).
    'day_bucket_starts': pd.to_datetime(pd.to_datetime(fungitell.specimen_taken_datetime).dt.date),
    # Cleaned result text parsed as a number; non-numeric text becomes NaN.
    'fungitell': pd.to_numeric(clean_column(fungitell['result_txt']), errors='coerce'),
})
fungitell = fungitell.dropna()

# Keep one result per patient-day (the first encountered), the same way the
# serum galactomannan table is de-duplicated. Without this, two results on one
# day would make the left merge into cd duplicate that patient-day row and
# inflate every per-patient cumulative count computed afterwards.
fungitell = fungitell.drop_duplicates(subset=['patient', 'day_bucket_starts'])

fungitell.sample(3)

In [None]:
cd = pd.merge(cd, fungitell, how='left', on=['patient', 'day_bucket_starts'])
cd.shape

In [None]:
cd.fungitell.describe()

In [None]:
cd.serumgm.describe()

# add medication data

In [None]:
#read in meds results
meds = pd.read_csv('medication_administration.csv.gz', encoding= 'unicode_escape')

# One row per patient-day on which any antifungal was administered; the drug
# name is irrelevant here (only yes/no per day is needed). Built in a single
# chain so no column is ever assigned onto a filtered slice of meds.
antifungal = (
    meds.loc[meds['catalog_type'] == 'Antifungal', ['patient_ir_id', 'administration_date']]
    .drop_duplicates()
    .assign(received_antifungal_thisday=1)
)

#link study ID to ir_id
patient = pd.read_csv('patient.csv.gz')
patient = patient.rename(columns={'case_number':'patient'})
# Drop exact duplicate (ir_id, study id) pairs so the left merge cannot
# duplicate cd rows for a patient listed more than once.
patient = patient[['patient_ir_id','patient']].drop_duplicates()
cd = pd.merge(cd, patient, how='left', on='patient')

#joinback
cd['day_bucket_starts'] = pd.to_datetime(cd['day_bucket_starts'])
# NOTE(review): the join below only matches if administration_date carries no
# time-of-day component - confirm against the export.
antifungal['administration_date'] = pd.to_datetime(antifungal['administration_date'])
antifungal = antifungal.rename(columns={'administration_date':'day_bucket_starts'})

cd = pd.merge(cd, antifungal, how='left', on=['patient_ir_id','day_bucket_starts'])

#summarize over each admission
received_af_during_admission = cd.groupby('patient').agg(received_antifungal_thisadmission=('received_antifungal_thisday', 'max')).reset_index()

cd = pd.merge(cd, received_af_during_admission, how='left', on='patient')

# Days / patients with no antifungal record are 0 rather than NaN.
cd['received_antifungal_thisday'] = cd['received_antifungal_thisday'].fillna(0)
cd['received_antifungal_thisadmission'] = cd['received_antifungal_thisadmission'].fillna(0)

# Running totals within each patient, in cd's current row order (assumes each
# patient's rows are chronological - TODO confirm). groupby().cumsum() returns
# a Series aligned to cd's index, so it is assigned directly.
cd['days_of_antifungal_bytoday'] = cd.groupby('patient')['received_antifungal_thisday'].cumsum()
cd['sum_steroids_bytoday'] = cd.groupby('patient')['Steroid_dose'].cumsum()

In [None]:
cd.shape

## only anti-asp antifungal

In [None]:
#read in meds results
meds = pd.read_csv('medication_administration.csv.gz', encoding= 'unicode_escape')
antifungal = meds[meds['catalog_type']=='Antifungal']
antifungal = antifungal[['patient_ir_id','base_medication_name','administration_date',]].drop_duplicates() #just care yes/no for each day

# Restrict to the agents named in the pattern below (case-insensitive;
# rows with a missing drug name are excluded).
is_anti_asp = antifungal['base_medication_name'].str.contains(
    'posaconazole|voriconazole|Isavuconazonium|amphotericin', case=False, na=False)

# One row per patient-day with a matching administration. Built in a single
# chain so no column is assigned onto a filtered slice of antifungal.
anti_asp = (
    antifungal.loc[is_anti_asp, ['patient_ir_id', 'administration_date']]
    .drop_duplicates()
    .assign(received_antiasp_antifungals=1)
)
anti_asp['administration_date'] = pd.to_datetime(anti_asp['administration_date'])
anti_asp = anti_asp.rename(columns={'administration_date':'day_bucket_starts'})

cd = pd.merge(cd, anti_asp, how='left', on=['patient_ir_id','day_bucket_starts'])

#summarize over each admission
received_antiasp_thisadmission = cd.groupby('patient').agg(received_antiasp_thisadmission=('received_antiasp_antifungals', 'max')).reset_index()

cd = pd.merge(cd, received_antiasp_thisadmission, how='left', on='patient')

# Days / patients with no matching record are 0 rather than NaN.
cd['received_antiasp_antifungals'] = cd['received_antiasp_antifungals'].fillna(0)
cd['received_antiasp_thisadmission'] = cd['received_antiasp_thisadmission'].fillna(0)

# Running per-patient total in cd's current row order (assumes each patient's
# rows are chronological - TODO confirm).
cd['days_of_antiasp_bytoday'] = cd.groupby('patient')['received_antiasp_antifungals'].cumsum()

cd.sample(5)

In [None]:
#count days after first ICU day

cd.day_bucket_starts = pd.to_datetime(cd.day_bucket_starts)

# Earliest and latest day bucket present in cd for each patient.
days_by_patient = cd.groupby(['patient'])['day_bucket_starts']
first_icu_stay_day = days_by_patient.min().to_frame('first_icu_date')
last_icu_stay_day = days_by_patient.max().to_frame('last_icu_date')

cd = cd.merge(first_icu_stay_day, how='left', on='patient')
cd = cd.merge(last_icu_stay_day, how='left', on='patient')

# Whole days elapsed since the patient's first day bucket (0 on that day).
cd['day_after_first_icu_day'] = (cd.day_bucket_starts - cd.first_icu_date).dt.days

# was fungal culture sent 

In [None]:
bal = pd.read_excel("SCRIPT BAL Results_1-9-24.xlsx")

In [None]:
bal.culture_fungal_w_smear_bal_organism_id_1.value_counts().head()

In [None]:
# A BAL counts as having a fungal culture when its first fungal organism field
# is populated; rows missing any of the three fields are dropped.
# NOTE(review): this presumes the organism field is filled in for negative
# cultures too - confirm against the export.
fungal_sent = (
    bal[['ir_id', 'BAL_collection_date', 'culture_fungal_w_smear_bal_organism_id_1']]
    .dropna()
    .assign(fungal_culture_done=1)
    .drop_duplicates(subset=['ir_id', 'BAL_collection_date'])
)
fungal_sent['BAL_collection_date'] = pd.to_datetime(fungal_sent['BAL_collection_date'])

cd = cd.merge(
    fungal_sent,
    how='left',
    left_on=['patient_ir_id', 'day_bucket_starts'],
    right_on=['ir_id', 'BAL_collection_date'],
)

cd.fungal_culture_done.value_counts()

# new columns of interest

In [None]:
# Running per-patient totals in cd's current row order (assumes each patient's
# rows are chronological - TODO confirm).
rows_by_patient = cd.groupby('patient')
cd['days_on_ventilator'] = rows_by_patient['Intubation_flag'].cumsum().to_numpy()
cd['summed_nat_score_to_today'] = rows_by_patient['NAT_score'].cumsum().to_numpy()

# Any NAT_score other than -2 counts as a day on antibiotics. A missing
# NAT_score also compares unequal to -2 and is therefore counted as a day on
# antibiotics.
cd['received_abx_thisday'] = (cd['NAT_score'] != -2).to_numpy()
cd['summed_days_of_abx_to_today'] = cd.groupby('patient')['received_abx_thisday'].cumsum().to_numpy()


In [None]:
def mean_nat_score(patient_df):
    """Return the running mean NAT score for one patient's rows.

    Divides ``summed_nat_score_to_today`` by the 1-based position of each row
    within ``patient_df``, i.e. the number of rows seen so far.

    Parameters
    ----------
    patient_df : pandas.DataFrame
        Rows of a single patient, containing ``summed_nat_score_to_today``.
        It is not modified: no helper column is added, because mutating the
        group frame inside ``groupby.apply`` is unsupported by pandas.

    Returns
    -------
    pandas.Series
        Running mean, indexed like ``patient_df``.
    """
    n_days = np.arange(1, len(patient_df) + 1)
    return patient_df.summed_nat_score_to_today / n_days
# Running mean = cumulative NAT score / number of this patient's rows so far.
# cumcount() is aligned to cd's own index, so each value lands on its own row.
# Taking `.values` from groupby().apply() instead returns the numbers in
# group-key order, which only matches cd when cd is already sorted by patient.
cd['mean_nat_score_to_today'] = cd['summed_nat_score_to_today'] / (cd.groupby('patient').cumcount() + 1)

In [None]:
def flag_first_row_above_05(dataframe, threshold=0.5):
    """Mark the first row whose BAL galactomannan exceeds ``threshold``.

    Adds (or overwrites) the column ``first_elevated_BAL_GM_05`` on
    ``dataframe``: 'Yes' on the first row, in the frame's current row order,
    where ``BAL_galactomannan > threshold``, and 'No' on every other row. When
    no row exceeds the threshold the whole column is 'No'. Missing values
    never count as exceeding it.

    The row is located by position, not by index label, so exactly one row is
    flagged even if index labels repeat.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``BAL_galactomannan``; modified in place.
    threshold : float, default 0.5
        Strict lower bound a value has to exceed.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object with the flag column set.
    """
    exceeds = (dataframe['BAL_galactomannan'] > threshold).fillna(False).to_numpy(dtype=bool)

    flags = np.full(len(dataframe), 'No', dtype=object)
    if exceeds.any():
        # argmax on a boolean array gives the position of the first True.
        flags[exceeds.argmax()] = 'Yes'

    dataframe['first_elevated_BAL_GM_05'] = flags
    return dataframe

# group_keys=False keeps cd's original row index. With the default, pandas >= 2
# prepends 'patient' as an index level, after which 'patient' is both an index
# level and a column and any later cd.groupby('patient') is ambiguous.
cd = cd.groupby('patient', group_keys=False).apply(flag_first_row_above_05)

In [None]:
def flag_first_row_above_1(dataframe, threshold=1):
    """Mark the first row whose BAL galactomannan exceeds ``threshold``.

    Adds (or overwrites) the column ``first_elevated_BAL_GM_1`` on
    ``dataframe``: 'Yes' on the first row, in the frame's current row order,
    where ``BAL_galactomannan > threshold``, and 'No' on every other row. When
    no row exceeds the threshold the whole column is 'No'. Missing values
    never count as exceeding it.

    The row is located by position, not by index label, so exactly one row is
    flagged even if index labels repeat.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``BAL_galactomannan``; modified in place.
    threshold : float, default 1
        Strict lower bound a value has to exceed.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object with the flag column set.
    """
    exceeds = (dataframe['BAL_galactomannan'] > threshold).fillna(False).to_numpy(dtype=bool)

    flags = np.full(len(dataframe), 'No', dtype=object)
    if exceeds.any():
        # argmax on a boolean array gives the position of the first True.
        flags[exceeds.argmax()] = 'Yes'

    dataframe['first_elevated_BAL_GM_1'] = flags
    return dataframe

# group_keys=False keeps cd's original row index. With the default, pandas >= 2
# prepends 'patient' as an index level, after which 'patient' is both an index
# level and a column and any later cd.groupby('patient') is ambiguous.
cd = cd.groupby('patient', group_keys=False).apply(flag_first_row_above_1)

In [None]:
cd[cd.first_elevated_BAL_GM_05=='Yes'].shape

In [None]:
cd[cd.first_elevated_BAL_GM_1=='Yes'].shape

In [None]:
def flag_first_row_aspergillus_growth(dataframe):
    """Mark the first row on which Aspergillus was detected.

    Adds (or overwrites) the column ``first_row_aspergillus_growth`` on
    ``dataframe``: 'Yes' on the first row, in the frame's current row order,
    where ``Pathogen_aspergillus_detected`` equals True, and 'No' on every
    other row. When no row qualifies the whole column is 'No'. Missing values
    count as not detected.

    The row is located by position, not by index label, so exactly one row is
    flagged even if index labels repeat.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``Pathogen_aspergillus_detected``; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object with the flag column set.
    """
    detected = dataframe['Pathogen_aspergillus_detected'].eq(True).fillna(False).to_numpy(dtype=bool)

    flags = np.full(len(dataframe), 'No', dtype=object)
    if detected.any():
        # argmax on a boolean array gives the position of the first True.
        flags[detected.argmax()] = 'Yes'

    dataframe['first_row_aspergillus_growth'] = flags
    return dataframe

# group_keys=False keeps cd's original row index. With the default, pandas >= 2
# prepends 'patient' as an index level, after which 'patient' is both an index
# level and a column and any later cd.groupby('patient') is ambiguous.
cd = cd.groupby('patient', group_keys=False).apply(flag_first_row_aspergillus_growth)

In [None]:
cd.BAL_galactomannan.describe()

In [None]:
cd.Pathogen_aspergillus_detected.value_counts()

In [None]:
cd.BAL_performed.value_counts()

In [None]:
cd.groupby('Pathogen_aspergillus_detected')['BAL_galactomannan'].describe()

In [None]:

#flag as long as one BAL grew aspergillus
# Sorting the detection flag in descending order puts a patient's positive
# rows first, so keeping the first row per patient gives "ever detected".
grew_asp_during_admission = (
    cd.sort_values(by=['Pathogen_aspergillus_detected'], ascending=False)
    .drop_duplicates(subset=['patient'], keep='first')
    .loc[:, ['patient', 'Pathogen_aspergillus_detected']]
    .rename(columns={'Pathogen_aspergillus_detected': 'grew_asp_during_admission'})
)

#join back to main df
cd = cd.merge(grew_asp_during_admission, how='left', on='patient')

In [None]:
# Row-level 0/1 flags: BAL galactomannan strictly above each cutoff.
# Rows with no BAL GM value get 0.
gm_cutoff_by_column = {
    'gm_greaterthan_1': 1.0,
    'gm_greaterthan_08': 0.8,
    'gm_greaterthan_05': 0.5,
}
for flag_column, cutoff in gm_cutoff_by_column.items():
    cd[flag_column] = np.where(cd['BAL_galactomannan'] > cutoff, 1, 0)


In [None]:
cd['gm_greaterthan_1'].value_counts()

In [None]:
cd['gm_greaterthan_08'].value_counts()

In [None]:
cd['gm_greaterthan_05'].value_counts()

In [None]:
cd[(cd['gm_greaterthan_1']==1)&(cd['Pathogen_aspergillus_detected']==1)].shape

In [None]:
cd[(cd['gm_greaterthan_08']==1)&(cd['Pathogen_aspergillus_detected']==1)].shape

In [None]:
cd[(cd['gm_greaterthan_05']==1)&(cd['Pathogen_aspergillus_detected']==1)].shape

In [None]:
gm_grew = cd[(cd['grew_asp_during_admission']==1)][['patient', 'BAL_galactomannan','day_bucket_starts']].sort_values(by='patient', ascending=True).dropna()

In [None]:
gm_grew[gm_grew.BAL_galactomannan>1].patient.nunique()

In [None]:
gm_grew[gm_grew.BAL_galactomannan>0.8].patient.nunique()

In [None]:
gm_grew[gm_grew.BAL_galactomannan>0.5].patient.nunique()

In [None]:
#flag as long as one BAL greater than 0.5
# Sorting the 0/1 flag in descending order puts a patient's flagged rows first,
# so keeping the first row per patient gives the per-patient maximum.
gm_greaterthan_05 = (
    cd.sort_values(by=['gm_greaterthan_05'], ascending=False)
    .drop_duplicates(subset=['patient'], keep='first')
    .loc[:, ['patient', 'gm_greaterthan_05']]
    .rename(columns={'gm_greaterthan_05': 'atleastonebal_gm_greaterthan_05'})
)

#join back to main df
cd = cd.merge(gm_greaterthan_05, how='left', on='patient')

In [None]:
gm_greaterthan_05.atleastonebal_gm_greaterthan_05.value_counts()

In [None]:
#flag as long as one BAL greater than 0.8
# Sorting the 0/1 flag in descending order puts a patient's flagged rows first,
# so keeping the first row per patient gives the per-patient maximum.
gm_greaterthan_08 = (
    cd.sort_values(by=['gm_greaterthan_08'], ascending=False)
    .drop_duplicates(subset=['patient'], keep='first')
    .loc[:, ['patient', 'gm_greaterthan_08']]
    .rename(columns={'gm_greaterthan_08': 'atleastonebal_gm_greaterthan_08'})
)

#join back to main df
cd = cd.merge(gm_greaterthan_08, how='left', on='patient')

In [None]:
gm_greaterthan_08.atleastonebal_gm_greaterthan_08.value_counts()

In [None]:
#flag as long as one BAL greater than 1
# Sorting the 0/1 flag in descending order puts a patient's flagged rows first,
# so keeping the first row per patient gives the per-patient maximum.
gm_greaterthan_1 = (
    cd.sort_values(by=['gm_greaterthan_1'], ascending=False)
    .drop_duplicates(subset=['patient'], keep='first')
    .loc[:, ['patient', 'gm_greaterthan_1']]
    .rename(columns={'gm_greaterthan_1': 'atleastonebal_gm_greaterthan_1'})
)

#join back to main df
cd = cd.merge(gm_greaterthan_1, how='left', on='patient')

In [None]:
gm_greaterthan_1.atleastonebal_gm_greaterthan_1.value_counts()

In [None]:
cd['serumgm'].describe()


In [None]:
cd.groupby('grew_asp_during_admission')['serumgm'].describe()

In [None]:
177+9

## fungitell by growth


In [None]:
cd.drop_duplicates(subset='patient').grew_asp_during_admission.value_counts()

In [None]:
cd.groupby('grew_asp_during_admission')['fungitell'].describe()

In [None]:
123+6


In [None]:
# what else did they grow at time of aspergillus?


In [None]:
import json
# Function to parse JSON and extract PCR and culture bacteria
def extract_bacteria(json_str):
    """Extract PCR and culture bacteria from one JSON-encoded pathogen result.

    Parameters
    ----------
    json_str : object
        Expected to be a JSON object string with optional ``pcr`` and
        ``culture`` sections. Any non-string value (e.g. NaN) is treated as
        "no result".

    Returns
    -------
    tuple
        ``(pcr_bacteria, culture_bacteria)``:

        * ``pcr_bacteria`` is the value stored under ``pcr -> bacteria``
          (``[]`` when the section or key is absent);
        * ``culture_bacteria`` is the list of ``name`` values found under
          ``culture -> organisms`` (``[]`` when absent; entries without a
          ``name`` key are skipped).

        ``(None, None)`` is returned when ``json_str`` is not a string, is not
        valid JSON, or does not decode to a JSON object.
    """
    if not isinstance(json_str, str):
        return None, None

    try:
        json_obj = json.loads(json_str)
    except json.JSONDecodeError:
        return None, None

    if not isinstance(json_obj, dict):
        return None, None

    # A missing or null section is treated as empty instead of raising KeyError.
    pcr_section = json_obj.get('pcr') or {}
    culture_section = json_obj.get('culture') or {}

    pcr_bacteria = pcr_section.get('bacteria', [])
    culture_bacteria = [
        organism['name']
        for organism in (culture_section.get('organisms') or [])
        if 'name' in organism
    ]
    return pcr_bacteria, culture_bacteria

# Apply the function to the 'Pathogen_results' column
cd[['pcr_bacteria', 'culture_bacteria']] = cd['Pathogen_results'].apply(lambda x: pd.Series(extract_bacteria(x)))


In [None]:
cd.loc[cd.Pathogen_aspergillus_detected==True,['patient','day_bucket_starts','pcr_bacteria', 'culture_bacteria']]

# other fungi

In [None]:
def get_fungus(pathogen_results):
    """Return the fungal organism names from one JSON-encoded pathogen result.

    Parameters
    ----------
    pathogen_results : object
        Expected to be a JSON object string with a ``fungal`` list whose
        entries carry a ``name``. Any non-string value (e.g. NaN) is treated
        as "no result".

    Returns
    -------
    list
        The non-empty ``name`` values under ``fungal``, in order. An empty
        list is returned when the input is not a string, is not valid JSON,
        does not decode to a JSON object, or has no ``fungal`` entries.
    """
    if not isinstance(pathogen_results, str):
        return []

    try:
        pathogens = json.loads(pathogen_results)
    except json.JSONDecodeError:
        # Unparseable text counts as "no result", the same way the bacterial
        # extractor handles it, instead of aborting the whole .apply().
        return []

    if not isinstance(pathogens, dict):
        return []

    return [
        pathogen['name']
        for pathogen in (pathogens.get('fungal') or [])
        if pathogen.get('name')
    ]

cd.Pathogen_results.apply(get_fungus).value_counts()

In [None]:
cd['fungal_results']=cd.Pathogen_results.apply(get_fungus)

In [None]:
cd['fungal_results'].value_counts()

In [None]:
cd['fungal_results'].explode().value_counts().to_csv("fungal_culture_results10-14-24.csv")

In [None]:
cd['fungal_results'].explode().value_counts()

In [None]:
cd['fungal_results_str'] = cd['fungal_results'].astype(str)

In [None]:
cd.loc[cd.fungal_results_str.str.contains('Blastomyces', na=False)]

In [None]:
# Patients with at least one Blastomyces result. Defined here so the cell runs
# on a fresh kernel; the name was not assigned anywhere earlier in the notebook.
blasto_pt = cd.loc[cd.fungal_results_str.str.contains('Blastomyces', na=False), 'patient'].unique()
cd.loc[cd.patient.isin(blasto_pt), 'BAL_galactomannan'].dropna()

In [None]:
cd.loc[cd.patient.isin(blasto_pt), 'fungitell'].dropna()

In [None]:
cd.fungitell.describe()

In [None]:
cd[['serumgm','BAL_galactomannan']].corr(method='spearman')

In [None]:
from scipy.stats import spearmanr

# Only rows where serum and BAL galactomannan were both measured.
stat_calc = cd.loc[:, ['serumgm', 'BAL_galactomannan']].dropna()

# Spearman rank correlation and its p-value
correlation, p_value = spearmanr(stat_calc.serumgm, stat_calc.BAL_galactomannan)

# Report both numbers
for label, value in (("Spearman correlation", correlation), ("P-value", p_value)):
    print(f"{label}: {value}")

In [None]:
# figure size
fig, ax = plt.subplots(figsize=(5, 5))

data = cd
x = 'serumgm'
y = 'BAL_galactomannan'

# Only rows where both values were measured (i.e. on the same day).
data2 = data[[x, y]].dropna()

# calculate the correlation value
corr, p = scipy.stats.spearmanr(data2[x], data2[y])

# create the regplot on the axes created above
sns.regplot(data=data2, x=x, y=y, scatter_kws={'s': 3}, ax=ax)

# add the correlation value to the plot; the p-value uses scientific notation
# like the other correlation figures, because two fixed decimals would print
# any small p-value as "p=0.00"
ax.text(0.5, .95, f'Spearman Correlation: {corr:.2f}, p={p:.2e}\n ',
        ha='center', va='center', transform=ax.transAxes)

ax.set_title("BAL Galactomannan vs. Serum Galactomannan correlation \n when checked on same day")
ax.set_xlabel("serumgm")
ax.set_ylabel("BAL_galactomannan");

In [None]:
# Per-patient median of each galactomannan measure over the whole admission.
rows_per_patient = cd.groupby('patient')
serumgm_overadmission = rows_per_patient.agg({'serumgm': 'median'}).reset_index()
BAL_galactomannan_overadmission = rows_per_patient.agg({'BAL_galactomannan': 'median'}).reset_index()

# One row per patient carrying both medians.
serum_BAL_GM_overadmission = serumgm_overadmission.merge(BAL_galactomannan_overadmission, on='patient')

In [None]:

fig, ax = plt.subplots(figsize=(5, 5))

data = serum_BAL_GM_overadmission
x = 'serumgm'
y = 'BAL_galactomannan'

# Only patients that have both a serum and a BAL value.
data2 = data[[x, y]].dropna()

# calculate the correlation value
corr, p = scipy.stats.spearmanr(data2[x], data2[y])

# create the regplot on the axes created above
sns.regplot(data=data2, x=x, y=y, scatter_kws={'s': 5}, ax=ax)

# add the correlation value to the plot; the second number is labelled "p="
# so the annotation can be read on its own
ax.text(0.5, .95, f'Spearman Correlation: {corr:.2f}, p={p:.2e} \n ',
        ha='center', va='center', transform=ax.transAxes)

ax.set_title("BAL Galactomannan vs. Serum Galactomannan \n correlation when aggregated over admission")
ax.set_xlabel("serumgm_overadmission")
ax.set_ylabel("BAL_galactomannan_overadmission");

In [None]:
gm_columns = ['serumgm', 'BAL_galactomannan']

# Carry each measurement up to 2 rows forward and then up to 2 rows backward,
# so a serum and a BAL value taken within a few rows of each other end up on
# the same row. This is a 5-day window only if every patient has one row per
# consecutive day - TODO confirm.
# The fill is done within each patient: filling over the whole frame would
# copy the last values of one patient into the first rows of the next patient.
gm_filled = cd.groupby('patient')[gm_columns].ffill(limit=2)
gm_filled = gm_filled.groupby(cd['patient']).bfill(limit=2)

# Keep rows where both values are available, then collapse the repeated
# (serum, BAL) pairs that the filling creates on neighbouring rows.
serum_bal_ffill = (
    pd.concat([cd[['day_bucket_starts']], gm_filled], axis=1)
    .dropna()
    .drop_duplicates(subset=gm_columns)
)

In [None]:
# figure size
fig, ax = plt.subplots(figsize=(5, 5))

data = serum_bal_ffill
x = 'serumgm'
y = 'BAL_galactomannan'

# Unique (serum, BAL) pairs with both values present.
data2 = data.loc[:, [x, y]].dropna().drop_duplicates()

# Spearman rank correlation between the two measures
corr, p = scipy.stats.spearmanr(data2[x], data2[y])

# scatter plus fitted regression line
sns.regplot(data=data2, x=x, y=y, scatter_kws={'s': 3}, ax=ax)

# annotate the plot with the correlation and its p-value
ax.text(0.5, .95, f'Spearman Correlation: {corr:.2f}, p={p:.2e}\n ',
        ha='center', va='center', transform=ax.transAxes)

ax.set_title("BAL Galactomannan vs. Serum Galactomannan correlation \n when checked within 5 days of each other")
ax.set_xlabel("Serum GM")
ax.set_ylabel("BAL galactomannan")

## serumgm over 0.5

In [None]:
cd.serumgm.describe()

In [None]:
serumgm_over05_overadmission = cd.loc[cd.serumgm>0.5]

# descriptors 

In [None]:
cd.drop_duplicates(subset='patient').COVID_status.value_counts()

In [None]:
cd.drop_duplicates(subset='patient').Immunocompromised_flag.value_counts()

In [None]:
cd.loc[(cd.BAL_performed) & (cd.COVID_status)].Pathogen_aspergillus_detected.value_counts()

In [None]:
14/(14+683)

In [None]:
# numbers

In [None]:
len(cd.drop_duplicates(subset='patient'))

In [None]:
#had a bal
# BAL_performed is a per-day True/False flag (it is used directly as a boolean
# mask and summed elsewhere in this notebook), so count patients with at least
# one True day. `.notna()` would also count days on which the flag is False.
len(cd[cd.BAL_performed == True].drop_duplicates(subset='patient'))

In [None]:
#had a bal with GM sent
len(cd[cd.BAL_galactomannan.notna()].drop_duplicates(subset='patient'))

In [None]:
#grew aspergillus
len(cd[cd.grew_asp_during_admission==1].drop_duplicates(subset='patient'))

In [None]:
len(cd[cd.atleastonebal_gm_greaterthan_05==1].drop_duplicates(subset='patient'))

In [None]:
len(cd[cd.atleastonebal_gm_greaterthan_08==1].drop_duplicates(subset='patient'))

In [None]:
len(cd[cd.atleastonebal_gm_greaterthan_1==1].drop_duplicates(subset='patient'))

In [None]:
len(cd[(cd.atleastonebal_gm_greaterthan_05==1)&(cd['Pathogen_aspergillus_detected']==1)].drop_duplicates(subset='patient'))

In [None]:
len(cd[(cd.atleastonebal_gm_greaterthan_08==1)&(cd['Pathogen_aspergillus_detected']==1)].drop_duplicates(subset='patient'))

In [None]:
len(cd[(cd.atleastonebal_gm_greaterthan_1==1)&(cd['Pathogen_aspergillus_detected']==1)].drop_duplicates(subset='patient'))

In [None]:
cd['icu_stay_start_datetime']=pd.to_datetime(cd['icu_stay_start_datetime'])

In [None]:
cd.icu_stay_start_datetime.describe()

In [None]:
cd[cd.grew_asp_during_admission==1].drop_duplicates(subset='patient')['COVID_status'].value_counts()

In [None]:
cd[cd.grew_asp_during_admission==1].drop_duplicates(subset='patient')['Immunocompromised_flag'].value_counts()

In [None]:
cd[cd.grew_asp_during_admission==1].drop_duplicates(subset='patient')['type_immunocomp'].value_counts()

In [None]:
cd[cd.grew_asp_during_admission==1].drop_duplicates(subset='patient')[['patient','COVID_status','Immunocompromised_flag',]]

In [None]:
cd[cd.grew_asp_during_admission==1].drop_duplicates(subset='patient')[['patient','COVID_status','type_immunocomp',]]

In [None]:
cd.loc[(cd.grew_asp_during_admission==True) & (cd.received_antiasp_thisadmission==0)].drop_duplicates(subset='patient')

# BALs per patient graphs

In [None]:
cd['gm_sent'] = np.where(cd.BAL_galactomannan.notna(),1,0)

In [None]:
# Per-patient counts of BALs, BAL GM results and fungal cultures, plus the
# median BAL GM over the admission.
bals_per_patient = (
    cd.groupby('patient')
    .agg(
        number_of_bals_sent=('BAL_performed', 'sum'),
        number_of_gms_sent=('gm_sent', 'sum'),
        number_of_fungal_cultures=('fungal_culture_done', 'sum'),
        median_BALGM_over_admission=('BAL_galactomannan', 'median'),
    )
    .reset_index()
)

In [None]:
bals_per_patient.describe()

In [None]:
cd = pd.merge(cd, bals_per_patient, how='left', on='patient')

In [None]:
cd.groupby('grew_asp_during_admission')['BAL_galactomannan'].describe()


In [None]:
fig, ax = plt.subplots(figsize=(12,4))

data = bals_per_patient
x = 'number_of_gms_sent'

# Draw on the axes created above. The return value is not stored, so `fig`
# stays bound to the Figure (histplot returns an Axes).
sns.histplot(data=data, x=x, bins=17, discrete=True, ax=ax)

# One tick per possible count, 0 through 17.
ax.set_xticks(list(range(0, 18)))
ax.set_xlabel("Number of BAL GMs sent per patient", size=16)

fig.savefig('Aspergillus/1_GM_per_patient.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(12,4))

data = bals_per_patient
x = 'number_of_bals_sent'

# Draw on the axes created above. The return value is not stored, so `fig`
# stays bound to the Figure (histplot returns an Axes).
sns.histplot(data=data, x=x, bins=19, discrete=True, ax=ax)

# One tick per count, 1 through 20.
ax.set_xticks(list(range(1, 21)))
ax.set_xlabel("Number of BALs sent per patient", size=16)

fig.savefig('Aspergillus/1_BALs_per_patient.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(12,4))

data = bals_per_patient
x = 'number_of_fungal_cultures'

# Draw on the axes created above. The return value is not stored, so `fig`
# stays bound to the Figure (histplot returns an Axes).
sns.histplot(data=data, x=x, bins=19, discrete=True, ax=ax)

# One tick per possible count, 0 through 20.
ax.set_xticks(list(range(0, 21)))
ax.set_xlabel("Number of Fungal Cultures sent per patient", size=16)

fig.savefig('Aspergillus/1_fungals_per_patient.pdf')

# BAL GM by growth graphs

In [None]:
cd.groupby('grew_asp_during_admission').BAL_galactomannan.describe()

In [None]:
cd.BAL_galactomannan.describe()


In [None]:
fig, ax = plt.subplots(figsize=(6,6))

data = cd
x = 'grew_asp_during_admission'
y = 'BAL_galactomannan'

# Plot the groups in ascending order (False before True) so the hard-coded
# tick labels below always sit under the right box. Without an explicit order
# seaborn may order the groups by first appearance in the data.
order = sorted(data[x].dropna().unique())

# Mann-Whitney U test for every pair of groups that both have BAL GM values.
stats_results = []
for d1, d2 in itertools.combinations(order, 2):
    days1 = data[y][data[x]==d1].dropna()
    days2 = data[y][data[x]==d2].dropna()
    if days1.size == 0 or days2.size == 0:
        continue
    pval = scipy.stats.mannwhitneyu(days1, days2).pvalue
    stats_results.append([d1, d2, days1.size, days2.size, pval])

stats_results = pd.DataFrame(stats_results, columns=["group1", "group2",
                                                   "group1_size", "group2_size", "pval"])
# Benjamini-Hochberg adjusted p-values; only pairs significant at 0.05 are annotated.
stats_results["pval_adj"] = statsmodels.stats.multitest.fdrcorrection(stats_results.pval, alpha=0.05)[1]
stat_results_sign = stats_results.loc[stats_results.pval_adj < 0.05, :]
pairs = [(r.group1, r.group2) for _, r in stat_results_sign.iterrows()]

sns.boxplot(data=data, x=x, y=y, order=order, showfliers=True, ax=ax)

ax.set_ylabel("BAL galactomannan (ODI)", size=18)
ax.set_xlabel(" ", size=16)
ax.set_xticklabels(["Patient did not \n grow Aspergillus", "Patient grew Aspergillus \n at least once"], size=16)

# Only annotate when there is at least one significant pair to draw.
if pairs:
    annotator = statannotations.Annotator.Annotator(
        ax,
        pairs,
        data=data,
        x=x,
        y=y,
        order=order,
        verbose=False
    )
    annotator._verbose = False
    annotator.configure(line_width=1)
    annotator.set_custom_annotations([f"p={p_adj:.2e}" for p_adj in stat_results_sign.pval_adj])
    annotator.annotate();

plt.savefig('Aspergillus/2_BALGM_bypatient.pdf')

In [None]:
cd.groupby('Pathogen_aspergillus_detected').BAL_galactomannan.describe()

In [None]:
cd[cd.first_row_aspergillus_growth=='Yes'][['day_after_first_icu_day','days_on_ventilator','summed_days_of_abx_to_today','summed_nat_score_to_today','days_of_antifungal_bytoday','sum_steroids_bytoday']].describe().T

In [None]:
cd[cd.first_row_aspergillus_growth=='Yes'][['days_of_antifungal_bytoday',]]

In [None]:
cd[cd.first_elevated_BAL_GM_05=='Yes'][['day_after_first_icu_day','days_on_ventilator','summed_days_of_abx_to_today','summed_nat_score_to_today','days_of_antifungal_bytoday','sum_steroids_bytoday']].describe().T

In [None]:
cd[(cd.first_elevated_BAL_GM_05=='Yes') & (cd.days_of_antifungal_bytoday!=0)].shape

In [None]:
fig, ax = plt.subplots(figsize=(6,6))

data = cd
x = 'Pathogen_aspergillus_detected'
y = 'BAL_galactomannan'

# Plot the groups in ascending order (False before True) so the hard-coded
# tick labels below always sit under the right box. Without an explicit order
# seaborn may order the groups by first appearance in the data.
order = sorted(data[x].dropna().unique())

# Mann-Whitney U test for every pair of groups that both have BAL GM values.
stats_results = []
for d1, d2 in itertools.combinations(order, 2):
    days1 = data[y][data[x]==d1].dropna()
    days2 = data[y][data[x]==d2].dropna()
    if days1.size == 0 or days2.size == 0:
        continue
    pval = scipy.stats.mannwhitneyu(days1, days2).pvalue
    stats_results.append([d1, d2, days1.size, days2.size, pval])

stats_results = pd.DataFrame(stats_results, columns=["group1", "group2",
                                                   "group1_size", "group2_size", "pval"])
# Benjamini-Hochberg adjusted p-values; only pairs significant at 0.05 are annotated.
stats_results["pval_adj"] = statsmodels.stats.multitest.fdrcorrection(stats_results.pval, alpha=0.05)[1]
stat_results_sign = stats_results.loc[stats_results.pval_adj < 0.05, :]
pairs = [(r.group1, r.group2) for _, r in stat_results_sign.iterrows()]

sns.boxplot(data=data, x=x, y=y, order=order, showfliers=True, ax=ax)

ax.set_ylabel("BAL galactomannan (ODI)", size=18)
ax.set_xlabel(" ", size=16)
ax.set_xticklabels(["BAL did not \n grow Aspergillus", "BAL grew Aspergillus"], size=16)

# Only annotate when there is at least one significant pair to draw.
if pairs:
    annotator = statannotations.Annotator.Annotator(
        ax,
        pairs,
        data=data,
        x=x,
        y=y,
        order=order,
        verbose=False
    )
    annotator._verbose = False
    annotator.configure(line_width=1)
    annotator.set_custom_annotations([f"p={p_adj:.2e}" for p_adj in stat_results_sign.pval_adj])
    annotator.annotate();

plt.savefig('Aspergillus/2_BALGM_byBAL.pdf')

In [None]:
cd[cd.Age.isna()] #good, everyone has admission age 

In [None]:
cd.shape

# categories

In [None]:
#define categories

#elevated GM but didn't grow aspergillus
cd['elevated_gm08_nogrowth_by_admission'] = np.where((cd.atleastonebal_gm_greaterthan_08==1) & (cd.grew_asp_during_admission==0),1,0)

def category_08(row):
    """Return the GM-0.8 category label for one row.

    Checks, in order, the 'elevated_gm08_nogrowth_by_admission' flag and then
    the 'grew_asp_during_admission' flag; the first one equal to 1 decides the
    label. Rows matching neither are labelled 'no_elevated_gm'.
    """
    if row['elevated_gm08_nogrowth_by_admission'] == 1:
        return 'elevated_gm08_nogrowth_by_admission'
    if row['grew_asp_during_admission'] == 1:
        return 'grew_asp_during_admission'
    return 'no_elevated_gm'
cd['category_08'] = cd.apply(category_08, axis=1)

In [None]:
#define categories

#elevated GM 05 but didn't grow aspergillus
cd['elevated_gm05_nogrowth_by_admission'] = np.where((cd.atleastonebal_gm_greaterthan_05==1) & (cd.grew_asp_during_admission==0),1,0)

def category_05(row):
    """Return the GM-0.5 category label for one row.

    Checks, in order, the 'elevated_gm05_nogrowth_by_admission' flag and then
    the 'grew_asp_during_admission' flag; the first one equal to 1 decides the
    label. Rows matching neither are labelled 'no_elevated_gm'.
    """
    if row['elevated_gm05_nogrowth_by_admission'] == 1:
        return 'elevated_gm05_nogrowth_by_admission'
    if row['grew_asp_during_admission'] == 1:
        return 'grew_asp_during_admission'
    return 'no_elevated_gm'
cd['category_05'] = cd.apply(category_05, axis=1)

In [None]:
#define categories
cd['elevated_gm1_nogrowth_by_admission'] = np.where((cd.atleastonebal_gm_greaterthan_1==1) & (cd.grew_asp_during_admission==0),1,0)

def category_1(row):
    """Return the GM-1.0 category label for one row.

    Checks, in order, the 'elevated_gm1_nogrowth_by_admission' flag and then
    the 'grew_asp_during_admission' flag; the first one equal to 1 decides the
    label. Rows matching neither are labelled 'no_elevated_gm'.
    """
    if row['elevated_gm1_nogrowth_by_admission'] == 1:
        return 'elevated_gm1_nogrowth_by_admission'
    if row['grew_asp_during_admission'] == 1:
        return 'grew_asp_during_admission'
    return 'no_elevated_gm'
cd['category_1'] = cd.apply(category_1, axis=1)

# bronch vs nonbronch 

In [None]:
redcap_bals = pd.read_csv("redcap_4339_bal_sample.csv")

In [None]:
redcap_bals = redcap_bals[['pt_study_id','bal_dt','bal_barcode','bal_method']]
redcap_bals

In [None]:
redcap_bals.drop_duplicates(subset=['pt_study_id','bal_dt']).shape

In [None]:
redcap_bals.bal_method.value_counts()

In [None]:
balsonly = cd.loc[cd.BAL_performed, ['patient','patient_ir_id','day_bucket_starts','Pathogen_aspergillus_detected','BAL_galactomannan']]

In [None]:
balsonly['day_bucket_starts']=pd.to_datetime(balsonly['day_bucket_starts'])
redcap_bals['bal_dt']=pd.to_datetime(redcap_bals['bal_dt'])

In [None]:
balsonly.shape

In [None]:
merge = pd.merge(balsonly,
                 redcap_bals,
                 how='left',
                 left_on=['patient', 'day_bucket_starts'],
                 right_on=['pt_study_id','bal_dt'])

In [None]:
merge.shape

In [None]:
merge.bal_method.value_counts(dropna=False)

In [None]:
# Load the older BAL report (first two lines of the file are skipped), keep only
# the join keys and the procedure name, and parse the collection date.
old_bal_report = pd.read_csv("SCRIPT BAL Results.csv", skiprows=2)
old_bal_report = old_bal_report[['ir_id', 'BAL_collection_date', 'procedure_name']]
old_bal_report['BAL_collection_date'] = pd.to_datetime(old_bal_report['BAL_collection_date'])

# Attach the procedure name to each BAL day, then keep a single row per
# (patient, day) in case the report matched more than once.
merge = (
    balsonly
    .merge(
        old_bal_report,
        how='left',
        left_on=['patient_ir_id', 'day_bucket_starts'],
        right_on=['ir_id', 'BAL_collection_date'],
    )
    .drop_duplicates(subset=['patient', 'day_bucket_starts'])
)
merge['procedure_name'].value_counts(dropna=False)


In [None]:
# Add the REDCap BAL sample fields to each BAL day (left join on patient and date).
merge2 = merge.merge(
    redcap_bals,
    how='left',
    left_on=['patient', 'day_bucket_starts'],
    right_on=['pt_study_id', 'bal_dt'],
)

In [None]:
merge2.shape

In [None]:
# Flag a row as non-bronchoscopic when either source says so: the REDCap
# 'bal_method' text or the older report's 'procedure_name' text.
# Non-matching rows keep the empty-string marker used elsewhere in this notebook.
merge2['nbbal'] = (
    merge2['bal_method'].astype(str).str.contains('Nonbronchoscopic', regex=False)
    | merge2['procedure_name'].astype(str).str.contains('NON-BRONCHOSCOPY', regex=False)
).map({True: True, False: ''})
merge2['nbbal'].value_counts()

In [None]:
# Flag a row as bronchoscopic when either source says so: the REDCap
# 'bal_method' text or the older report's 'procedure_name' text.
# Non-matching rows keep the empty-string marker used elsewhere in this notebook.
merge2['bbal'] = (
    merge2['bal_method'].astype(str).str.contains('Bronchoscopic', regex=False)
    | merge2['procedure_name'].astype(str).str.contains('BRONCHALVEOLAR LAVAGE', regex=False)
).map({True: True, False: ''})
merge2['bbal'].value_counts()

In [None]:
merge2.loc[(merge2.nbbal==True) & (merge2.bbal==True)].shape

In [None]:
merge2.shape

In [None]:
# Collapse the two flags into one label. Conditions are evaluated in order, so
# rows flagged both ways are 'conflicting', then 'bbal', then 'nbbal';
# rows with neither flag fall through to 'undocumented'.
merge2['bal_method'] = np.select(
    [
        (merge2['nbbal'] == True) & (merge2['bbal'] == True),
        merge2['bbal'] == True,
        merge2['nbbal'] == True,
    ],
    ['conflicting', 'bbal', 'nbbal'],
    default='undocumented',
)
merge2['bal_method'] = merge2['bal_method'].astype('category')

merge2.bal_method.value_counts()


In [None]:
merge2.groupby('bal_method')['Pathogen_aspergillus_detected'].value_counts()

In [None]:
merge2.groupby('bal_method')['BAL_galactomannan'].describe()

In [None]:
merge2.loc[merge2.nbbal==True].BAL_galactomannan.describe()

# Tables

In [None]:
from tableone import TableOne

In [None]:
# Per-patient value: the maximum of ECMO_flag over all of the patient's rows.
ecmo_during_admission = (
    cd.groupby('patient')['ECMO_flag']
    .max()
    .rename('ecmo_during_admission')
    .reset_index()
)
# Drop any copy of the column left by a previous run of this cell first; otherwise
# re-running would produce ecmo_during_admission_x / _y instead of one column.
cd = pd.merge(
    cd.drop(columns='ecmo_during_admission', errors='ignore'),
    ecmo_during_admission,
    how='left',
    on='patient',
)

In [None]:
# Per-patient value: the maximum of CRRT_flag over all of the patient's rows.
crrt_during_admission = (
    cd.groupby('patient')['CRRT_flag']
    .max()
    .rename('crrt_during_admission')
    .reset_index()
)
# Drop any copy of the column left by a previous run of this cell first; otherwise
# re-running would produce crrt_during_admission_x / _y instead of one column.
cd = pd.merge(
    cd.drop(columns='crrt_during_admission', errors='ignore'),
    crrt_during_admission,
    how='left',
    on='patient',
)

In [None]:
# Per-patient value: the maximum of Norepinephrine_flag over all of the patient's rows.
levo_during_admission = (
    cd.groupby('patient')['Norepinephrine_flag']
    .max()
    .rename('levo_during_admission')
    .reset_index()
)
# Drop any copy of the column left by a previous run of this cell first; otherwise
# re-running would produce levo_during_admission_x / _y instead of one column.
cd = pd.merge(
    cd.drop(columns='levo_during_admission', errors='ignore'),
    levo_during_admission,
    how='left',
    on='patient',
)

In [None]:
# Last row of each patient. Copy explicitly: later cells modify `tail` in place
# with .loc, which on a slice of `cd` triggers SettingWithCopyWarning and is not
# guaranteed to behave consistently.
tail = cd.groupby('patient').tail(1).copy()

In [None]:
# Hide rare race categories: two are folded into 'Unknown or Not Reported'
# and 'Asian Indian' is folded into 'Asian'. Other values are left untouched.
tail['Race'] = tail['Race'].replace({
    'American Indian or Alaska Native': 'Unknown or Not Reported',
    'Native Hawaiian or Other Pacific Islander': 'Unknown or Not Reported',
    'Asian Indian': 'Asian',
})

In [None]:
# Hide old ages: every age above 89 is replaced by the single value 91.
tail['Age'] = tail['Age'].mask(tail['Age'] > 89, 91)

In [None]:
# NOTE(review): this replaces every NaN in every column of `tail` with False, not
# only in the flag columns. Any missing numeric value (e.g. in 'Age' or the score
# columns, if they have gaps) becomes False and is counted as 0 by the summary
# tables built from `tail`, and those tables can no longer report it as missing.
# Confirm this is intended, or restrict the fill to the boolean flag columns.
tail=tail.fillna(False)

## hospital vs ICU mortality

In [None]:
df = tail.copy()

# Parse both columns as datetimes; anything unparseable or missing becomes NaT.
df['Death_date'] = pd.to_datetime(df['Death_date'], errors='coerce')
df['icu_stay_stop_datetime'] = pd.to_datetime(df['icu_stay_stop_datetime'], errors='coerce')

# Flag deaths on or before the day the ICU stay ended, comparing at calendar-day
# resolution (time of day is dropped by normalize()). This is done column-wise
# while the values are still datetimes: comparisons involving NaT are simply
# False, so a missing death date or missing ICU stop date yields 0, whereas a
# row-wise comparison of a datetime.date with NaT can raise in recent pandas.
df['died_within_icu_stay'] = (
    df['Death_date'].notna()
    & (df['Death_date'].dt.normalize() <= df['icu_stay_stop_datetime'].dt.normalize())
).astype(int)

# Collapse datetime to date for both columns (kept so the displayed values
# show dates only, as before).
df['Death_date'] = df['Death_date'].dt.date
df['icu_stay_stop_datetime'] = df['icu_stay_stop_datetime'].dt.date



In [None]:

# Among rows whose discharge disposition is 'Died', count how many are flagged as
# having died within the ICU stay. Rows with a missing value in any of the
# selected columns are dropped before counting.
(
    df.loc[
        df.Discharge_disposition == 'Died',
        ['patient_ir_id', 'Death_date', 'icu_stay_stop_datetime', 'died_within_icu_stay', 'Discharge_disposition'],
    ]
    .dropna()
    ['died_within_icu_stay']
    .value_counts()
)

In [None]:
# NOTE(review): hard-coded counts, presumably copied from the value_counts output
# of the previous cell — they will go stale if the data changes. Also 253 + 25 = 278,
# which differs from the denominator of 287 used in the next two cells; confirm.
253+25

In [None]:
# NOTE(review): hard-coded numbers; presumably the fraction of deaths that occurred
# within the ICU stay. The denominator 287 is not derived anywhere in the visible
# cells — confirm its source and consider computing it from `df`.
253/287

In [None]:
# NOTE(review): hard-coded numbers; presumably the fraction of deaths that occurred
# after the ICU stay. The denominator 287 is not derived anywhere in the visible
# cells — confirm its source and consider computing it from `df`.
25/287

## basic cohort demographics

In [None]:
# Baseline demographics for the entire cohort, built from `tail`
# (the last row of each patient). No grouping; missing counts are shown.
mytable = TableOne(
    tail,
    columns=[
        'Age', 'Gender', 'Race', 'Ethnicity',
        'Admit_SOFA_score', 'Admit_APS_score',
        'COVID_status', 'Influenza_status',
        'Immunocompromised_flag',
        'solid_organ_transplant', 'stem_cell_transplant',
        'acute_leukemia', 'chemotherapy',
        'neutropenic_during_admission',
        'levo_during_admission', 'crrt_during_admission', 'ecmo_during_admission',
    ],
    categorical=[
        'Ethnicity', 'Gender', 'Race',
        'COVID_status', 'Influenza_status',
        'Immunocompromised_flag',
        'solid_organ_transplant', 'stem_cell_transplant',
        'acute_leukemia', 'chemotherapy',
        'neutropenic_during_admission',
        'levo_during_admission', 'crrt_during_admission', 'ecmo_during_admission',
    ],
    # continuous variables declared non-normal
    nonnormal=['Age', 'Admit_SOFA_score', 'Admit_APS_score'],
    # display labels for the table rows
    rename={
        'Age': 'Age',
        'Ethnicity': 'Ethnicity',
        'Gender': 'Gender',
        'Race': 'Race',
        'COVID_status': 'Admitted with COVID-19',
        'Influenza_status': 'Admitted with Influenza',
        'Immunocompromised_flag': 'Immunocompromised',
        'solid_organ_transplant': 'Solid Organ Transplant',
        'stem_cell_transplant': 'Stem Cell Transplant',
        'acute_leukemia': 'Leukemia',
        'chemotherapy': 'Chemotherapy',
        'neutropenic_during_admission': 'Neutropenic During Admission',
    },
    missing=True,
)
mytable.to_csv("Aspergillus/cohort_demographics.csv")
mytable


In [None]:

# Baseline demographics for `serumgm_over05_overadmission`, keeping the first
# row per patient. No grouping; missing counts are shown.
mytable = TableOne(
    serumgm_over05_overadmission.drop_duplicates(subset='patient'),
    columns=[
        'Age', 'Gender', 'Race', 'Ethnicity',
        'Admit_SOFA_score', 'Admit_APS_score',
        'COVID_status', 'Influenza_status',
        'Immunocompromised_flag',
        'solid_organ_transplant', 'stem_cell_transplant',
        'acute_leukemia', 'chemotherapy',
        'neutropenic_during_admission',
    ],
    categorical=[
        'Ethnicity', 'Gender', 'Race',
        'COVID_status', 'Influenza_status',
        'Immunocompromised_flag',
        'solid_organ_transplant', 'stem_cell_transplant',
        'acute_leukemia', 'chemotherapy',
        'neutropenic_during_admission',
    ],
    # continuous variables declared non-normal
    nonnormal=['Age', 'Admit_SOFA_score', 'Admit_APS_score'],
    # display labels for the table rows
    rename={
        'Age': 'Age',
        'Ethnicity': 'Ethnicity',
        'Gender': 'Gender',
        'Race': 'Race',
        'COVID_status': 'Admitted with COVID-19',
        'Influenza_status': 'Admitted with Influenza',
        'Immunocompromised_flag': 'Immunocompromised',
        'solid_organ_transplant': 'Solid Organ Transplant',
        'stem_cell_transplant': 'Stem Cell Transplant',
        'acute_leukemia': 'Leukemia',
        'chemotherapy': 'Chemotherapy',
        'neutropenic_during_admission': 'Neutropenic During Admission',
    },
    missing=True,
)
mytable.to_csv("Aspergillus/cohort_serumgm05.csv")
mytable

## COVID patients immunocomp

In [None]:
tail.sample()

In [None]:
tail.groupby('COVID_status')['Immunocompromised_flag'].value_counts()

In [None]:
37/(37+172)

In [None]:
tail.loc[tail.grew_asp_during_admission].groupby('COVID_status')['Immunocompromised_flag'].value_counts()

## breakdown grew asp or not

In [None]:
# Characteristics and outcomes from `tail`, split by 'grew_asp_during_admission'.
# p-values are requested; the overall column and missing counts are turned off.
mytable = TableOne(
    tail,
    groupby='grew_asp_during_admission',
    pval=True,
    overall=False,
    missing=False,
    columns=[
        'Age', 'COVID_status', 'Influenza_status', 'Immunocompromised_flag',
        'solid_organ_transplant', 'stem_cell_transplant', 'acute_leukemia', 'chemotherapy',
        'neutropenic_during_admission', 'received_tocilizumab_during_admission',
        'median_PMNs_over_admission',
        'Cumulative_ICU_days', 'Cumulative_intubation_days', 'Tracheostomy_flag',
        'hydrocortisone_equiv_over_admission',
        'received_antifungal_thisadmission', 'days_of_antifungal_bytoday',
        'received_antiasp_thisadmission', 'days_of_antiasp_bytoday',
        'summed_nat_score_to_today', 'summed_days_of_abx_to_today',
        'Discharge_disposition', 'Binary_outcome',
    ],
    categorical=[
        'COVID_status', 'Influenza_status', 'Immunocompromised_flag',
        'solid_organ_transplant', 'stem_cell_transplant', 'acute_leukemia', 'chemotherapy',
        'neutropenic_during_admission', 'received_tocilizumab_during_admission',
        'Tracheostomy_flag',
        'received_antifungal_thisadmission', 'received_antiasp_thisadmission',
        'Discharge_disposition', 'Binary_outcome',
    ],
    # continuous variables declared non-normal
    nonnormal=[
        'Age', 'Cumulative_ICU_days', 'median_PMNs_over_admission',
        'Cumulative_intubation_days', 'hydrocortisone_equiv_over_admission',
        'days_of_antifungal_bytoday', 'days_of_antiasp_bytoday',
        'summed_nat_score_to_today', 'summed_days_of_abx_to_today',
    ],
    # display labels for the table rows
    rename={
        'COVID_status': 'Admitted with COVID-19',
        'Influenza_status': 'Admitted with Influenza',
        'Immunocompromised_flag': 'Immunocompromised',
        'solid_organ_transplant': 'Solid Organ Transplant',
        'stem_cell_transplant': 'Stem Cell Transplant',
        'acute_leukemia': 'Leukemia',
        'chemotherapy': 'Chemotherapy',
        'neutropenic_during_admission': 'Neutropenic During Admission',
        'received_tocilizumab_during_admission': 'Received Tocilizumab',
        'median_PMNs_over_admission': 'Median PMNs over Admission',
        'Cumulative_ICU_days': 'Cumulative ICU Days',
        'Cumulative_intubation_days': 'Cumulative Intubation Days',
        'Tracheostomy_flag': 'Received Tracheostomy',
        'hydrocortisone_equiv_over_admission': 'Steroid dose over ICU admission (in Hydrocortisone Equivalents)',
        'received_antifungal_thisadmission': 'Treated with Antifungals',
        'days_of_antifungal_bytoday': 'Days of Antifungal Therapy',
        'received_antiasp_thisadmission': 'Treated with Anti-Aspergillus Antifungals',
        'days_of_antiasp_bytoday': 'Days of Anti-Aspergillus Antifungal Therapy',
        'summed_nat_score_to_today': 'Summed NAT Score',
        'summed_days_of_abx_to_today': 'Days of Antibiotics Therapy',
        'Discharge_disposition': 'Discharge Disposition',
        'Binary_outcome': 'Unfavorable Outcome',
    },
)
mytable.to_csv("Aspergillus/grew_asp_during_admission.csv")
mytable


In [None]:
tail['risk_factor'] = tail[['Immunocompromised_flag', 'neutropenic_during_admission', 'COVID_status', 'Influenza_status']].any(axis=1)


In [None]:
tail.loc[tail.grew_asp_during_admission==True, ['patient','risk_factor', 'Patient_category','neutropenic_during_admission','Immunocompromised_flag','COVID_status', 'Influenza_status']]


In [None]:
tail.loc[((tail.atleastonebal_gm_greaterthan_08==True)&(tail.received_antifungal_thisadmission==1)), ['patient','risk_factor', 'Patient_category','neutropenic_during_admission','Immunocompromised_flag','COVID_status', 'Influenza_status']]


In [None]:
# Characteristics and outcomes from `tail`, split by 'category_05'.
# p-values are requested; the overall column and missing counts are turned off.
mytable = TableOne(
    tail,
    groupby='category_05',
    pval=True,
    overall=False,
    missing=False,
    columns=[
        'Age', 'COVID_status', 'Influenza_status', 'Immunocompromised_flag',
        'solid_organ_transplant', 'stem_cell_transplant', 'acute_leukemia', 'chemotherapy',
        'neutropenic_during_admission', 'received_tocilizumab_during_admission',
        'median_PMNs_over_admission',
        'Cumulative_ICU_days', 'Cumulative_intubation_days', 'Tracheostomy_flag',
        'hydrocortisone_equiv_over_admission',
        'received_antifungal_thisadmission', 'days_of_antifungal_bytoday',
        'received_antiasp_thisadmission', 'days_of_antiasp_bytoday',
        'summed_nat_score_to_today', 'summed_days_of_abx_to_today',
        'Discharge_disposition', 'Binary_outcome',
    ],
    categorical=[
        'COVID_status', 'Influenza_status', 'Immunocompromised_flag',
        'solid_organ_transplant', 'stem_cell_transplant', 'acute_leukemia', 'chemotherapy',
        'neutropenic_during_admission', 'received_tocilizumab_during_admission',
        'Tracheostomy_flag',
        'received_antifungal_thisadmission', 'received_antiasp_thisadmission',
        'Discharge_disposition', 'Binary_outcome',
    ],
    # continuous variables declared non-normal
    nonnormal=[
        'Age', 'Cumulative_ICU_days', 'median_PMNs_over_admission',
        'Cumulative_intubation_days', 'hydrocortisone_equiv_over_admission',
        'days_of_antifungal_bytoday', 'days_of_antiasp_bytoday',
        'summed_nat_score_to_today', 'summed_days_of_abx_to_today',
    ],
    # display labels for the table rows
    rename={
        'COVID_status': 'Admitted with COVID-19',
        'Influenza_status': 'Admitted with Influenza',
        'Immunocompromised_flag': 'Immunocompromised',
        'solid_organ_transplant': 'Solid Organ Transplant',
        'stem_cell_transplant': 'Stem Cell Transplant',
        'acute_leukemia': 'Leukemia',
        'chemotherapy': 'Chemotherapy',
        'neutropenic_during_admission': 'Neutropenic During Admission',
        'received_tocilizumab_during_admission': 'Received Tocilizumab',
        'median_PMNs_over_admission': 'Median PMNs over Admission',
        'Cumulative_ICU_days': 'Cumulative ICU Days',
        'Cumulative_intubation_days': 'Cumulative Intubation Days',
        'Tracheostomy_flag': 'Received Tracheostomy',
        'hydrocortisone_equiv_over_admission': 'Steroid dose over ICU admission (in Hydrocortisone Equivalents)',
        'received_antifungal_thisadmission': 'Treated with Antifungals',
        'days_of_antifungal_bytoday': 'Days of Antifungal Therapy',
        'received_antiasp_thisadmission': 'Treated with Anti-Aspergillus Antifungals',
        'days_of_antiasp_bytoday': 'Days of Anti-Aspergillus Antifungal Therapy',
        'summed_nat_score_to_today': 'Summed NAT Score',
        'summed_days_of_abx_to_today': 'Days of Antibiotics Therapy',
        'Discharge_disposition': 'Discharge Disposition',
        'Binary_outcome': 'Unfavorable Outcome',
    },
)
mytable.to_csv("Aspergillus/category_05.csv")
mytable


In [None]:
# Characteristics and outcomes from `tail`, split by 'category_08'.
# p-values are requested; the overall column and missing counts are turned off.
mytable = TableOne(
    tail,
    groupby='category_08',
    pval=True,
    overall=False,
    missing=False,
    columns=[
        'Age', 'COVID_status', 'Influenza_status', 'Immunocompromised_flag',
        'solid_organ_transplant', 'stem_cell_transplant', 'acute_leukemia', 'chemotherapy',
        'neutropenic_during_admission', 'received_tocilizumab_during_admission',
        'median_PMNs_over_admission',
        'Cumulative_ICU_days', 'Cumulative_intubation_days', 'Tracheostomy_flag',
        'hydrocortisone_equiv_over_admission',
        'received_antifungal_thisadmission', 'days_of_antifungal_bytoday',
        'received_antiasp_thisadmission', 'days_of_antiasp_bytoday',
        'summed_nat_score_to_today', 'summed_days_of_abx_to_today',
        'Discharge_disposition', 'Binary_outcome',
    ],
    categorical=[
        'COVID_status', 'Influenza_status', 'Immunocompromised_flag',
        'solid_organ_transplant', 'stem_cell_transplant', 'acute_leukemia', 'chemotherapy',
        'neutropenic_during_admission', 'received_tocilizumab_during_admission',
        'Tracheostomy_flag',
        'received_antifungal_thisadmission', 'received_antiasp_thisadmission',
        'Discharge_disposition', 'Binary_outcome',
    ],
    # continuous variables declared non-normal
    nonnormal=[
        'Age', 'Cumulative_ICU_days', 'median_PMNs_over_admission',
        'Cumulative_intubation_days', 'hydrocortisone_equiv_over_admission',
        'days_of_antifungal_bytoday', 'days_of_antiasp_bytoday',
        'summed_nat_score_to_today', 'summed_days_of_abx_to_today',
    ],
    # display labels for the table rows
    rename={
        'COVID_status': 'Admitted with COVID-19',
        'Influenza_status': 'Admitted with Influenza',
        'Immunocompromised_flag': 'Immunocompromised',
        'solid_organ_transplant': 'Solid Organ Transplant',
        'stem_cell_transplant': 'Stem Cell Transplant',
        'acute_leukemia': 'Leukemia',
        'chemotherapy': 'Chemotherapy',
        'neutropenic_during_admission': 'Neutropenic During Admission',
        'received_tocilizumab_during_admission': 'Received Tocilizumab',
        'median_PMNs_over_admission': 'Median PMNs over Admission',
        'Cumulative_ICU_days': 'Cumulative ICU Days',
        'Cumulative_intubation_days': 'Cumulative Intubation Days',
        'Tracheostomy_flag': 'Received Tracheostomy',
        'hydrocortisone_equiv_over_admission': 'Steroid dose over ICU admission (in Hydrocortisone Equivalents)',
        'received_antifungal_thisadmission': 'Treated with Antifungals',
        'days_of_antifungal_bytoday': 'Days of Antifungal Therapy',
        'received_antiasp_thisadmission': 'Treated with Anti-Aspergillus Antifungals',
        'days_of_antiasp_bytoday': 'Days of Anti-Aspergillus Antifungal Therapy',
        'summed_nat_score_to_today': 'Summed NAT Score',
        'summed_days_of_abx_to_today': 'Days of Antibiotics Therapy',
        'Discharge_disposition': 'Discharge Disposition',
        'Binary_outcome': 'Unfavorable Outcome',
    },
)
mytable.to_csv("Aspergillus/category_08.csv")
mytable


In [None]:
# Characteristics and outcomes from `tail`, split by 'category_1'.
# p-values are requested; the overall column and missing counts are turned off.
mytable = TableOne(
    tail,
    groupby='category_1',
    pval=True,
    overall=False,
    missing=False,
    columns=[
        'Age', 'COVID_status', 'Influenza_status', 'Immunocompromised_flag',
        'solid_organ_transplant', 'stem_cell_transplant', 'acute_leukemia', 'chemotherapy',
        'neutropenic_during_admission', 'received_tocilizumab_during_admission',
        'median_PMNs_over_admission',
        'Cumulative_ICU_days', 'Cumulative_intubation_days', 'Tracheostomy_flag',
        'hydrocortisone_equiv_over_admission',
        'received_antifungal_thisadmission', 'days_of_antifungal_bytoday',
        'received_antiasp_thisadmission', 'days_of_antiasp_bytoday',
        'summed_nat_score_to_today', 'summed_days_of_abx_to_today',
        'Discharge_disposition', 'Binary_outcome',
    ],
    categorical=[
        'COVID_status', 'Influenza_status', 'Immunocompromised_flag',
        'solid_organ_transplant', 'stem_cell_transplant', 'acute_leukemia', 'chemotherapy',
        'neutropenic_during_admission', 'received_tocilizumab_during_admission',
        'Tracheostomy_flag',
        'received_antifungal_thisadmission', 'received_antiasp_thisadmission',
        'Discharge_disposition', 'Binary_outcome',
    ],
    # continuous variables declared non-normal
    nonnormal=[
        'Age', 'Cumulative_ICU_days', 'median_PMNs_over_admission',
        'Cumulative_intubation_days', 'hydrocortisone_equiv_over_admission',
        'days_of_antifungal_bytoday', 'days_of_antiasp_bytoday',
        'summed_nat_score_to_today', 'summed_days_of_abx_to_today',
    ],
    # display labels for the table rows
    rename={
        'COVID_status': 'Admitted with COVID-19',
        'Influenza_status': 'Admitted with Influenza',
        'Immunocompromised_flag': 'Immunocompromised',
        'solid_organ_transplant': 'Solid Organ Transplant',
        'stem_cell_transplant': 'Stem Cell Transplant',
        'acute_leukemia': 'Leukemia',
        'chemotherapy': 'Chemotherapy',
        'neutropenic_during_admission': 'Neutropenic During Admission',
        'received_tocilizumab_during_admission': 'Received Tocilizumab',
        'median_PMNs_over_admission': 'Median PMNs over Admission',
        'Cumulative_ICU_days': 'Cumulative ICU Days',
        'Cumulative_intubation_days': 'Cumulative Intubation Days',
        'Tracheostomy_flag': 'Received Tracheostomy',
        'hydrocortisone_equiv_over_admission': 'Steroid dose over ICU admission (in Hydrocortisone Equivalents)',
        'received_antifungal_thisadmission': 'Treated with Antifungals',
        'days_of_antifungal_bytoday': 'Days of Antifungal Therapy',
        'received_antiasp_thisadmission': 'Treated with Anti-Aspergillus Antifungals',
        'days_of_antiasp_bytoday': 'Days of Anti-Aspergillus Antifungal Therapy',
        'summed_nat_score_to_today': 'Summed NAT Score',
        'summed_days_of_abx_to_today': 'Days of Antibiotics Therapy',
        'Discharge_disposition': 'Discharge Disposition',
        'Binary_outcome': 'Unfavorable Outcome',
    },
)
mytable.to_csv("Aspergillus/category_1.csv")
mytable


In [None]:
tail.loc[tail.category_1=='elevated_gm1_nogrowth_by_admission'].groupby('received_antiasp_thisadmission').Binary_outcome.value_counts()

In [None]:

from scipy.stats import chi2_contingency

# Hand-entered counts, one row per (received_antiasp_thisadmission, Binary_outcome)
# combination; the third value in each tuple is the number of patients.
data = pd.DataFrame(
    [
        (0.0, 0, 8),
        (0.0, 1, 8),
        (1.0, 0, 15),
        (1.0, 1, 11),
    ],
    columns=['received_antiasp_thisadmission', 'Binary_outcome', 'count'],
)

data

In [None]:

# Reshape the long-format counts into a 2x2 matrix:
# rows = received_antiasp_thisadmission, columns = Binary_outcome.
contingency_table = (
    data
    .set_index(['received_antiasp_thisadmission', 'Binary_outcome'])['count']
    .unstack('Binary_outcome')
)

# Chi-squared test of independence on that matrix (scipy defaults).
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Show statistic, p-value, degrees of freedom and expected counts.
chi2, p, dof, expected

## (A)	Features from a patient’s first day growing Aspergillus 

In [None]:
# Summary of the rows of `cd` marked first_row_aspergillus_growth == 'Yes'.
# Every listed column is summarised as a non-normal continuous variable
# (the categorical list is explicitly empty); no grouping.
mytable = TableOne(
    cd[cd.first_row_aspergillus_growth == 'Yes'],
    columns=[
        'day_after_first_icu_day', 'days_on_ventilator', 'sum_steroids_bytoday',
        'received_antifungal_thisadmission', 'days_of_antifungal_bytoday',
        'received_antiasp_thisadmission', 'days_of_antiasp_bytoday',
        'summed_nat_score_to_today', 'summed_days_of_abx_to_today',
    ],
    categorical=[],
    nonnormal=[
        'day_after_first_icu_day', 'days_on_ventilator', 'sum_steroids_bytoday',
        'received_antifungal_thisadmission', 'days_of_antifungal_bytoday',
        'received_antiasp_thisadmission', 'days_of_antiasp_bytoday',
        'summed_nat_score_to_today', 'summed_days_of_abx_to_today',
    ],
    rename={},
    missing=False,
    overall=False,
)
mytable.to_csv("Aspergillus/first_day_growth.csv")
mytable


In [None]:
cd[cd.first_row_aspergillus_growth=='Yes'].days_of_antifungal_bytoday.ne(0).sum()

In [None]:
cd[cd.first_row_aspergillus_growth=='Yes'].days_of_antiasp_bytoday.ne(0).sum()

In [None]:
# Summary of the rows of `cd` marked first_elevated_BAL_GM_1 == 'Yes'.
# Every listed column is summarised as a non-normal continuous variable
# (the categorical list is explicitly empty); no grouping.
mytable = TableOne(
    cd[cd.first_elevated_BAL_GM_1 == 'Yes'],
    columns=[
        'day_after_first_icu_day', 'days_on_ventilator', 'sum_steroids_bytoday',
        'received_antifungal_thisadmission', 'days_of_antifungal_bytoday',
        'received_antiasp_thisadmission', 'days_of_antiasp_bytoday',
        'summed_nat_score_to_today', 'summed_days_of_abx_to_today',
    ],
    categorical=[],
    nonnormal=[
        'day_after_first_icu_day', 'days_on_ventilator', 'sum_steroids_bytoday',
        'received_antifungal_thisadmission', 'days_of_antifungal_bytoday',
        'received_antiasp_thisadmission', 'days_of_antiasp_bytoday',
        'summed_nat_score_to_today', 'summed_days_of_abx_to_today',
    ],
    rename={},
    missing=False,
    overall=False,
)
mytable.to_csv("Aspergillus/first_day_elevatedGM1.csv")
mytable


In [None]:
cd[cd.first_elevated_BAL_GM_1=='Yes'].days_of_antifungal_bytoday.ne(0).sum()

In [None]:
cd[cd.first_elevated_BAL_GM_1=='Yes'].days_of_antiasp_bytoday.ne(0).sum()

In [None]:
cd.to_csv("aspergillus_script_10-26-24.csv")

# example patient graphs

## more readable graphs

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Filter data for the specific patient
patient = ****
data = cd[cd.patient == patient]


fig, ax = plt.subplots(figsize=(10, 4))  # Adjusted figure size for better readability


# Plot line for BAL_galactomannan
sns.lineplot(
    data=data,
    x='ICU_day',
    y='BAL_galactomannan',
    color='black',  # Use a neutral color for the line
    linewidth=2,  # Thicker line for better visibility
    ax=ax
)

# Scatter plot for Pathogen_aspergillus_detected
sns.scatterplot(
    data=data,
    x='ICU_day',
    y='BAL_galactomannan',
    hue='Pathogen_aspergillus_detected',
    palette=['skyblue', 'tab:red'],  # Distinct colors for aspergillus detection
    markers=['o', 's'],  # Different markers: circle for no detection, square for detection
    style='Pathogen_aspergillus_detected',  # Use different styles
    s=200,  # Increase marker size
    ax=ax
)

# Scatter plot for received_antifungal_thisday, plotting at y=-0.5
sns.scatterplot(
    data=data,
    x='ICU_day',
    y=-0.4,  # Fixed y-value to show antifungal treatment events
    hue='received_antiasp_antifungals',
    palette=['lightgrey', 'tab:green'],  # Different colors for no/yes antifungal treatment
    markers=['X', 'P'],  # Use 'X' for no treatment, 'P' (plus symbol) for treatment
    style='received_antiasp_antifungals',
    s=150,  # Larger marker size for visibility
    ax=ax
)

# Customize the axis labels
ax.set_ylabel("BAL galactomannan (ODI)", size=18)
ax.set_xlabel("ICU day", size=16)

# Customize y-ticks
plt.yticks([0, 1, 2, 3, 4, 5, 6, 7, 8], fontsize=12)
ax.set_ylim(-1, 8)


plt.tight_layout()  # Automatically adjust the subplot parameters to give space to the legends

# Add separate legends for clarity
handles, labels = ax.get_legend_handles_labels()

# Flip the order of handles and labels for 'Pathogen_aspergillus_detected'
new_labels = ['Grew Aspergillus', 'Did not grow Aspergillus']  # Reversed order
legend1 = ax.legend(handles[:2][::-1], new_labels, bbox_to_anchor=(1.05, 1.0), loc='upper left', title='Aspergillus Detection', fontsize=12)
ax.add_artist(legend1)

# Legend for Antifungal Treatment
new_labels = ['Received anti-Aspergillus Antifungal', 'Did not receive antifungals']  # Reversed order
legend2 = ax.legend(handles[2:][::-1], new_labels,  bbox_to_anchor=(1.05, 0.6), loc='upper left', title='Received Anti-Aspergillus Antifungal', fontsize=12)

# Final adjustments and saving the plot
plt.savefig('Aspergillus/3_****.pdf')
plt.show()


# days of therapy before BALs

In [None]:
balsonly = cd.loc[cd.BAL_performed, ['patient','patient_ir_id','day_bucket_starts','Pathogen_aspergillus_detected','BAL_galactomannan', 'days_of_antifungal_bytoday', 'days_of_antiasp_bytoday']]

In [None]:
balsonly.BAL_galactomannan.describe()


In [None]:
first_bal = balsonly.drop_duplicates(subset='patient')


In [None]:
first_bal

In [None]:
sum(first_bal.days_of_antiasp_bytoday==0)

In [None]:
len(balsonly) - sum(balsonly.days_of_antifungal_bytoday==0)

In [None]:
len(balsonly) - sum(balsonly.days_of_antiasp_bytoday==0)

In [None]:
len(balsonly.loc[balsonly.BAL_galactomannan.notna()])

In [None]:
len(balsonly.loc[balsonly.BAL_galactomannan.notna()]) - sum(balsonly.loc[balsonly.BAL_galactomannan.notna()].days_of_antifungal_bytoday==0)

In [None]:
403/len(balsonly.loc[balsonly.BAL_galactomannan.notna()])

In [None]:
len(balsonly.loc[balsonly.BAL_galactomannan.notna()]) - sum(balsonly.loc[balsonly.BAL_galactomannan.notna()].days_of_antiasp_bytoday==0)

In [None]:
fungitell = cd[['pt_study_id',
 'patient_ir_id',
 'first_icu_date',
 'last_icu_date',
 'day_bucket_starts',
     'serumgm',
 'fungitell',
 'BAL_collection_date',
 'Pathogen_aspergillus_detected',
 'grew_asp_during_admission',
 'gm_greaterthan_1',
 'gm_greaterthan_08',
 'gm_greaterthan_05',
 'atleastonebal_gm_greaterthan_05',
 'atleastonebal_gm_greaterthan_08',
 'atleastonebal_gm_greaterthan_1',

]]

In [None]:
df = cd[[
 'pt_study_id',
 'patient_ir_id',
 'first_icu_date',
 'last_icu_date',
 'BAL_collection_date',
 'gm_greaterthan_1',
 'Pathogen_aspergillus_detected',
 'atleastonebal_gm_greaterthan_1',
 'grew_asp_during_admission',]]

In [None]:
df = df.dropna()

In [None]:
df.loc[df.Pathogen_aspergillus_detected==True,'patient_ir_id']

In [None]:
path = pd.read_csv("script_pathology_10-26-24.csv")

# pathology results

In [None]:
autopsy = path.loc[path.report_description.str.contains('autopsy',case=False, na=False)]

In [None]:
autopsy.loc[autopsy.patient_ir_id.isin(df.loc[df.Pathogen_aspergillus_detected==True,'patient_ir_id'].tolist())] 

In [None]:
surg_path = path.loc[path.report_description.str.contains('surg',case=False, na=False)]

In [None]:
surg_path.loc[surg_path.patient_ir_id.isin(df.loc[df.Pathogen_aspergillus_detected==True,'patient_ir_id'].tolist())] 

In [None]:
grew_asp = df.loc[df.Pathogen_aspergillus_detected==True]
grew_asp

In [None]:
grew_asp.columns

In [None]:
pd.merge(grew_asp, surg_path, how='left', on='patient_ir_id')

# surg path results

In [None]:

# Convert the date columns to datetime. `assign` returns new frames, so the
# conversion is not written into slices of `path` / `df` (which is what
# `surg_path` and `grew_asp` are) and no SettingWithCopyWarning is raised.
surg_path = surg_path.assign(report_date=pd.to_datetime(surg_path['report_date']))
grew_asp = grew_asp.assign(
    first_icu_date=pd.to_datetime(grew_asp['first_icu_date']),
    last_icu_date=pd.to_datetime(grew_asp['last_icu_date']),
)

# Left join: every surgical pathology report, with the Aspergillus-growth rows
# of the same patient attached where they exist.
merged_df = pd.merge(surg_path, grew_asp, on="patient_ir_id", how="left")

# Keep reports dated within [first_icu_date, last_icu_date] (inclusive).
# Reports without a matching grew_asp row have missing ICU dates and drop out here.
filtered_df = merged_df[
    (merged_df['report_date'] >= merged_df['first_icu_date']) &
    (merged_df['report_date'] <= merged_df['last_icu_date'])
]

filtered_df

In [None]:
merged_df['BAL_collection_date'] = pd.to_datetime(merged_df['BAL_collection_date'])

# Keep reports dated no more than 30 days before or after the BAL collection date
# (inclusive on both sides); rows with a missing date on either side are excluded.
filtered_df = merged_df[
    (merged_df['report_date'] - merged_df['BAL_collection_date']).abs() <= pd.Timedelta(days=30)
]

filtered_df

# time before ICU stay started

In [None]:
endpts = pd.read_csv("basic_endpoints.csv")

In [None]:
cd_pts = cd.patient.drop_duplicates().tolist()

In [None]:
# Keep only endpoint rows for patients present in `cd`. Copy explicitly: later
# cells add/overwrite columns on `endpts`, which on a filtered slice triggers
# SettingWithCopyWarning.
endpts = endpts.loc[endpts.pt_study_id.isin(cd_pts)].copy()

In [None]:
endpts['index_icu_start'] = pd.to_datetime(endpts['index_icu_start'])
endpts['admission_datetime'] = pd.to_datetime(endpts['admission_datetime'])

In [None]:
endpts['time_diff'] = (endpts['index_icu_start'] - endpts['admission_datetime'])


In [None]:
endpts['time_diff'].describe()

# blasto

In [None]:
blasto_pt = cd.loc[cd.fungal_results_str.str.contains('Blastomyces', na=False)].patient.tolist()

In [None]:
blasto_pt

# PJP

In [None]:
pjp = pd.read_excel("script1.0 pjp results20241018.xlsx")

In [None]:
# Rows whose result text mentions 'positive' (case-insensitive). na=False treats a
# missing result_txt as "no match"; without it a NaN in the mask makes the boolean
# indexing fail. This matches the other str.contains filters in this notebook.
pjp.loc[pjp.result_txt.str.contains('positive', case=False, na=False)]

In [None]:
pjp.NARRATIVE.value_counts()

In [None]:
# NOTE(review): `pjp_pos` is not assigned anywhere in the visible part of this
# notebook, and a later cell uses the name `pos_pjp` instead — confirm which name
# is intended and where it is defined (a fresh Restart & Run All would otherwise
# stop here). If it holds a list of IDs rather than a single ID, the extra []
# makes isin() test against the list object instead of the individual IDs.
fungitell.loc[fungitell.pt_study_id.isin([pjp_pos])]

# fungemia

In [None]:
bcx = pd.read_excel('blood culture results.xlsx')

In [None]:
bcx

In [None]:
# Number of distinct patients with a blood culture result mentioning 'candida'
# (case-insensitive). na=False treats a missing result_txt as "no match" instead
# of letting a NaN in the mask break the boolean indexing.
bcx.loc[bcx.result_txt.str.contains('candida', case=False, na=False)].pt_study_id.nunique()

In [None]:
# Unique patient IDs with a blood culture result mentioning 'candida'
# (case-insensitive). na=False treats a missing result_txt as "no match" instead
# of letting a NaN in the mask break the boolean indexing.
candidemia = (
    bcx.loc[bcx.result_txt.str.contains('candida', case=False, na=False), 'pt_study_id']
    .drop_duplicates()
    .tolist()
)

In [None]:
candidemia

In [None]:
fungitell.loc[fungitell.pt_study_id.isin(candidemia), ['pt_study_id','day_bucket_starts','fungitell']].dropna()

In [None]:
fungitell.loc[fungitell.fungitell>80].pt_study_id.nunique()

In [None]:
fungitell.loc[fungitell.fungitell>80].drop_duplicates()

In [None]:
# Pool the patient IDs of all "other fungal infection" groups into one flat set.
# isin() must receive the individual IDs: the previous form passed a list whose
# elements were themselves lists, so pt_study_id values were compared against
# whole lists and never matched (no patient was actually excluded).
# Each source may be a collection of IDs or a single scalar ID.
# NOTE(review): `pos_pjp` is not assigned in the visible part of this notebook —
# confirm it is defined before this cell runs.
other_fungus_ids = {
    pt_id
    for ids in (pos_pjp, candidemia, blasto_pt)
    for pt_id in (ids if pd.api.types.is_list_like(ids) else [ids])
}

# fungitell rows for patients in none of those groups
not_other_fungus = fungitell.loc[~fungitell.pt_study_id.isin(other_fungus_ids)]

In [None]:
not_other_fungus.groupby('grew_asp_during_admission').fungitell.describe()

In [None]:
not_other_fungus.groupby('atleastonebal_gm_greaterthan_05')['fungitell'].describe()
