In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
from scipy.stats import mannwhitneyu
from itertools import combinations
from statannotations.Annotator import Annotator
import matplotlib as mpl

from tableone import TableOne, load_dataset

# Plot styling. `sns.set` is an alias of `sns.set_theme`, and every call resets
# context, style and font scale to that call's defaults unless they are passed
# explicitly. Calling them one after another therefore leaves only the last
# call's settings in force, so all three go into a single call.
sns.set_theme(context='talk', style='white', font_scale=1.2)

# Let wide / long frames print in full when inspected interactively.
pd.options.display.max_rows = 300
pd.options.display.max_columns = 100


# Load in data and simple renaming


In [None]:
data = pd.read_csv("dataset.csv.gz", index_col=0)

# Fold 'Indeterminate' (the few episodes where no cell count was sent) and
# 'Culture-negative' into the single etiology label 'Micro-negative'.
micro_negative_aliases = ['Indeterminate', 'Culture-negative']
data.loc[data['Episode_etiology'].isin(micro_negative_aliases), 'Episode_etiology'] = 'Micro-negative'

# Non-pneumonia control (NPC) episodes get their own etiology label.
is_npc_episode = data['Episode_category'] == 'Non-PNA-ctrl'
data.loc[is_npc_episode, 'Episode_etiology'] = 'Non-Pneumonia Control'

# Labeling different ANC cutoffs

In [None]:
# Flag every row at three ANC cutoffs, comparing Neutrophils_x to each one.
# A missing Neutrophils_x compares as False, so such rows are flagged as
# not neutropenic rather than left missing.
for cutoff_label, cutoff in (('1500', 1.5), ('1000', 1.0), ('500', 0.5)):
    data[f'was_neutropenic_{cutoff_label}'] = (data.Neutrophils_x < cutoff).to_numpy()


In [None]:
# Collapse the rows of each patient into one summary row, then attach those
# per-patient values back onto every row of that patient.
WBC_overadmission = (
    data.groupby('patient')
    .agg(
        Neutrophils_median_overadmission=('Neutrophils_x', 'median'),
        WBC_count_median_overadmission=('WBC_count', 'median'),
        # max over booleans: True if any row of the patient was flagged
        was_neutropenic_1500_overadmission=('was_neutropenic_1500', 'max'),
        was_neutropenic_1000_overadmission=('was_neutropenic_1000', 'max'),
        was_neutropenic_500_overadmission=('was_neutropenic_500', 'max'),
        bal_pct_neutro_median_overadmission=('bal_pct_neutro', 'median'),
    )
    .reset_index()
)
data = data.merge(WBC_overadmission, how='left', on='patient')


## Creating immunocompetent, immunocompromised w/o neutro & neutropenic BY DAY

In [None]:
def annotate_immunocomp_today(row):
    """Classify a single row by its immune state on that row.

    Args:
        row: object exposing ``was_neutropenic_1500`` and
            ``Imuunocompromised_flag`` attributes (e.g. a DataFrame row).

    Returns:
        'Neutropenic', 'Immunocompromised without neutropenia',
        'Immunocompetent', or 'Other' when neither flag comparison matches.
    """
    # The explicit `== True` / `== False` comparisons are deliberate: a NaN
    # flag equals neither, so it falls through to 'Other'.
    if row.was_neutropenic_1500 == True:
        return 'Neutropenic'

    compromised = row.Imuunocompromised_flag
    if compromised == True and row.was_neutropenic_1500 == False:
        return 'Immunocompromised without neutropenia'
    if compromised == False:
        return 'Immunocompetent'
    return 'Other'
    
# Label every row with its immune state, then show how many rows fall in each
# label. iterrows() visits rows one at a time in Python, so this is slow on
# large frames.
data['immunocomp_today'] = [annotate_immunocomp_today(row) for index,row in data.iterrows()]
data.immunocomp_today.value_counts()

In [None]:
# Row-level tally of the immunocompromised flag; value_counts() leaves NaN out by default.
data.Imuunocompromised_flag.value_counts()

In [None]:
# Row-level tally of the ANC < 1.5 neutropenia flag.
data.was_neutropenic_1500.value_counts()

In [None]:
# breaking out types of immunocompromise 

def create_additional_columns(df):
    """Add one boolean indicator column per immunocompromise category.

    Each indicator is True where the free-text ``type_immunocomp`` value
    contains the category phrase (case-insensitive, literal match). Rows whose
    ``type_immunocomp`` is missing get False instead of NaN, so the indicators
    are plain booleans and need no later back-filling.

    Args:
        df: DataFrame with a string column ``type_immunocomp``.

    Returns:
        The same ``df`` object, modified in place with the columns
        ``solid_organ_transplant``, ``stem_cell_transplant``,
        ``acute_leukemia`` and ``chemotherapy`` added or overwritten.
    """
    category_phrases = {
        'solid_organ_transplant': 'Solid organ transplant',
        'stem_cell_transplant': 'Stem cell transplant',
        'acute_leukemia': 'Acute leukemia',
        'chemotherapy': 'Myelosuppressive chemotherapy',
    }
    for column, phrase in category_phrases.items():
        df[column] = df['type_immunocomp'].str.contains(
            phrase, case=False, regex=False, na=False
        )

    return df

# Derive the immunocompromise-type indicator columns on `data`.
data = create_additional_columns(data)

# Any indicator value that is still missing is treated as False.
immunocomp_type_columns = ['solid_organ_transplant', 'stem_cell_transplant', 'acute_leukemia', 'chemotherapy']
data[immunocomp_type_columns] = data[immunocomp_type_columns].fillna(False)

# Table One

In [None]:
# One row per patient: the first row encountered for each `patient` value.
single = data.drop_duplicates(subset=['patient'], keep='first')

In [None]:
# Variables shown in Table One, in display order.
table_columns = [
    'Age', 'Gender', 'Imuunocompromised_flag',
    'solid_organ_transplant', 'stem_cell_transplant', 'acute_leukemia', 'chemotherapy',
    'WBC_count_median_overadmission', 'Neutrophils_median_overadmission',
    'bal_pct_neutro_median_overadmission',
    'Cumulative_ICU_days', 'Binary_outcome',
]
categorical_columns = [
    'Gender', 'Imuunocompromised_flag',
    'solid_organ_transplant', 'stem_cell_transplant', 'acute_leukemia', 'chemotherapy',
    'Binary_outcome',
]
# Passed as `nonnormal` so TableOne treats these as non-normally distributed.
nonnormal_columns = [
    'Age', 'Cumulative_ICU_days', 'WBC_count_median_overadmission',
    'Neutrophils_median_overadmission', 'bal_pct_neutro_median_overadmission',
]
# NOTE(review): 'was_neutropenic_1500_overadmission' has a display name here but
# is not among the table columns above.
display_names = {
    'Imuunocompromised_flag' : 'Immunocompromised',
    'solid_organ_transplant' : 'Solid Organ Transplant',
    'stem_cell_transplant' : 'Stem Cell Transplant',
    'acute_leukemia' : 'Acute Leukemia',
    'chemotherapy' : 'Chemotherapy',
    'WBC_count_median_overadmission' : 'WBC Count',
    'Neutrophils_median_overadmission' : 'Neutrophil Count',
    'bal_pct_neutro_median_overadmission' : 'BAL % Neutrophils',
    'Cumulative_ICU_days' : 'Cumulative ICU Days',
    'Binary_outcome' : 'Unfavorable Outcome',
    'was_neutropenic_1500_overadmission' : 'Neutropenic',
}

# Stratification (groupby='new_immunocomp', pval=True) is currently disabled.
mytable = TableOne(
    single,
    columns=table_columns,
    categorical=categorical_columns,
    nonnormal=nonnormal_columns,
    overall=False,
    rename=display_names,
)
mytable


In [None]:
# Row-level tally of the bal_type values (NaN is left out by default).
data.bal_type.value_counts()

In [None]:
# Row-level tally of the etiology labels, i.e. after the recoding done at load time.
data.Episode_etiology.value_counts()

In [None]:
# Summary of the etiology labels (non-null count, distinct labels, most frequent label).
data.Episode_etiology.describe()

In [None]:
# Patient-level tally (`single` holds one row per patient): True means at least
# one row of that patient had ANC < 1.5.
single.was_neutropenic_1500_overadmission.value_counts()

In [None]:
# Patient-level tally of the immunocompromised flag, read from each patient's first row.
single.Imuunocompromised_flag.value_counts()

In [None]:
def annotate_immunocomp_admission(row):
    """Classify a single row by the patient's admission-level immune state.

    Args:
        row: object exposing ``was_neutropenic_1500_overadmission`` and
            ``Imuunocompromised_flag`` attributes (e.g. a DataFrame row).

    Returns:
        'Neutropenic during admission',
        'Immunocompromised without neutropenia during admission',
        'Immunocompetent', or 'Other' when neither flag comparison matches.
    """
    # The explicit `== True` / `== False` comparisons are deliberate: a NaN
    # flag equals neither, so it falls through to 'Other'.
    if row.was_neutropenic_1500_overadmission == True:
        return 'Neutropenic during admission'

    compromised = row.Imuunocompromised_flag
    if compromised == True and row.was_neutropenic_1500_overadmission == False:
        return 'Immunocompromised without neutropenia during admission'
    if compromised == False:
        return 'Immunocompetent'
    return 'Other'
    
# Label every row with the patient's admission-level immune state, then count
# patients per label (one row kept per patient). iterrows() visits rows one at
# a time in Python, so this is slow on large frames.
data['immunocomp_admission'] = [annotate_immunocomp_admission(row) for index,row in data.iterrows()]
data.drop_duplicates(subset='patient').immunocomp_admission.value_counts()

In [None]:
# Distribution of bal_pct_neutro on rows with bacterial etiology and ANC < 1.5.
data[((data.Episode_etiology=='Bacterial') & (data.was_neutropenic_1500==True))]['bal_pct_neutro'].describe()

In [None]:
# NOTE(review): this expression is identical to the cell directly above it;
# one of the two can be removed.
data[((data.Episode_etiology=='Bacterial') & (data.was_neutropenic_1500==True))]['bal_pct_neutro'].describe()

# Plotting


In [None]:
# Relabel the boolean flag for use as a plot legend. Values other than
# True/False (NaN, or labels written by an earlier run of this cell) pass
# through unchanged, so re-running the cell does not wipe the column to NaN.
pathogen_labels = {True: 'Detected', False: 'Not Detected'}
data['pathogen_bacteria'] = data['pathogen_bacteria'].map(
    lambda value: pathogen_labels.get(value, value)
)


In [None]:
fig, ax = plt.subplots(figsize = (12, 6))

# Marker colours for the two pathogen_bacteria hue levels.
# NOTE(review): no hue_order is given, so colours are assigned in the order the
# levels first appear in the data — confirm which level ends up firebrick.
custom_palette = ["firebrick", "black"]

x = 'bal_pct_neutro'
y = 'Neutrophils_x'

# The correlation and the regression fit use only rows where both values exist.
paired_values = data[[x, y]].dropna()
corr, p = scipy.stats.spearmanr(paired_values[x], paired_values[y])

# Regression line (with small markers) first, then the hue-coloured points on top.
sns.regplot(data=paired_values, x=x, y=y, scatter_kws={'s': 10}, ax=ax)
sns.scatterplot(data=data, x=x, y=y, hue='pathogen_bacteria', palette=custom_palette, ax=ax)

# Report the p-value that was actually computed instead of a fixed string.
p_text = 'p<0.001' if p < 0.001 else f'p={p:.3f}'
ax.text(0.5, .95, f'Spearman Correlation: {corr:.2f}, {p_text} \n ', ha='center', va='center', transform=ax.transAxes)

# Horizontal reference at 1.5, the same cutoff used for was_neutropenic_1500.
ax.axhline(y=1.5)

ax.set_title("BAL % Neutrophils & Peripheral Neutrophils")
ax.set_xlabel("BAL % Neutrophils")
ax.set_ylabel("Peripheral Neutrophils")
ax.legend(title='Bacterial Pathogen')

# Applies to figures created after this point; it does not restyle the one above.
sns.set_theme(style="white")

fig.savefig('BALpercent_periphPMNs.pdf')

In [None]:
# Distribution of bal_pct_neutro on the rows flagged neutropenic (ANC < 1.5).
data.loc[data.was_neutropenic_1500, 'bal_pct_neutro'].describe()

In [None]:
# (rows, columns) of the subset that is flagged neutropenic and has bal_pct_neutro below 50.
data.loc[(data.was_neutropenic_1500) & (data.bal_pct_neutro<50)].shape

In [None]:
fig, ax = plt.subplots(figsize=(4, 4))

# bal_pct_neutro for the neutropenic rows only, with a reference line at 50.
neutropenic_rows = data[data.was_neutropenic_1500]
sns.stripplot(data=neutropenic_rows, y='bal_pct_neutro', ax=ax)

ax.axhline(y=50)

In [None]:
# Display order of the hue levels (immune state) and of the x-axis categories.
states_order = ['Immunocompetent', 'Immunocompromised without neutropenia', 'Neutropenic']
subcat_order = ["Bacterial", "Viral", "Bacterial/viral", "Micro-negative", "Non-Pneumonia Control"]

subcat_palette = sns.color_palette("pastel",n_colors=5) #placeholder since not used
# sns.set_palette() returns None, so the palette object is built with
# color_palette() and then registered as the default colour cycle, which is the
# side effect the set_palette call had before.
states_palette = sns.color_palette(["silver", "orange", "firebrick"])
sns.set_palette(states_palette)

x = 'Episode_etiology'
hue = 'immunocomp_today'

# Compare every pair of immune states within each etiology. Only levels that
# are both plotted (listed in the order lists above) and present in the data
# are used; a level outside the plot order, such as 'Other', would otherwise
# produce pairs the annotator cannot find on the axes.
present_etiologies = set(data[x].dropna().unique())
present_states = set(data[hue].dropna().unique())
state_pairs = list(combinations([s for s in states_order if s in present_states], 2))

pair_list = [
    [(etiology, first_state), (etiology, second_state)]
    for etiology in subcat_order if etiology in present_etiologies
    for first_state, second_state in state_pairs
]
pair_list

In [None]:

fig, ax = plt.subplots(figsize = (12, 6))

# The Annotator needs the same data / order / hue settings as the boxplot so it
# can locate each box, hence one shared dict of keyword arguments.
hue_plot_params = {
    'data': data,
    'x': 'Episode_etiology',
    'y': 'bal_pct_neutro',
    "order": subcat_order,
    "hue": "immunocomp_today",
    "hue_order": states_order,
    "palette": states_palette
}
pairs = pair_list

ax = sns.boxplot(ax=ax, **hue_plot_params)

# Mann-Whitney test for each pair; only significant pairs are drawn.
annotator = Annotator(ax, pairs, **hue_plot_params)
annotator.configure(test="Mann-Whitney", verbose=2,text_format='simple',show_test_name=False, hide_non_significant=True).apply_and_annotate()

ax.legend(bbox_to_anchor=(.5, 1.15), loc='center', borderaxespad=0)

ax.set_title("BAL percent neutrophils by episode etiology in immunocompetent, immunocompromised, and neutropenic patients")
ax.set_xlabel("Episode etiology")
ax.set_ylabel("BAL percent neutrophils")

# Save before showing: some backends release the figure once it has been
# shown, so writing the file first is the order that works everywhere.
fig.savefig('Categories_PMMNs.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Row-level tally of the per-row immune-state labels.
data.immunocomp_today.value_counts()

In [None]:
# Distribution of bal_pct_neutro: bacterial etiology, rows labelled 'Neutropenic'.
data[((data.Episode_etiology=='Bacterial') & (data.immunocomp_today=='Neutropenic'))]['bal_pct_neutro'].describe()

In [None]:
# The individual bacterial / 'Neutropenic' rows behind the summary, with the
# patient, day bucket, BAL value and peripheral neutrophil value of each.
data[((data.Episode_etiology=='Bacterial') & (data.immunocomp_today=='Neutropenic'))][['patient', 'day_bucket_starts','bal_pct_neutro', 'Neutrophils_x']]

In [None]:
# Distribution of bal_pct_neutro: bacterial etiology, rows labelled 'Immunocompetent'.
data[((data.Episode_etiology=='Bacterial') & (data.immunocomp_today=='Immunocompetent'))]['bal_pct_neutro'].describe()

In [None]:
# Distribution of bal_pct_neutro: bacterial etiology, rows labelled
# 'Immunocompromised without neutropenia'.
data[((data.Episode_etiology=='Bacterial') & (data.immunocomp_today=='Immunocompromised without neutropenia'))]['bal_pct_neutro'].describe()

In [None]:
# Compare bal_pct_neutro between two immune-state groups within one etiology.
etiology = 'Bacterial'
group1_label = 'Immunocompromised without neutropenia'
group2_label = 'Immunocompetent'
alpha = 0.05  # conventional threshold; used only to word the sentence below

in_etiology = data.Episode_etiology == etiology
group1 = data.loc[in_etiology & (data.immunocomp_today == group1_label), 'bal_pct_neutro'].dropna()
group2 = data.loc[in_etiology & (data.immunocomp_today == group2_label), 'bal_pct_neutro'].dropna()

# The alternative is stated explicitly because mannwhitneyu's default has not
# been the same in every SciPy release.
mannwhitney_result = mannwhitneyu(group1, group2, alternative='two-sided')

# Median and interquartile range of each group
median_group1 = group1.median()
q1_group1, q3_group1 = group1.quantile(0.25), group1.quantile(0.75)
median_group2 = group2.median()
q1_group2, q3_group2 = group2.quantile(0.25), group2.quantile(0.75)

# Report the result as a sentence naming the groups that were compared.
finding = 'a significant difference' if mannwhitney_result.pvalue < alpha else 'no significant difference'
result_sentence = (
    f"The Mann-Whitney U test revealed {finding} in BAL percent neutrophils between \n"
    f"{group1_label.lower()} patients with {etiology.lower()} pneumonia (median [q1, q3]: {median_group1:.2f} [{q1_group1:.2f}, {q3_group1:.2f}]) \n"
    f"and {group2_label.lower()} patients with {etiology.lower()} pneumonia (median [q1, q3]: {median_group2:.2f} [{q1_group2:.2f}, {q3_group2:.2f}]), \n"
    f"U statistic = {mannwhitney_result.statistic:.2f}, p-value = {mannwhitney_result.pvalue:.4f}."
)

print(result_sentence)

In [None]:
# Compare bal_pct_neutro between two immune-state groups within one etiology.
etiology = 'Bacterial'
group1_label = 'Neutropenic'
group2_label = 'Immunocompetent'
alpha = 0.05  # conventional threshold; used only to word the sentence below

in_etiology = data.Episode_etiology == etiology
group1 = data.loc[in_etiology & (data.immunocomp_today == group1_label), 'bal_pct_neutro'].dropna()
group2 = data.loc[in_etiology & (data.immunocomp_today == group2_label), 'bal_pct_neutro'].dropna()

# The alternative is stated explicitly because mannwhitneyu's default has not
# been the same in every SciPy release.
mannwhitney_result = mannwhitneyu(group1, group2, alternative='two-sided')

# Median and interquartile range of each group
median_group1 = group1.median()
q1_group1, q3_group1 = group1.quantile(0.25), group1.quantile(0.75)
median_group2 = group2.median()
q1_group2, q3_group2 = group2.quantile(0.25), group2.quantile(0.75)

# Report the result as a sentence naming the groups that were compared.
finding = 'a significant difference' if mannwhitney_result.pvalue < alpha else 'no significant difference'
result_sentence = (
    f"The Mann-Whitney U test revealed {finding} in BAL percent neutrophils between \n"
    f"{group1_label.lower()} patients with {etiology.lower()} pneumonia (median [q1, q3]: {median_group1:.2f} [{q1_group1:.2f}, {q3_group1:.2f}]) \n"
    f"and {group2_label.lower()} patients with {etiology.lower()} pneumonia (median [q1, q3]: {median_group2:.2f} [{q1_group2:.2f}, {q3_group2:.2f}]), \n"
    f"U statistic = {mannwhitney_result.statistic:.2f}, p-value = {mannwhitney_result.pvalue:.4f}."
)

print(result_sentence)

In [None]:
# Compare bal_pct_neutro between two immune-state groups within one etiology.
etiology = 'Bacterial'
group1_label = 'Neutropenic'
group2_label = 'Immunocompromised without neutropenia'
alpha = 0.05  # conventional threshold; used only to word the sentence below

in_etiology = data.Episode_etiology == etiology
group1 = data.loc[in_etiology & (data.immunocomp_today == group1_label), 'bal_pct_neutro'].dropna()
group2 = data.loc[in_etiology & (data.immunocomp_today == group2_label), 'bal_pct_neutro'].dropna()

# The alternative is stated explicitly because mannwhitneyu's default has not
# been the same in every SciPy release.
mannwhitney_result = mannwhitneyu(group1, group2, alternative='two-sided')

# Median and interquartile range of each group
median_group1 = group1.median()
q1_group1, q3_group1 = group1.quantile(0.25), group1.quantile(0.75)
median_group2 = group2.median()
q1_group2, q3_group2 = group2.quantile(0.25), group2.quantile(0.75)

# Report the result as a sentence naming the groups that were compared.
finding = 'a significant difference' if mannwhitney_result.pvalue < alpha else 'no significant difference'
result_sentence = (
    f"The Mann-Whitney U test revealed {finding} in BAL percent neutrophils between \n"
    f"{group1_label.lower()} patients with {etiology.lower()} pneumonia (median [q1, q3]: {median_group1:.2f} [{q1_group1:.2f}, {q3_group1:.2f}]) \n"
    f"and {group2_label.lower()} patients with {etiology.lower()} pneumonia (median [q1, q3]: {median_group2:.2f} [{q1_group2:.2f}, {q3_group2:.2f}]), \n"
    f"U statistic = {mannwhitney_result.statistic:.2f}, p-value = {mannwhitney_result.pvalue:.4f}."
)

print(result_sentence)