In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Global plot styling: dark grid, 'paper' context, fonts scaled by 1.75.
sns.set_theme(context='paper', style="darkgrid", font_scale=1.75)

# Data preparation

In [None]:
# Outcomes are indexed by their first column.
outcomes = pd.read_csv('data/eicu/outcomes_first_day.csv', index_col=0)

# Labs use the first two columns as index and a two-row header; only the
# inner header level is kept as column names.
labs = pd.read_csv(
    'data/eicu/labs_first_day.csv',
    index_col=[0, 1],
    header=[0, 1],
).droplevel(0, axis=1)

In [None]:
# For the paper's tables: uncomment the lines below to load the subselection saved at the end of this notebook
# labs = pd.read_csv('data/eicu/labs_first_day_subselection.csv', index_col = [0, 1])
# outcomes = pd.read_csv('data/eicu/outcomes_first_day_subselection.csv', index_col = 0)

### Analysis

In [None]:
# Display the loaded lab table (the cell's last expression is rendered by the notebook).
labs

In [None]:
# Mean number of recorded values per patient for each lab; plot the 20 largest.
mean_count_per_lab = labs.groupby('Patient').count().mean()
mean_count_per_lab.sort_values().iloc[-20:].plot.barh(figsize=(5, 7))

### How many lab events (rows with at least one result) per patient in this period?

In [None]:
# A row counts as an event when at least one lab value in it is present.
row_has_value = labs.notna().any(axis=1)
number_events = row_has_value.groupby('Patient').sum()
number_events.describe()

### In-hospital mortality

In [None]:
# Overall mortality in percent, then the mean of `Death` per teaching status.
overall_mortality = outcomes.Death.mean()
print("In hospital mortality: {:.2f} %".format(100 * overall_mortality))
outcomes.groupby('teachingstatus')[['Death']].mean()

# Analysis per admission day

In [None]:
# Preprocess ethnicity: rename two labels, then pool everything outside the
# four main groups (missing values included) under 'Other'.
main_groups = ["Asian", "Hispanic", "Black", "White"]
ethnicity = outcomes['ethnicity'].replace({"Caucasian": 'White', 'African American': 'Black'})
ethnicity = ethnicity.where(ethnicity.isin(main_groups), 'Other')

outcomes['ethnicity'] = ethnicity

In [None]:
# Split the outcomes on the teaching-status flag ('t' / 'f') ...
is_teaching = outcomes['teachingstatus'] == 't'
is_nonteaching = outcomes['teachingstatus'] == 'f'
teaching, nonteaching = outcomes[is_teaching], outcomes[is_nonteaching]

# ... and select the lab rows of the patients in each group.
labs_t, labs_f = labs.loc[teaching.index], labs.loc[nonteaching.index]

In [None]:
# Cohort description for the whole population and for each institution type.
# Every table has the columns 'Population', 'Non Teaching', 'Teaching'.

# Mean and standard deviation of the `Time` column.
los_mean_t = teaching['Time'].mean()
los_std_t = teaching['Time'].std()
los_mean_f = nonteaching['Time'].mean()
los_std_f = nonteaching['Time'].std()
los = pd.DataFrame({'Mean': [outcomes['Time'].mean(), los_mean_f, los_mean_t],
                    'Std': [outcomes['Time'].std(), los_std_f, los_std_t]},
                   index=['Population', 'Non Teaching', 'Teaching']).T

# Mean of the `Death` column, in percent.
death_t = teaching['Death'].mean()
death_f = nonteaching['Death'].mean()
death = pd.DataFrame({'': [outcomes['Death'].mean(), death_f, death_t]},
                     index=['Population', 'Non Teaching', 'Teaching']).T * 100

# Ethnicity shares in percent. The labels are read from `outcomes` rather than
# from the `ethnicity` variable, because that name is rebound to the summary
# table below; reading the column keeps this cell re-runnable.
ethnicity_labels = outcomes['ethnicity']
ethnicity_t = ethnicity_labels.loc[teaching.index].value_counts() / len(teaching.index)
ethnicity_f = ethnicity_labels.loc[nonteaching.index].value_counts() / len(nonteaching.index)
# `axis` must be passed by keyword: recent pandas no longer accepts it positionally.
ethnicity = pd.concat({"Population": ethnicity_labels.value_counts() / len(ethnicity_labels),
                       "Non Teaching": ethnicity_f,
                       "Teaching": ethnicity_t}, axis=1) * 100

# Gender shares in percent (denominator is the full group size, missing included).
gender_t = teaching['gender'].value_counts() / len(teaching)
gender_f = nonteaching['gender'].value_counts() / len(nonteaching)
gender = pd.concat({"Population": outcomes['gender'].value_counts() / len(outcomes),
                    "Non Teaching": gender_f,
                    "Teaching": gender_t}, axis=1) * 100

In [None]:
# Stack the four summary tables under one outer key each, print the LaTeX
# version, and display the frame.
sections = {'LOS': los, 'Death': death, 'Gender': gender, 'Ethnicity': ethnicity}
analysis = pd.concat(sections)
latex_table = analysis.to_latex(float_format="{:0.2f}".format)
print(latex_table)
analysis

In [None]:
# Per-patient aggregates: number of recorded values and mean value per lab.
per_patient = labs.groupby('Patient')
patient_counts = per_patient.count()
patient_means = per_patient.mean()

# Mean and standard deviation of those aggregates across patients.
count_mean, count_std = patient_counts.mean(), patient_counts.std()
value_mean, value_std = patient_means.mean(), patient_means.std()

In [None]:
# One row per lab; two-level columns: (statistic family, statistic).
summary_columns = {
    ('Number Test', 'Mean'): count_mean,
    ('Number Test', 'Std'): count_std,
    ('Value', 'Mean'): value_mean,
    ('Value', 'Std'): value_std,
}
tests = pd.DataFrame(summary_columns)
print(tests.to_latex(float_format="{:0.2f}".format))
tests

In [None]:
def _lab_summary(frame):
    """Summarise a lab table across patients.

    Values are first aggregated per patient (grouping on the 'Patient' index
    level): the number of recorded values and the mean value of each lab.
    Returns, in this order, the across-patient mean and standard deviation of
    the counts, then the mean and standard deviation of the per-patient means.
    """
    per_patient = frame.groupby('Patient')
    counts, means = per_patient.count(), per_patient.mean()
    return counts.mean(), counts.std(), means.mean(), means.std()


# Same summary for non-teaching (`_f`) and teaching (`_t`) institutions.
count_mean_f, count_std_f, value_mean_f, value_std_f = _lab_summary(labs_f)
count_mean_t, count_std_t, value_mean_t, value_std_t = _lab_summary(labs_t)

In [None]:
import scipy.stats

In [None]:
def _ttest_p_values(non_teaching_values, teaching_values):
    """Return the two-sample t-test p-value of every lab, indexed by lab name.

    Both arguments hold one row per patient and one column per lab; missing
    entries are dropped column by column before the test. Returning a Series
    indexed by lab name lets the DataFrame constructor align the p-values with
    the other columns by label instead of by position.
    """
    p_values = [
        scipy.stats.ttest_ind(non_teaching_values[lab].dropna(), teaching_values[lab].dropna())[1]
        for lab in teaching_values.columns
    ]
    return pd.Series(p_values, index=teaching_values.columns)


# Per-patient aggregates are computed once per group rather than once per lab column.
patient_counts_f = labs_f.groupby('Patient').count()
patient_counts_t = labs_t.groupby('Patient').count()
patient_means_f = labs_f.groupby('Patient').mean()
patient_means_t = labs_t.groupby('Patient').mean()

tests = pd.DataFrame({
    ('Number Test', 'Teaching', 'Mean'): count_mean_t,
    ('Number Test', 'Non Teaching', 'Mean'): count_mean_f,
    ('Number Test', 'Test', 'P Value'): _ttest_p_values(patient_counts_f, patient_counts_t),
    ('Value', 'Teaching', 'Mean'): value_mean_t,
    ('Value', 'Non Teaching', 'Mean'): value_mean_f,
    ('Value', 'Test', 'P Value'): _ttest_p_values(patient_means_f, patient_means_t),
})

# Rows of the LaTeX table are ordered by increasing mean number of values per patient.
lab_order = labs.groupby('Patient').count().mean().sort_values().index
print(tests.loc[lab_order].to_latex(float_format="{:0.3f}".format))
tests

In [None]:
# Total number of recorded lab values per patient (summed over all labs).
total_tests = labs.groupby('Patient').count().sum(axis=1).to_frame(name='Total tests ordered')

# Readable labels for the two grouping variables used in the plot.
institution = outcomes.teachingstatus.rename("Institution").replace({'t': "Teaching", 'f': "Non teaching"})
outcome = outcomes.Death.rename("Outcome").replace({True: "Death", False: "Discharge"})

ordered_test = total_tests.join(institution).join(outcome)

In [None]:
# The figure size is a property of the figure, not of the plot call:
# `violinplot` has no `fig_size` parameter, so the size is set here instead.
# NOTE(review): 10x5 inches at dpi=1000 is a very large raster; lower the dpi
# if rendering is slow.
plt.figure(figsize=(10, 5), dpi=1000)
ax = sns.violinplot(x="Outcome", y="Total tests ordered", hue='Institution', data=ordered_test,
                    split=True, inner='quartile', palette="Blues", cut=0)
# Legend is placed outside the axes, to the right.
plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), title='Institution')
plt.ylim(15, 60)

# Analyze available labs

Subselect features to use.

In [None]:
from FeatureAnalysis.analysis.eclat import eclat
from FeatureAnalysis.analysis.rendering import buildGraph

In [None]:
# Availability indicator per patient and lab: 1.0 when the patient has at
# least one recorded value for the lab, NaN otherwise. It is built in one
# expression so the integer count frame is never mutated in place and the
# result always has a float dtype.
lab_counts = labs.groupby('Patient').count()
lab_measured = lab_counts > 0
missing_data = lab_measured.astype(float).where(lab_measured)

In [None]:
# Display the per-patient availability indicator built in the previous cell.
missing_data

In [None]:
# Run eclat on the availability indicator (minCount = 80000), print the
# result, then build the graph of it and render it to 'examples/small'.
features = eclat(missing_data, minCount=80000)
print(features)
feature_graph = buildGraph(features, mainBranch=True)
feature_graph.render('examples/small')

In [None]:
def next_feature(node):
    """Collect node names along the first-child path, from `node` down to a leaf.

    Starting at `node`, repeatedly step to `children[0]` until a node without
    children is reached; the names are returned in visiting order.
    """
    names = [node.name]
    current = node
    while len(current.children) != 0:
        current = current.children[0]
        names.append(current.name)
    return names

In [None]:
# Keep only the patients who have at least one recorded value for every
# selected lab (a patient missing any one of them is dropped).
selected_labs = next_feature(features.children[0])
counts_per_patient = labs[selected_labs].groupby('Patient').count()
has_every_lab = (counts_per_patient > 0).all(axis=1)
selection = has_every_lab.index[has_every_lab]

In [None]:
# Preview: lab rows of the selected patients, restricted to the selected labs.
row_in_selection = labs.index.get_level_values('Patient').isin(selection)
labs.loc[row_in_selection, next_feature(features.children[0])]

In [None]:
# Lab rows of the selected patients, restricted to the selected labs; rows
# left without any value are dropped.
row_in_selection = labs.index.get_level_values('Patient').isin(selection)
labs_subselection = (
    labs[row_in_selection][next_feature(features.children[0])]
    .dropna(how='all')
)

# Outcomes of the patients that still have at least one lab row.
kept_patients = labs_subselection.index.get_level_values('Patient')
outcomes_subselection = outcomes[outcomes.index.get_level_values('Patient').isin(kept_patients)]

In [None]:
# Display the outcomes restricted to the selected patients.
outcomes_subselection

In [None]:
# Update the last observation time now that potentially missing tests have been removed.
# For every lab row: the patient's `Time` minus the row's second index level;
# the last such value per patient is kept.
lab_patients = labs_subselection.index.get_level_values(0)
lab_times = labs_subselection.index.get_level_values(1)
remaining = (outcomes_subselection.Time.loc[lab_patients] - lab_times).groupby('Patient').last()

# `assign` returns a new frame instead of writing a column into a filtered
# slice of `outcomes`, which avoids SettingWithCopyWarning and keeps the cell
# re-runnable.
outcomes_subselection = outcomes_subselection.assign(Remaining=remaining)

In [None]:
# Write the selected labs and the matching outcomes to CSV.
for frame, path in (
    (labs_subselection, 'data/eicu/labs_first_day_subselection.csv'),
    (outcomes_subselection, 'data/eicu/outcomes_first_day_subselection.csv'),
):
    frame.to_csv(path)