In [1]:
import numpy as np
import pandas as pd
import csv
import os
from SepsisCheck import sepsischeck_utilities_for_pkl as su
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report as report
from sklearn.metrics import roc_auc_score as auroc
from sklearn.metrics import confusion_matrix

### Take a look at the results of sepsis check on patients

In [2]:
def convert_result_to_df(filename):
    """Parse a SepsisCheck result file into a DataFrame of strings.

    Every non-blank line is split on whitespace, all commas are removed from
    the resulting tokens, and the tokens at positions 2, 4, ..., 18 are used
    as the values of the nine output columns (in the order of ``fields``).

    Parameters
    ----------
    filename : str or bytes
        Path of the result file; passed straight to ``open``.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line with the columns ``Subject ID``,
        ``Hadm_ID``, ``ts_ind``, ``Sepsis``, ``t_sepsis``, ``t_sofa``,
        ``t_cultures``, ``t_IV`` and ``t_sus``. Values are left as strings
        (e.g. ``"True"``/``"False"``); only commas are removed, so any other
        punctuation attached to a token (such as a closing brace on the last
        one) is kept. An empty file gives an empty frame that still has
        these columns.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than 19 whitespace-separated tokens.
    """
    fields = ("Subject ID", "Hadm_ID", "ts_ind", "Sepsis", "t_sepsis",
              "t_sofa", "t_cultures", "t_IV", "t_sus")
    records = []
    with open(filename, 'r') as f:
        for line in f:
            # Removing every comma also covers leading/trailing ones.
            tokens = [tok.replace(',', '') for tok in line.split()]
            if not tokens:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            # The k-th field's value sits at token 2, 4, 6, ... (2*k + 2).
            records.append({name: tokens[2 * k + 2] for k, name in enumerate(fields)})

    # Passing the columns explicitly keeps the schema stable even with no rows.
    return pd.DataFrame(records, columns=list(fields))

In [3]:
# Load the preprocessed fine-tuning data that was classified.
path = "../data/patient/mimic_iii_preprocessed_finetuning2.pkl"
data = pd.read_pickle(path)

# The result files are ordered by ts_ind, so bring the patient table into the same order.
data1 = data[1].sort_values(by=["ts_ind"])

# Labels used for scoring. The index is rebuilt after sorting so that
# index 0 lines up with ts_ind 0 (as the original note states).
ground_truth = data1["in_hospital_sepsis"].reset_index(drop=True)

# Indices of the positive / negative patients that have both the IV and the
# cultures feature. They are read from file because that is faster than
# recomputing them.
with open("./features/possible_predicitons/possible_pos_predictions.csv") as f:
    possible_pos_predictions = [int(row[0]) for row in csv.reader(f)]
with open("./features/possible_predicitons/possible_neg_predictions.csv") as f:
    possible_neg_predictions = [int(row[0]) for row in csv.reader(f)]

# Indices of all patients with IV and cultures (positives first, then negatives).
all_possible_ = [*possible_pos_predictions, *possible_neg_predictions]

# Labels restricted to the patients that have IV and cultures.
adj_ground_truth = ground_truth.loc[all_possible_]

# Empty frame describing the layout of the results report.
col = [
    "experiment",
    "AUROC", "AUROC_adj",
    "precision_raw", "precision_adj",
    "recall_raw", "recall_adj",
    "f1_raw", "f1_adj",
    "support", "support_adj",
    "cm", "cm_adj",
]
df = pd.DataFrame(columns=col)

In [4]:
def compute_results(path):
    """Score one SepsisCheck result file against the ground-truth labels.

    Uses the notebook-level ``ground_truth``, ``adj_ground_truth`` and
    ``all_possible_`` objects, so the data-loading cell must have been run.

    Predictions are matched to labels by ``ts_ind``: row ``i`` of the
    prediction frame is 0/1 when ``i`` occurs among the ``ts_ind`` values
    reported as ``"False"``/``"True"`` in the result file.

    Parameters
    ----------
    path : str or bytes
        Result file, parsed with ``convert_result_to_df``.

    Returns
    -------
    tuple
        ``(auroc, auroc_adj, precision, recall, f1, support, precision_adj,
        recall_adj, f1_adj, support_adj, cm, cm_adj)``. The ``*_adj`` values
        are computed only on the patients listed in ``all_possible_``.
        ``cm``/``cm_adj`` are the flattened 2x2 confusion matrices in the
        order tn, fp, fn, tp. Because ``average="weighted"`` is used,
        scikit-learn returns ``None`` for ``support``/``support_adj``.
        AUROC is computed from the hard 0/1 predictions, not from scores.

    Raises
    ------
    ValueError
        If some row has neither a "True" nor a "False" prediction.
    """
    result_df = convert_result_to_df(path)

    # ts_ind values of the patients predicted non-septic / septic.
    neg_ts_inds = list(map(int, result_df.loc[result_df["Sepsis"] == "False", "ts_ind"]))
    pos_ts_inds = list(map(int, result_df.loc[result_df["Sepsis"] == "True", "ts_ind"]))

    # Prediction frame whose index position stands for the ts_ind.
    predicted = pd.DataFrame(index=result_df.index, columns=["pred"])
    predicted.loc[predicted.index.isin(neg_ts_inds), "pred"] = 0
    predicted.loc[predicted.index.isin(pos_ts_inds), "pred"] = 1

    # Fail with a readable message instead of an obscure NaN-to-int cast error.
    unlabeled = predicted["pred"].isna()
    if unlabeled.any():
        raise ValueError(
            "%r: no True/False prediction for ts_ind %s"
            % (path, list(predicted.index[unlabeled][:10]))
        )

    # Predictions for the patients that have IV and cultures.
    adj_predicted = predicted.loc[all_possible_]

    # Convert once instead of on every metric call.
    y_true = ground_truth.values.astype(int)
    y_pred = predicted["pred"].values.astype(int)
    y_true_adj = adj_ground_truth.values.astype(int)
    y_pred_adj = adj_predicted["pred"].values.astype(int)

    # precision, recall, f1 (fbeta=1.0) on all patients
    precision, recall, f1_score, support = score(y_true=y_true, y_pred=y_pred, average="weighted")
    auroc_score = auroc(y_true=y_true, y_score=y_pred, average="weighted")
    # labels=[0, 1] guarantees a 2x2 matrix, so ravel() always yields tn, fp, fn, tp.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # the same metrics on the patients that include IV and cultures
    precision_adj, recall_adj, f1_score_adj, support_adj = score(y_true=y_true_adj, y_pred=y_pred_adj, average="weighted")
    auroc_score_adj = auroc(y_true=y_true_adj, y_score=y_pred_adj, average="weighted")
    cm_adj = confusion_matrix(y_true_adj, y_pred_adj, labels=[0, 1]).ravel()

    return auroc_score, auroc_score_adj, precision, recall, f1_score, support, precision_adj, recall_adj, f1_score_adj, support_adj, cm, cm_adj

Running the next cell opens every result file in `results_path` and *appends* the computed scores to `F1_report.csv`. Running it more than once therefore adds duplicate rows to the file, so skip this cell if `F1_report.csv` already exists.

In [None]:
results_path = "./results/on_finetune_data/"
report_name = "F1_report.csv"
# NOTE: listing a bytes path makes os.listdir return bytes names, so the
# "experiment" column is written as e.g. b'2sepsis-3_48-24_24-72.txt'.
# This is kept so that new rows look like rows written by earlier runs.
directory = os.fsencode(results_path)

report_rows = []
for file in sorted(os.listdir(directory)):
    file_path = os.path.join(directory, file)
    # The report is stored next to the result files: never parse it as a
    # result, and ignore sub-directories.
    if os.fsdecode(file) == report_name or not os.path.isfile(file_path):
        continue
    (auroc_score, auroc_score_adj, precision, recall, f1_score, support,
     precision_adj, recall_adj, f1_score_adj, support_adj, cm, cm_adj) = compute_results(file_path)
    report_rows.append({"experiment": file, "AUROC": auroc_score, "AUROC_adj": auroc_score_adj,
                        "precision_raw": precision, "precision_adj": precision_adj,
                        "recall_raw": recall, "recall_adj": recall_adj,
                        "f1_raw": f1_score, "f1_adj": f1_score_adj,
                        "support": support, "support_adj": support_adj,
                        "cm": cm, "cm_adj": cm_adj})

# DataFrame.append no longer exists in pandas >= 2.0, so build the frame from
# the collected records and append all rows to the report in one write.
if report_rows:
    pd.DataFrame(report_rows, columns=col).to_csv(
        os.path.join(results_path, report_name), mode='a', index=False, header=False
    )

In [2]:
# The report was written without a header row, so the column names are supplied here.
col = [
    "experiment",
    "AUROC", "AUROC_adj",
    "precision_raw", "precision_adj",
    "recall_raw", "recall_adj",
    "f1_raw", "f1_adj",
    "support", "support_adj",
    "cm", "cm_adj",
]
results = pd.read_csv(
    "./results/on_finetune_data/F1_report.csv",
    header=None,
    names=col,
)

- 1: Reyna et al. (72 consecutive hours of antibiotics)
- 2: Sepsis-3 (no consecutive hours needed)
- 3: grouped (find all possible pairs of IV, BC for suspicion and pairs of suspicion and sofa)
- 4: catchsus (keep looking at IV and BC until we find a pair that's suspicious)

* f1_raw is on all patients
* f1_adj is only on patients that have both Antibiotics and Blood Culture features (the patients that can be expected to be predicted correctly as it is impossible for the rule based SepsisCheck to output a positive sepsis label for a patient that is missing either one of those features.)

Even though grouped and catchsus outperform the standard strategies (Sepsis-3 and Reyna et al.) in f1_raw, the standard strategies perform better on the f1_adj patients. The grouped and catchsus strategies therefore seem to score higher because of the data distribution rather than because they are better strategies.


In [3]:
# Experiments ranked by F1 over all patients, best first.
x = (
    results
    .loc[:, ["experiment", "f1_raw", "f1_adj", "precision_adj", "recall_adj"]]
    .sort_values(by="f1_raw", ascending=False)
)
x

Unnamed: 0,experiment,f1_raw,f1_adj,precision_adj,recall_adj
8,b'3sepsis-3_24-12_240-240.txt',0.696414,0.499464,0.591482,0.633428
9,b'3sepsis-3_48-24_168-168.txt',0.696101,0.500071,0.574852,0.632479
10,b'3sepsis-3_48-24_24-72.txt',0.694069,0.520099,0.572735,0.629946
13,b'4sepsis-3_48-24_168-168.txt',0.685881,0.584687,0.592239,0.626148
5,b'2sepsis-3_48-24_168-168.txt',0.684232,0.584406,0.588605,0.622032
4,b'2sepsis-3_24-12_240-240.txt',0.68208,0.602861,0.601708,0.625831
12,b'4sepsis-3_24-12_240-240.txt',0.682013,0.599358,0.598482,0.623932
11,b'3sepsis-3_6-3_1-3.txt',0.680844,0.578666,0.5782,0.611586
1,b'1_reyna_ffill_reyna_48-24_168-168.txt',0.677127,0.588511,0.58437,0.60842
0,b'1_reyna_ffill_reyna_24-12_240-240.txt',0.674993,0.602054,0.597679,0.612219


Sort by f1_adj: Standard strategies perform better.

In [4]:
# Experiments ranked by F1 on the adjusted patient subset, best first.
# The cm / cm_adj columns list the confusion-matrix counts as tn, fp, fn, tp.
y = (
    results
    .loc[:, ["experiment", "f1_raw", "f1_adj", "cm", "cm_adj"]]
    .sort_values(by="f1_adj", ascending=False)
)
y

Unnamed: 0,experiment,f1_raw,f1_adj,cm,cm_adj
4,b'2sepsis-3_24-12_240-240.txt',0.68208,0.602861,[9758 797 3673 1615],[ 362 797 385 1615]
0,b'1_reyna_ffill_reyna_24-12_240-240.txt',0.674993,0.602054,[9833 722 3791 1497],[ 437 722 503 1497]
12,b'4sepsis-3_24-12_240-240.txt',0.682013,0.599358,[9746 809 3667 1621],[ 350 809 379 1621]
1,b'1_reyna_ffill_reyna_48-24_168-168.txt',0.677127,0.588511,[9756 799 3726 1562],[ 360 799 438 1562]
13,b'4sepsis-3_48-24_168-168.txt',0.685881,0.584687,[9662 893 3576 1712],[ 266 893 288 1712]
5,b'2sepsis-3_48-24_168-168.txt',0.684232,0.584406,[9675 880 3602 1686],[ 279 880 314 1686]
14,b'4sepsis-3_48-24_24-72.txt',0.671021,0.581566,[9795 760 3813 1475],[ 399 760 525 1475]
11,b'3sepsis-3_6-3_1-3.txt',0.680844,0.578666,[9686 869 3646 1642],[ 290 869 358 1642]
6,b'2sepsis-3_48-24_24-72.txt',0.656263,0.556288,[9866 689 4004 1284],[ 470 689 716 1284]
2,b'1_reyna_ffill_reyna_48-24_24-72.txt',0.648939,0.550171,[9940 615 4114 1174],[ 544 615 826 1174]


## Look at the false negatives and find the reason

In [21]:
# Collect the false negatives of one result file, restricted to the patients
# that could have been predicted positive (they have IV and cultures).
f = convert_result_to_df("./results/on_finetune_data/2sepsis-3_48-24_24-72.txt")
# Drop the "}" that is still attached to the last field of every line.
# regex=False states that "}" is a literal and avoids the pandas warning
# about the changing default of `regex`.
f['t_sus'] = f['t_sus'].str.replace('}', '', regex=False)

# Windows handed to fail_ below; presumably they mirror the "48-24_24-72"
# suffix of the file name -- adjust them when analysing another file.
sep_window = [48, 24]
sus_window = [24, 72]

# ts_ind values of the patients that were predicted non-septic
noSeps = f.loc[f["Sepsis"] == "False"]
neg_hadm_IDs = list(map(int, noSeps["ts_ind"]))

# Possible positives that were predicted negative.
adjusted_false_negatives = set(possible_pos_predictions).intersection(neg_hadm_IDs)
# ts_ind is stored as text in `f`; a list (not a one-shot map iterator) can be reused.
neg = [str(ts_ind) for ts_ind in adjusted_false_negatives]
fails = f.loc[f["ts_ind"].isin(neg)].reset_index()

# Distinct IV / culture times that occur among the false negatives.
IV = list(fails["t_IV"].unique())
cult = list(fails["t_cultures"].unique())

  f['t_sus'] = f['t_sus'].str.replace(r'}', '')


In [31]:
"""check why false negative happened: All patients here have antibiotics and blood cultures feature. 
It's either no sofa, no suspicion / too far apart -> fails sepsischeck, or IV / cultures too far apart -> fails suspicion check
default_to_fail must be 1.0, else something is wrong!"""
def fail_(t_sofa, t_cultures, t_IV, t_sus, sus_window, sep_window):
    """Name the reasons why a patient was not labelled septic.

    A time argument counts as missing when it is the string ``"False"`` (the
    form found in parsed result files) or a real boolean ``False``.

    Parameters
    ----------
    t_sofa, t_cultures, t_IV, t_sus : str, number or bool
        Times of SOFA, blood cultures, IV antibiotics and suspicion of
        infection; anything ``float()`` accepts, or the missing marker.
    sus_window : sequence of two numbers
        ``sus_window[0]`` is the largest allowed gap from IV to a later
        culture, ``sus_window[1]`` the largest allowed gap from a culture to
        a later IV.
    sep_window : sequence of two numbers
        ``sep_window[0]`` is the largest allowed amount by which ``t_sus``
        may follow ``t_sofa``, ``sep_window[1]`` the largest allowed amount
        by which ``t_sofa`` may follow ``t_sus``.

    Returns
    -------
    tuple of bool
        ``(no_sofa, no_sus, sus_too_late, sofa_too_late, cultures_too_late,
        iv_too_late, default_to_fail)``. ``default_to_fail`` is True when at
        least one of the other flags is set; it should be True for every
        false negative, otherwise a failure reason is unaccounted for.
    """
    sofa_missing = str(t_sofa) == "False"
    sus_missing = str(t_sus) == "False"
    iv_missing = str(t_IV) == "False"
    cultures_missing = str(t_cultures) == "False"

    # No SOFA time means negative; no suspicion time means negative.
    no_sofa = sofa_missing
    no_sus = sus_missing
    sus_too_late = False
    sofa_too_late = False
    cultures_too_late = False
    iv_too_late = False

    if iv_missing:
        # A missing IV time has always been counted under iv_too_late;
        # kept so that previously reported ratios stay comparable.
        iv_too_late = True
        no_sus = True
    if cultures_missing:
        # Without a culture time there can be no suspicion of infection.
        no_sus = True

    # The gap checks need both times; float("False") would raise.
    if not iv_missing and not cultures_missing:
        iv_time = float(t_IV)
        culture_time = float(t_cultures)
        # Culture too late after IV means no t_sus means negative.
        if iv_time < culture_time and culture_time - iv_time > sus_window[0]:
            cultures_too_late = True
            no_sus = True
        # IV too late after culture means no t_sus means negative.
        if culture_time < iv_time and iv_time - culture_time > sus_window[1]:
            iv_too_late = True
            no_sus = True

    # Relative timing of suspicion and SOFA only matters when both exist.
    if not no_sus and not no_sofa:
        sus_time = float(t_sus)
        sofa_time = float(t_sofa)
        # t_sus too late means negative.
        if sus_time - sofa_time > sep_window[0]:
            sus_too_late = True
        # t_sofa too late means negative.
        if sofa_time - sus_time > sep_window[1]:
            sofa_too_late = True

    # Used to check that the reasons above account for every failure.
    default_to_fail = any((no_sofa, no_sus, sus_too_late, sofa_too_late,
                           cultures_too_late, iv_too_late))

    return no_sofa, no_sus, sus_too_late, sofa_too_late, cultures_too_late, iv_too_late, default_to_fail

In [32]:
# One row of failure flags per false negative, in the same order as `fails`
# (whose index was reset to 0..n-1).
# The frame is built from the collected records in one go: the previous
# chained assignment `df_FN[col][i] = value` triggers pandas' chained-assignment
# warnings and does not write through under Copy-on-Write.
fn_columns = ["no_sofa", "no_sus", "sus_too_late", "sofa_too_late", "cultures_too_late", "iv_too_late", "default_to_fail"]
df_FN = pd.DataFrame(
    [
        fail_(t_sofa, t_cultures, t_IV, t_sus, sus_window, sep_window)
        for t_sofa, t_cultures, t_IV, t_sus in zip(
            fails["t_sofa"], fails["t_cultures"], fails["t_IV"], fails["t_sus"]
        )
    ],
    columns=fn_columns,
)

In [33]:
# Sanity check: no false negative may be left without a failure reason,
# so both frames printed below are expected to be empty.
unexplained = df_FN["default_to_fail"] == False
print(df_FN.loc[unexplained])
print(fails.loc[unexplained])

Empty DataFrame
Columns: [no_sofa, no_sus, sus_too_late, sofa_too_late, cultures_too_late, iv_too_late, default_to_fail]
Index: []
Empty DataFrame
Columns: [index, Subject ID, Hadm_ID, ts_ind, Sepsis, t_sepsis, t_sofa, t_cultures, t_IV, t_sus]
Index: []


As we can see, for ~81 % of patients that had necessary features and should have been predicted as septic but weren't, missing suspected infection was the reason. This led me to experiment with "catchsus" and "grouped" strategies. In the end I decided to still use the standard sepsis-3 implementation for further experiments, as that enables us to get better comparisons further down the line, and the other strategies did not significantly improve results (see results tables above).

In [34]:
# Share of false negatives that carry each failure flag.
n_false_negatives = len(df_FN)


def _true_share(flag):
    """Fraction of rows in df_FN whose `flag` column is True."""
    return list(df_FN[flag]).count(True) / n_false_negatives


no_sofa_ratio = _true_share("no_sofa")
no_sus_ratio = _true_share("no_sus")
sus_too_late_ratio = _true_share("sus_too_late")
sofa_too_late_ratio = _true_share("sofa_too_late")
cultures_too_late_ratio = _true_share("cultures_too_late")
iv_too_late_ratio = _true_share("iv_too_late")
default_to_fail_ratio = _true_share("default_to_fail")

print(
    "No SOFA ratio:", no_sofa_ratio,
    "\nNo Suspected Infection ratio:", no_sus_ratio,
    "\nCultures too late ratio:", cultures_too_late_ratio,
    "\nIV too late ratio:", iv_too_late_ratio,
    "\nSuspected Infection too late ratio:", sus_too_late_ratio,
    "\nSOFA too late ratio:", sofa_too_late_ratio,
    "\ndefaults to negative ratio:", default_to_fail_ratio,
)

No SOFA ratio: 0.01675977653631285 
No Suspected Infection ratio: 0.8114525139664804 
Cultures too late ratio: 0.4692737430167598 
IV too late ratio: 0.34217877094972066 
Suspected Infection too late ratio: 0.15502793296089384 
SOFA too late ratio: 0.018156424581005588 
defaults to negative ratio: 1.0
