In [7]:
import pandas as pd

### Take a look at the results of sepsis check on patients

In [8]:
def filter_data (data, str_column, value):
    """given data, string of column and string of value, return masked data that meets the requirements"""
    m = data[str_column] == value
    t = data[m]
    return t

In [9]:
def convert_result_to_df(filename):
    result_dict = []
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            line_list = line.split()
            line_list = [s.strip(',') for s in line_list]
            line_list = [s.replace(',', '') for s in line_list]
            result_dict.append(dict({'Subject ID': line_list[2], 
                                        'Hadm_ID': line_list[4],
                                        'ts_ind': line_list[6],
                                        'Sepsis': line_list[8], 
                                        't_sofa': line_list[10], 
                                        't_cultures': line_list[12], 
                                        't_IV': line_list[14], 
                                        't_sus': line_list[16]}))
    df = pd.DataFrame.from_records(result_dict)

    return df

In [10]:
path = "./data/patient/mimic_iii_preprocessed_finetuning2.pkl"
output = "results.txt"
data = pd.read_pickle(path)

### Accuracy of predicted and actual septic patients: 12 - 20 % depending on ffill
However, more than half of the actual septic patients do not contain features for Blood Cultures or Antibiotics, thus yielding a negative result by default.

In [11]:
df = convert_result_to_df('results/withts_ind/results_ffill.txt')
m = df["Sepsis"] == 'True'
pos_hadm_IDs = list(map(int, df[m]["ts_ind"]))
df2 = convert_result_to_df("results/withts_ind/results_no_ffill.txt") ## all ts_indexes with sepsis True
m2 = df2["Sepsis"] == "True"
pos_hadm_IDs2 = list(map(int, df2[m2]["ts_ind"]))
h = data[1]["in_hospital_sepsis"] == True
sepsis_pos = list(map(int, data[1][h]["ts_ind"])) # all ts_indexes with sepsis True in full data
print("Accuracy of ffill on Antibiotics:", len(set(pos_hadm_IDs).intersection(sepsis_pos)) / len(sepsis_pos))
print("Accuracy without ffill on Antibiotics:", len(set(pos_hadm_IDs2).intersection(sepsis_pos)) / len(sepsis_pos))


dif1 = set(sepsis_pos).difference(pos_hadm_IDs) # hadm_ids that are in the positive list of the full data set but not detected by the sepsis check with ffill on antibiotics
dif2 = set(sepsis_pos).difference(pos_hadm_IDs2)
l = list(dif1)

#store variables of the false negatives in list and check if Antibiotics and Blood Cultures are in -> i guess those are the most common reasons for a negative result
are_in = []
arent_in_indexes = []
for i, x in enumerate(l):
    mask = data[0]["ts_ind"] == l[i]
    d = data[0][mask]
    l_of_variables = d["variable"].unique()
    if "Blood Culture" and "Antibiotics" in l_of_variables:
        are_in.append("True")
    else:
        are_in.append("False")
        arent_in_indexes.append(x)

print(are_in.count("False"), len(arent_in_indexes))

Accuracy of ffill on Antibiotics: 0.20366868381240544
Accuracy without ffill on Antibiotics: 0.12708018154311648
2830 2830


### Adjusted accuracy with impossible cases removed: ~43 %
#### -> Get septic patients that have all relevant features but still where predicted as negative


In [12]:
have_all_features = [x for x in sepsis_pos if x not in arent_in_indexes]
print(len(arent_in_indexes), "Septic patients that do not have relevant features:", (len(arent_in_indexes) / len(sepsis_pos)))
print("Accuracy with impossible cases removed (no Blood Cultures or Antibiotics features):", len(set(pos_hadm_IDs).intersection(have_all_features)) / len(have_all_features))
false_negatives = set(have_all_features).difference(pos_hadm_IDs)
print("Number of false negatives, even though all features were present:", len(false_negatives))

2830 Septic patients that do not have relevant features: 0.5351739788199698
Accuracy with impossible cases removed (no Blood Cultures or Antibiotics features): 0.43816110659072416
Number of false negatives, even though all features were present: 1381


### TODO: analyze those patients and try to find the reasons for wrong predictions