In [None]:
# NOTES:
# Why not use mixture of experts?

In [1]:
import os
import sys
import random
import pandas as pd
import numpy as np
from scipy.linalg import toeplitz
from copy import copy
import matplotlib.pyplot as plt
%matplotlib inline

# Geniuses that worked on hypertools did not update certain package and thus it produces warnings (they break jupyter lab)
import warnings
warnings.filterwarnings("ignore")

# Comment out if you don't want to see all of the values being printed (i.e. default)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

current_dir = os.getcwd()
# utils_path = os.path.join(current_dir, '..', 'utils')
utils_path = os.path.join(current_dir, '../')
utils_abs_path = os.path.abspath(utils_path)
if utils_abs_path not in sys.path:
    sys.path.append(utils_abs_path)

import utils.get_data as get_data
# from impute_methods import *
from utils.impute_methods import impute_linear_interpolation

DATA_PATH = get_data.get_dataset_abspath()

training_setA_path = DATA_PATH + 'training_setA'
training_setB_path = DATA_PATH + 'training_setB'

In [2]:
def plot_heart_rate_data(df):
    plt.figure(figsize=(10, 6))
    dataset['HR'].hist(bins=50)
    plt.title('Distribution of Heart Rate')
    plt.xlabel('Heart Rate')
    plt.ylabel('Frequency')
    plt.show()
    
    # You can also get a quick statistical summary
    print(dataset['HR'].describe())
    

In [3]:
# Loads the dataset

# Sepsis related test values / variables / columns
sep_col = ['BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST',
             'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine',
             'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
             'Bilirubin_total', 'Hct', 'Hgb', 'PTT', 'WBC', 'Platelets',
             'Bilirubin_direct', 'Fibrinogen']

# Continues Health Indicators
con_col = ['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2']

# The original way of getting data shouldn't work as there isn't a concept of individual patient file in it
# It just gets the data completely into a dataframe and each of the time data is one row
# dataset = get_data.get_dataset_as_df()

dataset, patient_id_map = get_data.get_dataset()

   20337
   40337
Dataset loaded into a MultiIndex DataFrame.


In [11]:
# for patient_id, file_name in patient_id_map.items():
#     print(type(dataset.loc[patient_id]))
#     break

Processing data for patient ID: 1.0, File: p017721.psv


In [23]:
columns_to_linearly_interpolate = [
    'HR', 'O2Sat', 'SBP', 'MAP', 'DBP', 'Resp'
]
columns_to_ffill = [
    'Temp', 'Glucose', 'Potassium', 'Calcium', 
    'Magnesium', 'Chloride', 'Hct', 'Hgb', 'WBC', 'Platelets'
]
columns_to_drop = [
    'SepsisLabel', 'TroponinI'
]

In [28]:
def feature_missing_information(patient_data, columns):
    # temp_data holds the information from the patient file as well as the features that will be calculated
    temp_data = np.array(patient_data)

    # Calculate 3 features for each column, 2 respective of the frequency of NaN values and 1 respective of the change in recorded values
    for column in columns:
        data = np.array(patient_data[column])
        nan_pos = np.where(~np.isnan(data))[0]
        
        # Measurement frequency sequence
        interval_f1 = data.copy()
        # Measurement time interval
        interval_f2 = data.copy()

        # If all the values are NaN
        if (len(nan_pos) == 0):
            interval_f1[:] = 0
            temp_data = np.column_stack((temp_data, interval_f1))
            interval_f2[:] = -1
            temp_data = np.column_stack((temp_data, interval_f2))
        else :
            # Puts number of measurements into temp_data
            interval_f1[: nan_pos[0]] = 0
            for p in range(len(nan_pos)-1):
                interval_f1[nan_pos[p]: nan_pos[p+1]] = p + 1
            interval_f1[nan_pos[-1] :] = len(nan_pos)
            temp_data = np.column_stack((temp_data, interval_f1))

            # Puts the frequency of measurements into temp_data
            interval_f2[:nan_pos[0]] = -1
            for q in range(len(nan_pos) - 1):
                length = nan_pos[q+1] - nan_pos[q]
                for l in range(length):
                    interval_f2[nan_pos[q] + l] = l

            length = len(patient_data) - nan_pos[-1]
            for l in range(length):
                interval_f2[nan_pos[-1] + l] = l
            temp_data = np.column_stack((temp_data, interval_f2))

        # Differential features
        # These capture the change in values that have been recorded (quite simply as well but it should be just fine)
        diff_f = data.copy()
        diff_f = diff_f.astype(float)
        if len(nan_pos) <= 1:
            diff_f[:] = np.NaN
            temp_data = np.column_stack((temp_data, diff_f))
        else:
            diff_f[:nan_pos[1]] = np.NaN
            for p in range(1, len(nan_pos)-1):
                diff_f[nan_pos[p] : nan_pos[p+1]] = data[nan_pos[p]] - data[nan_pos[p-1]]
            diff_f[nan_pos[-1]:] = data[nan_pos[-1]] - data[nan_pos[-2]]
            temp_data = np.column_stack((temp_data, diff_f))
    
    return temp_data

In [37]:
def feature_slide_window(patient_data, columns):
    
    window_size = 6
    features = {}
    
    for column in columns:
        series = patient_data[column]

        features[f'{column}_max'] = series.rolling(window=window_size, min_periods=1).max()
        features[f'{column}_min'] = series.rolling(window=window_size, min_periods=1).min()
        features[f'{column}_mean'] = series.rolling(window=window_size, min_periods=1).mean()
        features[f'{column}_median'] = series.rolling(window=window_size, min_periods=1).median()
        features[f'{column}_std'] = series.rolling(window=window_size, min_periods=1).std()
        
        # For calculating std dev of differences, use diff() then apply rolling std
        diff_std = series.diff().rolling(window=window_size, min_periods=1).std()
        features[f'{column}_diff_std'] = diff_std

    # Convert the dictionary of features into a DataFrame
    features_df = pd.DataFrame(features)
    
    return features_df

In [45]:
def features_score(patient_data):
    """
    Gives score assocciated with the patient data according to the scoring systems of NEWS, SOFA and qSOFA
    """
    
    scores = np.zeros((len(patient_data), 8))
    
    for ii in range(len(patient_data)):
        HR = patient_data[ii, 0]
        if HR == np.nan:
            HR_score = np.nan
        elif (HR <= 40) | (HR >= 131):
            HR_score = 3
        elif 111 <= HR <= 130:
            HR_score = 2
        elif (41 <= HR <= 50) | (91 <= HR <= 110):
            HR_score = 1
        else:
            HR_score = 0
        scores[ii, 0] = HR_score

        Temp = patient_data[ii, 2]
        if Temp == np.nan:
            Temp_score = np.nan
        elif Temp <= 35:
            Temp_score = 3
        elif Temp >= 39.1:
            Temp_score = 2
        elif (35.1 <= Temp <= 36.0) | (38.1 <= Temp <= 39.0):
            Temp_score = 1
        else:
            Temp_score = 0
        scores[ii, 1] = Temp_score

        Resp = patient_data[ii, 6]
        if Resp == np.nan:
            Resp_score = np.nan
        elif (Resp < 8) | (Resp > 25):
            Resp_score = 3
        elif 21 <= Resp <= 24:
            Resp_score = 2
        elif 9 <= Resp <= 11:
            Resp_score = 1
        else:
            Resp_score = 0
        scores[ii, 2] = Resp_score

        Creatinine = patient_data[ii, 19]
        if Creatinine == np.nan:
            Creatinine_score = np.nan
        elif Creatinine < 1.2:
            Creatinine_score = 0
        elif Creatinine < 2:
            Creatinine_score = 1
        elif Creatinine < 3.5:
            Creatinine_score = 2
        else:
            Creatinine_score = 3
        scores[ii, 3] = Creatinine_score

        MAP = patient_data[ii, 4]
        if MAP == np.nan:
            MAP_score = np.nan
        elif MAP >= 70:
            MAP_score = 0
        else:
            MAP_score = 1
        scores[ii, 4] = MAP_score

        SBP = patient_data[ii, 3]
        Resp = patient_data[ii, 6]
        if SBP + Resp == np.nan:
            qsofa = np.nan
        elif (SBP <= 100) & (Resp >= 22):
            qsofa = 1
        else:
            qsofa = 0
        scores[ii, 5] = qsofa

        Platelets = patient_data[ii, 30]
        if Platelets == np.nan:
            Platelets_score = np.nan
        elif Platelets <= 50:
            Platelets_score = 3
        elif Platelets <= 100:
            Platelets_score = 2
        elif Platelets <= 150:
            Platelets_score = 1
        else:
            Platelets_score = 0
        scores[ii, 6] = Platelets_score

        Bilirubin = patient_data[ii, 25]
        if Bilirubin == np.nan:
            Bilirubin_score = np.nan
        elif Bilirubin < 1.2:
            Bilirubin_score = 0
        elif Bilirubin < 2:
            Bilirubin_score = 1
        elif Bilirubin < 6:
            Bilirubin_score = 2
        else:
            Bilirubin_score = 3
        scores[ii, 7] = Bilirubin_score
        
    return scores

In [46]:
def extract_features(patient_data):
    # Get the column with Sepsis Label as it is not the same for each row (check documentation)
    labels = np.array(patient_data['SepsisLabel'])
    patient_data = patient_data.drop(columns=columns_to_drop)

    # Gets information from the missing variables 
    # This can be useful as it shows the clinical judgment, the test has not been ordered 
    #                              (probably a good decision we should take into account)
    temp_data = feature_missing_information(patient_data, sep_col + con_col)
    temp = pd.DataFrame(temp_data)
    # To complete the data use forward-filling strategy
    temp = temp.fillna(method='ffill')
    # These are also the first set of features
    # In this configutation 99 (66 + 33 or 3 per column) features to be precise
    # They are also time indifferent
    print(temp)
    features_A = np.array(temp)
    print(features_A)
    # The team did not use DBP, not sure why, might investigate this
    # columns = ['HR', 'O2Sat', 'SBP', 'MAP', 'Resp', 'DBP']
    
    # six-hour slide window statistics of selected columns
    columns = ['HR', 'O2Sat', 'SBP', 'MAP', 'Resp']
    features_B = feature_slide_window(patient_data, columns)

    # Score features based according to NEWS, SOFA and qSOFA
    features_C = features_score(features_A)
    
    features = np.column_stack([features_A, features_B, features_C])
    
    return features, labels

In [47]:

frames_features = []
frames_labels = []

for patient_id in set(dataset.index.get_level_values(0)):
    patient_data = dataset.loc[patient_id]
    print(f"Processing data for patient ID: {patient_id}, File: {patient_id_map[patient_id]}", end='\r')

    features, labels = extract_features(patient_data)
    features = pd.DataFrame(features)
    labels = pd.DataFrame(labels)

    frames_features.append(features)
    frames_labels.append(labels)

dat_features = np.array(pd.concat(frames_features))
dat_labels = (np.array(pd.concat(frames_labels)))[:, 0]

# Randomly shuffle the data
index = [i for i in range(len(dat_labels))]
np.random.shuffle(index)
dat_features = dat_features[index]
dat_labels = dat_labels[index]

return dat_features, dat_labels

     0      1      2       3      4     5     6    7    8     9    ...    128  \
0   88.0  100.0  35.60  117.00  73.00  53.0   8.5  NaN -1.5   NaN  ...    NaN   
1   87.5  100.0  35.65  112.00  69.50  51.0  12.0  NaN -3.0   NaN  ...  -3.50   
2   88.0   99.0  36.20  103.00  65.00  49.0  14.0  NaN -3.0   NaN  ...  -4.50   
3   88.0   95.5  36.75   93.50  71.50  38.5  19.0  NaN  0.0   NaN  ...   6.50   
4   88.0   96.0  37.00  112.50  65.50  46.5  18.5  NaN  0.0   NaN  ...  -6.00   
5   88.0   99.0  37.05  123.00  73.00  51.5  15.0  NaN  0.0   NaN  ...   7.50   
6   88.0   99.0  36.95  103.00  58.50  40.5  12.0  NaN  0.0   NaN  ... -14.50   
7   88.0  100.0  37.00  133.00  78.00  52.0  16.0  NaN  0.0   NaN  ...  19.50   
8   88.0  100.0  36.90  100.00  61.00  44.0  12.0  NaN  0.0   NaN  ... -17.00   
9   88.0  100.0  36.90   96.00  58.00  42.0  12.0  NaN  0.0   NaN  ...  -3.00   
10  88.0   99.5  36.95   91.50  56.00  41.0  12.0  NaN  0.0   NaN  ...  -2.00   
11  88.0  100.0  36.80  112.

In [4]:
# DATA PREPROCESSING

# Imputes O2Sat using linear interpolation
# Other methods might be better based on the data distribution (consider Spline or Polynomial Interpolation)

# Impute SBP using linear interpolation
# We can consider Forward Fill or Backward Fill if we assume the blood pressure should remain relatively stable

# Impute MAP using linear interpolation
# To be more sophiscticated the data can be imputed with custom models to take into account SBP and DBP as there might be correlation

# Impute DBP using linear interpolation
# Same as SBP, we might consider Spline

# Impute Resp using linear interpolation
# Same as SBP and DBP, we might consider Spline or Polynomial Interpolation


'''
columns_to_linearly_interpolate = [
    'HR', 'O2Sat', 'SBP', 'MAP', 'DBP', 'Resp'
]
for column in columns_to_linearly_interpolate:
    dataset = impute_linear_interpolation(dataset, column)
    print('Finished imputing ' + column)

columns_to_ffill = [
    'Temp', 'Glucose', 'Potassium', 'Calcium', 
    'Magnesium', 'Chloride', 'Hct', 'Hgb', 'WBC', 'Platelets'
]
for column in columns_to_ffill:
    dataset[column].ffill(inplace=True)
'''

# Columns not imputed

# They dropped Bilirubin_direct, TroponinI, Fibrinogen
#            has relation, more complex if any, potentially





# dataset = dataset.fillna

# Use forward filling for some of the data

# Best solution used sliding window

Finished imputing HR
Finished imputing O2Sat
Finished imputing SBP
Finished imputing MAP
Finished imputing DBP
Finished imputing Resp


In [8]:
def feature_missing_information(patient_data, columns):
    temp = patiend_data
    
    return temp

In [9]:
for patient_id in set(dataset.index.get_level_values(0)):
    patient_data = multiindex_df.loc[patient_id]
    print(f"Processing data for patient ID: {patient_id}, File: {patient_id_map[patient_id]}")

HR
O2Sat
Temp
SBP
MAP
DBP
Resp
EtCO2
BaseExcess
HCO3
FiO2
pH
PaCO2
SaO2
AST
BUN
Alkalinephos
Calcium
Chloride
Creatinine
Bilirubin_direct
Glucose
Lactate
Magnesium
Phosphate
Potassium
Bilirubin_total
TroponinI
Hct
Hgb
PTT
WBC
Fibrinogen
Platelets
Age
Gender
Unit1
Unit2
HospAdmTime
ICULOS
SepsisLabel


In [7]:
# Function to ectract features

def extract_features(patient_data):
    # Get the column with Sepsis Label as it is not the same for each row (check documentation)
    labels = np.array(patient_data['SepsisLabel'])
    patient_data = patient_data.drop(columns=['SepsisLabel', 'TroponinI'])

    # Gets information from the missing variables 
    # This can be useful as it shows the clinical judgment, the test has not been ordered 
    #                              (probably a good decision we should take into account)
    temp_data = feature_missing_information(patient_data, sep_col + con_col)
    temp = pd.DataFrame(temp_data)
    # To complete the data use forward-filling strategy
    temp = temp.fillna(method='ffill')
    # These are also the first set of features
    # In this configutation 66 features to be precise
    # They are also time indifferent
    features_A = np.array(temp)
    

    # some code
    # Forward-Filling missing values
    data = data.fillna(method='ffill')
    
    return features, labels

In [4]:
# Run feature extraction for each patient

for c in dataset.columns:
    print(c)

AttributeError: 'function' object has no attribute 'columns'