# Import Data and Clean/Impute

In [2]:
import pandas as pd
import numpy as npb
from tqdm import tqdm
import pickle
import statsmodels.imputation.mice as smi
pd.set_option('display.max_columns', 500)

Read in files from train, test validation sets

In [438]:
import glob
import errno
path = 'ICU_Mini/*.txt'
mini_files = glob.glob(path)

In [445]:
import glob
import errno
path = 'ICU_Data/*.txt'
files = glob.glob(path)

In [451]:
path = 'Test_Data/*.txt'
test_files = glob.glob(path)

In [452]:
path = 'Validation_Data/*.txt'
val_files = glob.glob(path)

In [5]:
# Read in a mountain of text data!
def read_mountain_data(files):
    counts = pd.DataFrame()
    vital_means = pd.DataFrame()
    vital_std = pd.DataFrame()
    with tqdm(total=100) as pbar:
        for file in sorted(files):
            with open(file) as f:
                content = f.readlines()
            contents = [i.rstrip().split(',') for i in content]
            data = pd.DataFrame(contents[1:])
            data[2] = data[2].astype(float)
            counts = counts.append(data.drop(columns = 2).groupby(1).count().transpose(), sort=True)
            vital_means = vital_means.append(data.groupby(1).mean().transpose(), sort=True)
            pbar.update(.025)
        return [vital_means, counts]

In [446]:
# Read in standard deviations, and clean data in process
def read_standard_deviations(files):
    vital_std = pd.DataFrame()
    with tqdm(total=100) as pbar:
        for file in sorted(files):
            with open(file) as f:
                content = f.readlines()
            contents = [i.rstrip().split(',') for i in content]
            del contents[0]
            filtered = [x for x in contents if not (((x[1] == 'NIDiasABP') & (float(x[2]) < 10.))
                                        |((x[1] == 'NISysABP') & (float(x[2]) < 10.))
                                        |((x[1] == 'NIMAP') & (float(x[2]) < 10.)) 
                                       |((x[1] == 'Height') & (float(x[2]) < 90.))
                                       |((x[1] == 'Weight') & (float(x[2]) < 30.))
                                       |((x[1] == 'Gender') & (float(x[2]) < 0.)))]
            data = pd.DataFrame(filtered)
            data[2] = data[2].astype(float)
            vital_std = vital_std.append(data.groupby(1).std().transpose(), sort=True)
            pbar.update(.025)
        return vital_std

In [15]:
data = read_mountain_data(files)
train_data = data[0]
counts = data[2]

100.00000000000561it [01:50,  1.08it/s]                          


In [8]:
data = read_mountain_data(test_files)
test_means = data[0]

100.00000000000561it [02:09,  2.81s/it]                          


In [9]:
data = read_mountain_data(val_files)
val_means = data[0]

100.00000000000561it [01:53,  1.13s/it]                          


In [447]:
train_std = read_standard_deviations(files)

100.00000000000561it [00:41,  2.39it/s]                          


In [453]:
test_std = read_standard_deviations(test_files)
val_std = read_standard_deviations(val_files)

100.00000000000561it [00:40,  2.47it/s]                          
100.00000000000561it [00:52,  1.89it/s]                          


In [458]:
val_std = val_std.drop(columns = '');

In [3]:
pickle_in = open("all_feature_means.pickle","rb")
train_data = pickle.load(pickle_in)
pickle_in = open("test_data.pickle","rb")
test_data = pickle.load(pickle_in)
pickle_in = open("val_data.pickle","rb")
val_data = pickle.load(pickle_in)

More cleaning of impossible values: 0 Blood pressure (all patients were alive during the 48 hours, so it must simply mean a machine was disconnected).
Also, all patients were adults (at least 15 years old), so removing impossible weights/heights (under 80 cm and under 30 kg).

In [26]:
train_data.loc[(train_data.Height < 90),'Height'] = np.NaN
train_data.loc[(train_data.Gender < 0),'Gender'] = np.NaN
train_data.loc[(train_data.Weight < 30),'Weight'] = np.NaN
test_data.loc[(test_data.Height < 90),'Height'] = np.NaN
test_data.loc[(test_data.Gender < 0),'Gender'] = np.NaN
test_data.loc[(test_data.Weight < 30),'Weight'] = np.NaN
val_data.loc[(val_data.Height < 90),'Height'] = np.NaN
val_data.loc[(val_data.Gender < 0),'Gender'] = np.NaN
val_data.loc[(val_data.Weight < 30),'Weight'] = np.NaN

In [27]:
#Dropping MechVent, the training data was identical for all patients
train_data = train_data.drop(columns = 'MechVent')
test_data = test_data.drop(columns = 'MechVent')
val_data = val_data.drop(columns = 'MechVent')
val_data = val_data.drop(columns = '')

In [28]:
#Merging Invasive/Noninvasive BP's by keeping invasive if available, replacing with NI only when necessary
bp = ['DiasABP','SysABP','MAP']
NIbp = ['NIDiasABP','NISysABP','NIMAP']

def merge_BP(df):
    for index, col in enumerate(bp):
        df[col].fillna(df[NIbp[index]], inplace=True)
        del df[NIbp[index]]

In [29]:
merge_BP(train_data)
merge_BP(test_data)
merge_BP(val_data)
merge_BP(train_std)
merge_BP(test_std)
merge_BP(val_std)

In [30]:
#Removing impossibly low BP values while at it
train_data.loc[(train_data.DiasABP < 10),'DiasABP'] = np.NaN
train_data.loc[(train_data.SysABP < 10),'SysABP'] = np.NaN
train_data.loc[(train_data.MAP < 10),'MAP'] = np.NaN
test_data.loc[(test_data.DiasABP < 10),'DiasABP'] = np.NaN
test_data.loc[(test_data.SysABP < 10),'SysABP'] = np.NaN
test_data.loc[(test_data.MAP < 10),'MAP'] = np.NaN
val_data.loc[(val_data.DiasABP < 10),'DiasABP'] = np.NaN
val_data.loc[(val_data.SysABP < 10),'SysABP'] = np.NaN
val_data.loc[(val_data.MAP < 10),'MAP'] = np.NaN

In [31]:
#Read in outcomes and pretty them up for use
def find_outcomes(text):
    with open(text) as f:
        content = f.readlines()
        contents = [i.rstrip().split(',') for i in content[1:]]
    outcomes = [[i[0],i[5]] for i in contents]
    outcomes = pd.DataFrame(outcomes)
    outcomes = outcomes.rename(columns = {0:'RecordID',1:'Outcome'})
    outcomes = outcomes.astype(int)
    return outcomes

In [32]:
train_outcomes = find_outcomes('Outcomes.txt')
test_outcomes = find_outcomes('outcomes_test.txt')
val_outcomes = find_outcomes('outcomes_val.txt')

In [53]:
#Reducing Features down
categories = ['Age','BUN', 'Creatinine', 'DiasABP', 'FiO2', 'GCS', 'Glucose', 'HCO3',
       'HCT', 'HR', 'ICUType', 'K', 'MAP', 'Mg',
       'Na', 'PaCO2', 'PaO2', 'Platelets','Gender',
       'RecordID', 'SysABP', 'Temp', 'Urine', 'WBC', 'Weight', 'pH']
stdgories = ['BUN', 'Creatinine', 'DiasABP', 'FiO2', 'GCS','Glucose', 'HCO3', 'HCT', 'HR','K',
       'MAP','Mg', 'Na',
       'PaCO2', 'PaO2', 'Platelets', 'RecordID','SysABP',
       'Temp', 'Urine', 'WBC', 'pH']

In [54]:
for_impute = train_data[categories]
for_impute.RecordID = for_impute.RecordID.astype(int)
for_imptest = test_data[categories]
for_imptest.RecordID = for_imptest.RecordID.astype(int)
for_impval = val_data[categories]
for_impval.RecordID = for_impval.RecordID.astype(int)

In [48]:
pickle_out = open("for_impute","wb")
pickle.dump(for_impute, pickle_out)
pickle_out.close()
pickle_out = open("for_imptest","wb")
pickle.dump(for_imptest, pickle_out)
pickle_out.close()
pickle_out = open("for_impval","wb")
pickle.dump(for_impval, pickle_out)
pickle_out.close()

In [486]:
for_impstd = train_std[stdgories]
for_impstd['RecordID'] = for_impute['RecordID']
for_impstd.columns = for_impstd.columns.map(lambda x: str(x) + '_std')
for_impstd = for_impstd.rename(columns = {'RecordID_std':'RecordID'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [487]:
for_teststd = test_std[stdgories]
for_teststd['RecordID'] = for_imptest['RecordID']
for_teststd.columns = for_teststd.columns.map(lambda x: str(x) + '_std')
for_teststd = for_teststd.rename(columns = {'RecordID_std':'RecordID'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [488]:
for_valstd = val_std[stdgories]
for_valstd['RecordID'] = for_impval['RecordID']
for_valstd.columns = for_valstd.columns.map(lambda x: str(x) + '_std')
for_valstd = for_valstd.rename(columns = {'RecordID_std':'RecordID'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Impute data with means

In [55]:
regtrain = pd.merge(for_impute.fillna(for_impute.mean()),train_outcomes, on='RecordID')
regtest = pd.merge(for_imptest.fillna(for_imptest.mean()),test_outcomes, on='RecordID')
regval = pd.merge(for_impval.fillna(for_impval.mean()),val_outcomes, on='RecordID')

In [56]:
pickle_out = open("regtrain.pickle","wb")
pickle.dump(regtrain, pickle_out)
pickle_out.close()
pickle_out = open("regtest.pickle","wb")
pickle.dump(regtest, pickle_out)
pickle_out.close()
pickle_out = open("regval.pickle","wb")
pickle.dump(regval, pickle_out)
pickle_out.close()

### Use MICE to impute missing data

In [19]:
pickle_in = open("for_impute","rb")
for_impute = pickle.load(pickle_in)
pickle_in = open("for_imptest","rb")
for_imptest = pickle.load(pickle_in)
pickle_in = open("for_impval","rb")
for_impval = pickle.load(pickle_in)

In [49]:
#Use MICE to impute date
def impute(data, outcomes):
    imp = smi.MICEData(pd.merge(data, outcomes, on='RecordID'))
    imp.update_all(20)
    impute = imp.data
    return impute

In [50]:
imputed = impute(for_impute, train_outcomes)

In [51]:
imputed_test = impute(for_imptest, test_outcomes)
imputed_val = impute(for_impval, val_outcomes)

In [52]:
pickle_out = open("imputed.pickle","wb")
pickle.dump(imputed, pickle_out)
pickle_out.close()
pickle_out = open("imputed_test.pickle","wb")
pickle.dump(imputed_test, pickle_out)
pickle_out.close()
pickle_out = open("imputed_val.pickle","wb")
pickle.dump(imputed_val, pickle_out)
pickle_out.close()

In [498]:
imputed_std = impute(for_impstd, train_outcomes)
imp_std_test = impute(for_teststd, test_outcomes)
imp_std_val = impute(for_valstd, val_outcomes)

In [499]:
pickle_out = open("imputed_std.pickle","wb")
pickle.dump(imputed_std, pickle_out)
pickle_out.close()
pickle_out = open("imp_std_test.pickle","wb")
pickle.dump(imp_std_test, pickle_out)
pickle_out.close()
pickle_out = open("imp_std_val.pickle","wb")
pickle.dump(imp_std_val, pickle_out)
pickle_out.close()

In [4]:
pickle_in = open("imputed_std.pickle","rb")
imputed_std = pickle.load(pickle_in)
pickle_in = open("imp_std_test.pickle","rb")
imp_std_test = pickle.load(pickle_in)
pickle_in = open("imp_std_val.pickle","rb")
imp_std_val = pickle.load(pickle_in)

In [75]:
stdgories = ['HR_std','WBC_std','RecordID','Outcome']

In [77]:
all_train = pd.merge(imputed.drop(columns = 'Outcome'), imputed_std[stdgories], on='RecordID')
all_test = pd.merge(imputed_test.drop(columns = 'Outcome'), imp_std_test[stdgories], on='RecordID')
all_val = pd.merge(imputed_val.drop(columns = 'Outcome'), imp_std_val[stdgories], on='RecordID')

In [78]:
pickle_out = open("all_train.pickle","wb")
pickle.dump(all_train, pickle_out)
pickle_out.close()
pickle_out = open("all_test.pickle","wb")
pickle.dump(all_test, pickle_out)
pickle_out.close()
pickle_out = open("all_val.pickle","wb")
pickle.dump(all_val, pickle_out)
pickle_out.close()

In [5]:
pickle_in = open("all_train.pickle","rb")
all_train = pickle.load(pickle_in)
pickle_in = open("all_test.pickle","rb")
all_test = pickle.load(pickle_in)
pickle_in = open("all_val.pickle","rb")
all_val = pickle.load(pickle_in)

In [6]:
all_train.columns

Index(['Age', 'BUN', 'Creatinine', 'DiasABP', 'FiO2', 'GCS', 'Glucose', 'HCO3',
       'HCT', 'HR', 'ICUType', 'K', 'MAP', 'Mg', 'Na', 'PaCO2', 'PaO2',
       'Platelets', 'Gender', 'RecordID', 'SysABP', 'Temp', 'Urine', 'WBC',
       'Weight', 'pH', 'HR_std', 'WBC_std', 'Outcome'],
      dtype='object')

In [18]:
pickle_out = open("reduced_train","wb")
pickle.dump(reduced_train, pickle_out)
pickle_out.close()
pickle_out = open("reduced_test","wb")
pickle.dump(reduced_test, pickle_out)
pickle_out.close()
pickle_out = open("reduced_val","wb")
pickle.dump(reduced_val, pickle_out)
pickle_out.close()

### Working with Anomaly Data

In [94]:
pickle_in = open("all_importance_train","rb")
anomaly_tra = pickle.load(pickle_in)
pickle_in = open("all_importance_test","rb")
anomaly_tes = pickle.load(pickle_in)
pickle_in = open("all_importance_val","rb")
anomaly_val = pickle.load(pickle_in)

In [97]:
#Merging Invasive/Noninvasive BP's by keeping invasive if available, replacing with NI only when necessary
bp = ['DiasABP_I','SysABP_I','MAP_I','DiasABP_A','SysABP_A','MAP_A']
NIbp = ['NIDiasABP_I','NISysABP_I','NIMAP_I','NIDiasABP_A','NISysABP_A','NIMAP_A']

def merge_BP(df):
    for index, col in enumerate(bp):
        df[col].fillna(df[NIbp[index]], inplace=True)
        del df[NIbp[index]]

In [98]:
merge_BP(anomaly_tr)
merge_BP(anomaly_tes)

In [100]:
anomaly_tr = anomaly_tra.reset_index().rename(columns = {'index':'RecordID'}).replace(float(-1),np.NaN)
anomaly_te = anomaly_tes.reset_index().rename(columns = {'index':'RecordID'}).replace(float(-1),np.NaN)

In [101]:
anomaly_tr = anomaly_tr.replace(np.inf,np.NaN)
anomaly_te = anomaly_te.replace(np.inf,np.NaN)

In [102]:
anomaly_train = impute(anomaly_tr, train_outcomes)
anomaly_test = impute(anomaly_te, test_outcomes)

In [103]:
pickle_out = open("anomaly_train.pickle","wb")
pickle.dump(anomaly_train, pickle_out)
pickle_out.close()
pickle_out = open("anomaly_test.pickle","wb")
pickle.dump(anomaly_test, pickle_out)
pickle_out.close()

In [104]:
anomgories = ['DiasABP_A', 'MAP_A','RecordID','Outcome']

In [105]:
final_train = pd.merge(all_train.drop(columns = 'Outcome'), anomaly_train[anomgories], on='RecordID')
final_test = pd.merge(all_test.drop(columns = 'Outcome'), anomaly_test[anomgories], on='RecordID')

In [106]:
pickle_out = open("final_train.pickle","wb")
pickle.dump(final_train, pickle_out)
pickle_out.close()
pickle_out = open("final_test.pickle","wb")
pickle.dump(final_test, pickle_out)
pickle_out.close()

In [108]:
all_all = pd.concat([all_train, all_test])

In [109]:
pickle_out = open("mega_all","wb")
pickle.dump(all_all, pickle_out)