<a href="https://colab.research.google.com/github/JulianNeff/IML2020/blob/master/task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Imports
import pandas as pd
import numpy as np
import random
from random import shuffle
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.metrics import roc_auc_score
import sklearn.metrics as metrics
import sys
import numpy
from tensorflow.keras import layers
from tensorflow import keras 
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

# Print numpy arrays in full: the summarisation threshold is raised to sys.maxsize.
numpy.set_printoptions(threshold=sys.maxsize)
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [0]:
# helper functions

def get_unique_pids(featuresdf):
    """Return the distinct values of the 'pid' column, in order of first appearance."""
    pid_column = featuresdf['pid']
    return pid_column.unique()

# aggregate data for all features
def aggregate_all(featuresdf):
    """Pivot the long table (several rows per pid) into one wide row per pid.

    The rows of each pid are numbered 1, 2, ... in their existing order, and
    every non-pid column ``col`` is spread into ``col_1``, ``col_2``, ...
    The returned frame has 'pid' as a regular column again; the input frame is
    not modified.
    """
    # 1-based position of each row within its patient.
    row_number = featuresdf.groupby(['pid']).cumcount() + 1
    wide = featuresdf.set_index(['pid', row_number]).unstack()
    # Flatten the (column, row_number) MultiIndex into 'column_rownumber'.
    wide.columns = ['_'.join(str(part) for part in key) for key in wide.columns]
    return wide.reset_index()

def featureselect(featuresdf, featurenames, n_steps=12):
    """Select the per-time-step columns of the given base features.

    For every ``name`` in ``featurenames`` the columns ``name_1`` ...
    ``name_<n_steps>`` are selected (in that order). Rows are returned grouped
    by pid in order of first appearance, keeping their original index.

    Parameters
    ----------
    featuresdf : DataFrame with a 'pid' column and the ``name_<i>`` columns.
    featurenames : iterable of base feature names (strings).
    n_steps : number of time steps per feature; defaults to 12, the value that
        was previously hard-coded.

    Returns
    -------
    DataFrame containing only the selected columns (empty, but with those
    columns, when ``featuresdf`` has no rows).

    Raises
    ------
    KeyError if 'pid' or one of the requested columns is missing.
    """
    featurenamesnr = [
        name + '_' + str(step)
        for name in featurenames
        for step in range(1, n_steps + 1)
    ]
    # Collect one slice per patient and concatenate once. The previous
    # implementation grew the result with DataFrame.append inside the loop,
    # which is quadratic and no longer exists in pandas >= 2.0.
    per_patient = [
        patient_rows[featurenamesnr]
        for _, patient_rows in featuresdf.groupby('pid', sort=False)
    ]
    if not per_patient:
        return pd.DataFrame(columns=featurenamesnr)
    return pd.concat(per_patient)

In [0]:
# Read Input
max_patients = 18995
# Lower num_patients to work on a smaller subset of the training data.
num_patients = max_patients
# The feature file is read with 12 rows for every patient requested.
_ROWS_PER_PATIENT = 12
features = pd.read_csv('train_features.csv', nrows=_ROWS_PER_PATIENT * num_patients)
labels = pd.read_csv('train_labels.csv', nrows=num_patients)

In [0]:
#features.info()
#features.describe()
#features.isnull().sum()

In [0]:
#####################
## data imputation ##
#####################

pids = get_unique_pids(features)

# Add one 0/1 indicator column per lab test: 1 where the test has a value in
# that row, 0 where it is missing. The indicators must be computed before any
# imputation below. The order of this list determines the order of the new
# columns (and therefore of the model inputs), so it has to stay identical to
# the list used when preprocessing the submission data.
lab_tests = [
    'BaseExcess', 'EtCO2', 'PTT', 'BUN', 'Lactate', 'Hgb', 'HCO3',
    'Fibrinogen', 'Phosphate', 'WBC', 'Creatinine', 'PaCO2', 'AST', 'FiO2',
    'Platelets', 'SaO2', 'Glucose', 'Magnesium', 'Potassium', 'Calcium',
    'Alkalinephos', 'Bilirubin_direct', 'Chloride', 'Hct', 'Bilirubin_total',
    'TroponinI', 'pH',
]
for lab_test in lab_tests:
    features[lab_test + 'Test'] = features[lab_test].notnull().astype('int')

# Interpolate missing values within each patient; limit_direction='both' also
# fills gaps before the first and after the last observed value. A single
# grouped transform replaces the former per-patient loop, which rebuilt a
# boolean mask over the whole frame for every pid.
value_columns = [column for column in features.columns if column != 'pid']
features[value_columns] = (
    features.groupby('pid')[value_columns]
    .transform(lambda patient: patient.interpolate(limit_direction='both'))
)

# A column that a patient never had measured is still NaN after the
# interpolation: fall back to the overall column mean.
features = features.fillna(features.mean())

# Standardise (z-score) every column except the first one, which is left
# untouched (presumably the pid column - confirm against the CSV layout).
id_part = features.iloc[:, :1]
value_part = features.iloc[:, 1:]
features = pd.concat([id_part, (value_part - value_part.mean()) / value_part.std()], axis=1)

In [0]:
# one line per patient
features = aggregate_all(features)
# Only Age_1 is kept; the copies for time steps 2..12 are dropped
# (presumably because age does not change between a patient's rows).
redundant_age_columns = ['Age_' + str(step) for step in range(2, 13)]
features = features.drop(columns=redundant_age_columns)

In [0]:
# split into DETERMINISTIC training and test data

# Fraction of patients used for training; with ratio = 1 the test set is empty.
ratio = 1

pids = get_unique_pids(features).tolist()
# Split on the number of patients actually present rather than on the number
# requested when reading the CSV, so the ratio holds even for a shorter file.
n_train = int(ratio * len(pids))
train_pids = pids[:n_train]
test_pids = pids[n_train:]

# Select all rows of each partition in one step and order them by pid so that
# row i of X_* and row i of Y_* belong to the same patient. This replaces the
# per-patient DataFrame.append loop, which copied the growing frame on every
# iteration and is not available in pandas >= 2.0.
X_train = features[features['pid'].isin(train_pids)].sort_values('pid')
Y_train = labels[labels['pid'].isin(train_pids)].sort_values('pid')
X_test = features[features['pid'].isin(test_pids)].sort_values('pid')
Y_test = labels[labels['pid'].isin(test_pids)].sort_values('pid')

# Drop the first column (pid) so that only model inputs remain.
X_train = X_train.iloc[:, 1:]
X_test = X_test.iloc[:, 1:]

In [0]:
# split into RANDOM training and test data
# Fixed seed so that the random 80/20 split is reproducible across runs.
SPLIT_SEED = 42

pids = get_unique_pids(features).tolist()
# A dedicated generator keeps the global `random` state untouched.
random.Random(SPLIT_SEED).shuffle(pids)
# Split on the number of patients actually present rather than on the number
# requested when reading the CSV, so the ratio holds even for a shorter file.
n_train = int(0.8 * len(pids))
train_pids = pids[:n_train]
test_pids = pids[n_train:]

# Select all rows of each partition in one step and order them by pid so that
# row i of X_* and row i of Y_* belong to the same patient. This replaces the
# per-patient DataFrame.append loop, which copied the growing frame on every
# iteration and is not available in pandas >= 2.0.
X_train = features[features['pid'].isin(train_pids)].sort_values('pid')
Y_train = labels[labels['pid'].isin(train_pids)].sort_values('pid')
X_test = features[features['pid'].isin(test_pids)].sort_values('pid')
Y_test = labels[labels['pid'].isin(test_pids)].sort_values('pid')

# Drop the first column (pid) so that only model inputs remain.
X_train = X_train.iloc[:, 1:]
X_test = X_test.iloc[:, 1:]

In [0]:
########################
## TRAINING SUBTASK 1 ##
########################

In [0]:
def _fit_probability_svc(label_column):
    """Fit a default SVC with probability estimates on X_train against one column of Y_train."""
    targets = Y_train[label_column]
    return SVC(probability=True).fit(X_train, targets)


# One independent classifier per lab-test label, fitted in the same order as before.
clf_BaseExcess = _fit_probability_svc('LABEL_BaseExcess')
clf_Fibrinogen = _fit_probability_svc('LABEL_Fibrinogen')
clf_AST = _fit_probability_svc('LABEL_AST')
clf_Alkalinephos = _fit_probability_svc('LABEL_Alkalinephos')
clf_Bilirubin_total = _fit_probability_svc('LABEL_Bilirubin_total')
clf_Lactate = _fit_probability_svc('LABEL_Lactate')
clf_TroponinI = _fit_probability_svc('LABEL_TroponinI')
clf_SaO2 = _fit_probability_svc('LABEL_SaO2')
clf_Bilirubin_direct = _fit_probability_svc('LABEL_Bilirubin_direct')
clf_EtCO2 = _fit_probability_svc('LABEL_EtCO2')

KeyboardInterrupt: ignored

In [0]:
def _held_out_auc(clf, label_column):
    """ROC AUC on the test split, scoring with column 1 of clf.predict_proba(X_test)."""
    scores = clf.predict_proba(X_test)[:, 1]
    return roc_auc_score(Y_test[label_column], scores)


roc_BaseExcess = _held_out_auc(clf_BaseExcess, 'LABEL_BaseExcess')
roc_Fibrinogen = _held_out_auc(clf_Fibrinogen, 'LABEL_Fibrinogen')
roc_AST = _held_out_auc(clf_AST, 'LABEL_AST')
roc_Alkalinephos = _held_out_auc(clf_Alkalinephos, 'LABEL_Alkalinephos')
roc_Bilirubin_total = _held_out_auc(clf_Bilirubin_total, 'LABEL_Bilirubin_total')
roc_Lactate = _held_out_auc(clf_Lactate, 'LABEL_Lactate')
roc_TroponinI = _held_out_auc(clf_TroponinI, 'LABEL_TroponinI')
roc_SaO2 = _held_out_auc(clf_SaO2, 'LABEL_SaO2')
roc_Bilirubin_direct = _held_out_auc(clf_Bilirubin_direct, 'LABEL_Bilirubin_direct')
roc_EtCO2 = _held_out_auc(clf_EtCO2, 'LABEL_EtCO2')

# Report each score, then the unweighted mean over the ten labels.
subtask1_report = [
    ('BaseExcess', roc_BaseExcess),
    ('Fibrinogen', roc_Fibrinogen),
    ('AST', roc_AST),
    ('Alkalinephos', roc_Alkalinephos),
    ('Bilirubin_total', roc_Bilirubin_total),
    ('Lactate', roc_Lactate),
    ('TroponinI', roc_TroponinI),
    ('SaO2', roc_SaO2),
    ('Bilirubin_direct', roc_Bilirubin_direct),
    ('EtCO2', roc_EtCO2),
]
for report_name, report_auc in subtask1_report:
    print(report_name + ': ' + str(report_auc))

subtask1_mean_auc = (
    roc_BaseExcess + roc_Fibrinogen + roc_AST + roc_Alkalinephos
    + roc_Bilirubin_total + roc_Lactate + roc_TroponinI + roc_SaO2
    + roc_Bilirubin_direct + roc_EtCO2
) / 10
print('Total: ' + str(subtask1_mean_auc))

In [0]:
###################################

In [0]:
########################
## TRAINING SUBTASK 2 ##
########################

(4000, 745)

In [0]:
# Size the input layer from the training data instead of hard-coding 745, so
# the network still builds if the set of feature columns changes.
n_input_features = X_train.shape[1]

# One hidden ReLU layer (745 units, as before) followed by a single sigmoid
# unit that outputs the sepsis probability.
model = Sequential()
model.add(Dense(745, input_dim=n_input_features, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, Y_train.LABEL_Sepsis, epochs=4, batch_size=100)
# NOTE: loss/accuracy here are measured on the training data itself, so they
# do not indicate how well the model generalises.
loss, metric = model.evaluate(X_train, Y_train.LABEL_Sepsis)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [0]:
# Flatten the model output to one score per test patient before computing the ROC AUC.
sepsis_scores = model.predict(X_test).ravel()
roc_sepsis = roc_auc_score(Y_test.LABEL_Sepsis, sepsis_scores)


In [0]:
# print() writes the str() form of its argument.
print(roc_sepsis)

0.6233668580425369


In [0]:
########################
## TRAINING SUBTASK 3 ##
########################

In [0]:
def _fit_lasso(label_column, alpha):
    """Fit a Lasso regressor (max_iter=10000) on X_train against one column of Y_train."""
    regressor = linear_model.Lasso(alpha=alpha, max_iter=10000)
    return regressor.fit(X_train, Y_train[label_column])


# ABPm uses a stronger regularisation (alpha=1) than the other three targets.
clf_RRate = _fit_lasso('LABEL_RRate', 0.1)
clf_ABPm = _fit_lasso('LABEL_ABPm', 1)
clf_SpO2 = _fit_lasso('LABEL_SpO2', 0.1)
clf_Heartrate = _fit_lasso('LABEL_Heartrate', 0.1)

In [0]:
def _scaled_r2(regressor, label_column):
    """Score a regressor on the test split as 0.5 + 0.5 * max(0, R^2)."""
    r2 = metrics.r2_score(Y_test[label_column], regressor.predict(X_test))
    return 0.5 + 0.5 * np.maximum(0, r2)


# The roc_ prefix is kept for the variable names even though these are R^2-based scores.
roc_RRate = _scaled_r2(clf_RRate, 'LABEL_RRate')
roc_ABPm = _scaled_r2(clf_ABPm, 'LABEL_ABPm')
roc_SpO2 = _scaled_r2(clf_SpO2, 'LABEL_SpO2')
roc_Heartrate = _scaled_r2(clf_Heartrate, 'LABEL_Heartrate')

subtask3_report = [
    ('RRate', roc_RRate),
    ('ABPm', roc_ABPm),
    ('SpO2', roc_SpO2),
    ('Heartrate', roc_Heartrate),
]
for vital_name, vital_score in subtask3_report:
    print(vital_name + ': ' + str(vital_score))

subtask3_mean_score = (roc_RRate + roc_ABPm + roc_SpO2 + roc_Heartrate) / 4
print('Total: ' + str(subtask3_mean_score))

RRate: 0.6729770697018428
ABPm: 0.8228818026502227
SpO2: 0.7394361875387249
Heartrate: 0.8116058439761762
Total: 0.7617252259667416


In [0]:
###############################
###############################
###############################

In [0]:
#################
## SUBMISSION ##
################

In [0]:
# Load the submission (test) measurements. This rebinds `features`, so the
# training features built in the earlier cells are no longer reachable under
# that name from here on.
features = pd.read_csv('test_features.csv')

In [0]:
# One submission row per patient, in the order the patients appear in the file.
# drop_duplicates() keeps the first row of every pid without assuming exactly
# 12 consecutive rows per patient (which iloc[::12] did), and copy() makes
# `submission` an independent frame so that adding the label columns later
# does not write into a slice of `features` (SettingWithCopyWarning).
submission = features[['pid']].drop_duplicates().copy()

In [0]:
# Display the raw test features for a visual check (pandas truncates long frames).
features

Unnamed: 0,pid,Time,Age,EtCO2,PTT,BUN,Lactate,Temp,Hgb,HCO3,BaseExcess,RRate,Fibrinogen,Phosphate,WBC,Creatinine,PaCO2,AST,FiO2,Platelets,SaO2,Glucose,ABPm,Magnesium,Potassium,ABPd,Calcium,Alkalinephos,SpO2,Bilirubin_direct,Chloride,Hct,Heartrate,Bilirubin_total,TroponinI,ABPs,pH
0,0,1,39.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,0,2,39.0,,44.2,17.0,,36.0,10.2,13.0,,,147.0,6.0,17.5,2.2,,32.0,0.60,194.0,,273.0,77.0,2.2,4.6,76.0,8.0,119.0,100.0,,98.0,31.0,82.0,21.8,,119.0,
2,0,3,39.0,,,,,,,,-9.0,13.0,,,,,26.0,,0.55,,,,78.0,,,72.5,,,100.0,,,,78.0,,,125.0,7.34
3,0,4,39.0,,,,,,,,,12.0,,,,,,,0.50,,,,87.0,,,66.0,,,100.0,,,,80.0,,,136.0,
4,0,5,39.0,,,,,,,,,,,,,,,,,,,,86.0,,,65.0,,,100.0,,,,83.0,,,135.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151963,9997,8,57.0,,,,,,,,,14.0,,,,,,,,,,,67.0,,,48.0,,,100.0,,,,84.0,,,103.0,
151964,9997,9,57.0,,,,,,,,,14.0,,,,,,,0.40,,,,71.0,,,51.0,,,100.0,,,,83.0,,,110.0,
151965,9997,10,57.0,,,,,,,,,16.0,,,,,,,,,,,71.0,,,50.0,,,100.0,,,,88.0,,,111.0,
151966,9997,11,57.0,,,,,37.0,,,,17.0,,,,,,,0.40,,,,75.0,,,52.0,,,100.0,,,,89.0,,,118.0,


In [0]:
#####################
## data imputation ##
#####################

pids = get_unique_pids(features)

# Add one 0/1 indicator column per lab test: 1 where the test has a value in
# that row, 0 where it is missing. The indicators must be computed before any
# imputation below. The order of this list determines the order of the new
# columns (and therefore of the model inputs), so it has to stay identical to
# the list used when preprocessing the training data.
lab_tests = [
    'BaseExcess', 'EtCO2', 'PTT', 'BUN', 'Lactate', 'Hgb', 'HCO3',
    'Fibrinogen', 'Phosphate', 'WBC', 'Creatinine', 'PaCO2', 'AST', 'FiO2',
    'Platelets', 'SaO2', 'Glucose', 'Magnesium', 'Potassium', 'Calcium',
    'Alkalinephos', 'Bilirubin_direct', 'Chloride', 'Hct', 'Bilirubin_total',
    'TroponinI', 'pH',
]
for lab_test in lab_tests:
    features[lab_test + 'Test'] = features[lab_test].notnull().astype('int')

# Interpolate missing values within each patient (not a per-patient mean);
# limit_direction='both' also fills gaps before the first and after the last
# observed value. A single grouped transform replaces the former per-patient
# loop, which rebuilt a boolean mask over the whole frame for every pid.
value_columns = [column for column in features.columns if column != 'pid']
features[value_columns] = (
    features.groupby('pid')[value_columns]
    .transform(lambda patient: patient.interpolate(limit_direction='both'))
)

# A column that a patient never had measured is still NaN after the
# interpolation: fall back to the overall column mean.
features = features.fillna(features.mean())

# Standardise (z-score) every column except the first one (pid).
# NOTE(review): the fill means above and the mean/std used here come from the
# test file itself, whereas the models were fitted on features standardised
# with the training file's statistics - consider reusing the training values.
id_part = features.iloc[:, :1]
value_part = features.iloc[:, 1:]
features = pd.concat([id_part, (value_part - value_part.mean()) / value_part.std()], axis=1)

In [0]:
# one line per patient
# Remember the order in which the patients appear in the file; `submission`
# was built from the same file, so this is the row order the predictions need.
pid_order = features['pid'].drop_duplicates().tolist()

features = aggregate_all(features)
# Only Age_1 is kept; the copies for time steps 2..12 are dropped.
features = features.drop(columns=['Age_' + str(step) for step in range(2, 13)])

# aggregate_all() does not necessarily return the patients in file order.
# Restore that order explicitly instead of sorting pid as a string, which only
# matched the submission when the file happened to be sorted that way.
# set_index() also removes pid from the columns, leaving only model inputs.
features = features.set_index('pid').loc[pid_order]

In [0]:
# run predictions for test data and fill in submission
# Work on an independent frame so the column assignments below never write
# into a slice of another DataFrame.
submission = submission.copy()

# Subtask 1: column 1 of predict_proba for each lab-test classifier
# (presumably the probability of label 1 - confirm via clf.classes_).
# The dict order defines the column order of the submission file.
subtask1_classifiers = {
    'LABEL_BaseExcess': clf_BaseExcess,
    'LABEL_Fibrinogen': clf_Fibrinogen,
    'LABEL_AST': clf_AST,
    'LABEL_Alkalinephos': clf_Alkalinephos,
    'LABEL_Bilirubin_total': clf_Bilirubin_total,
    'LABEL_Lactate': clf_Lactate,
    'LABEL_TroponinI': clf_TroponinI,
    'LABEL_SaO2': clf_SaO2,
    'LABEL_Bilirubin_direct': clf_Bilirubin_direct,
    'LABEL_EtCO2': clf_EtCO2,
}
for label_column, classifier in subtask1_classifiers.items():
    submission[label_column] = classifier.predict_proba(features)[:, 1]

# Subtask 2: the sepsis network is stored in `model`; `clf_sepsis` is never
# defined in this notebook, so the old line failed on a fresh kernel. The
# sigmoid output is flattened to one value per patient, as in the evaluation.
submission['LABEL_Sepsis'] = model.predict(features).ravel()

# Subtask 3: regressed vital signs.
subtask3_regressors = {
    'LABEL_RRate': clf_RRate,
    'LABEL_ABPm': clf_ABPm,
    'LABEL_SpO2': clf_SpO2,
    'LABEL_Heartrate': clf_Heartrate,
}
for label_column, regressor in subtask3_regressors.items():
    submission[label_column] = regressor.predict(features)

In [0]:
# Write the predictions as 'submission.csv' inside 'submission.zip', with
# floats formatted to three decimals and without the index column.
zip_options = dict(method='zip', archive_name='submission.csv')
submission.to_csv(
    'submission.zip',
    index=False,
    float_format='%.3f',
    compression=zip_options,
)

In [0]:
# Display the finished submission table for a visual check.
submission

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,0,0.412573,0.299868,0.626883,0.659479,0.653685,0.483255,0.140236,0.337796,0.124280,0.094574,0.087048,15.065407,79.841264,98.372504,82.779663
12,10001,0.063298,0.018938,0.172728,0.164376,0.171134,0.087495,0.022931,0.095002,0.014374,0.020109,0.044623,17.665492,90.595038,95.179559,99.221587
24,10003,0.039893,0.012476,0.142033,0.123719,0.144891,0.116454,0.072042,0.099911,0.023694,0.026638,0.050712,17.230845,81.666891,97.941036,88.875714
36,10004,0.036982,0.031970,0.300603,0.282300,0.284434,0.100441,0.025878,0.108550,0.026159,0.049084,0.040839,16.229129,75.193079,96.012762,88.823899
48,10005,0.111740,0.028541,0.173529,0.154908,0.170443,0.130121,0.028641,0.119234,0.035219,0.015885,0.050626,19.052263,74.626182,96.118954,62.657419
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151908,9989,0.106292,0.053149,0.181930,0.170017,0.199932,0.112124,0.017770,0.101031,0.020059,0.015912,0.055389,20.317158,76.060151,96.100802,98.836507
151920,9991,0.356480,0.087786,0.262974,0.269495,0.271979,0.236594,0.050539,0.267651,0.040251,0.016132,0.061219,19.427408,90.578018,98.097897,74.957591
151932,9992,0.768727,0.037288,0.110873,0.098459,0.109983,0.157163,0.018004,0.646763,0.013812,0.013219,0.046286,18.898331,69.045568,97.412229,84.247518
151944,9994,0.516685,0.347385,0.563792,0.594999,0.594574,0.584467,0.146993,0.407296,0.119119,0.089032,0.087148,16.888565,88.870415,98.224518,93.609248
