In [1]:
import pandas as pd
import sklearn
import numpy as np
from sklearn import *
from sklearn.svm import SVC
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict 
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingRegressor

In [2]:
def preprocess(data):
    number_of_hours = 12
    number_of_patients = int(data.shape[0]/number_of_hours)
    statistical_features = int(data.shape[1])-3  # Number of columns - pid - age - time

    # Transforming train features into an array where each element is one patient
    train_features_array = np.array(data)
    patients = np.vsplit(train_features_array , number_of_patients)
    patients_array = np.array(patients)
    
    # We are creating a new matrix (Phi) that contains statistical features for every patient: pid, age, features: mean, var, max, min, median
    features = np.zeros([number_of_patients, 2+5*statistical_features]) # no. of patients, no. of features* no. of columns names(age, calcium, bolirubin...)
    # columns are: pid, age, mean(of all featuers), var, max, min, median
    
    features[:,0] = patients_array[:,0,0] # 1st column is Patient id
    features[:,1] = patients_array[:,0,2] # 2nd column is Age
    
    means = np.nanmean(patients_array[:, :, 3:], axis=1) # rest of the columns are means, var... of all values
    features[:,2:statistical_features+2] = means
    
    variances = np.nanvar(patients_array[:, :, 3:], axis=1);
    features[:,statistical_features+2:2*statistical_features+2] = variances
    
    maxs = np.nanmax(patients_array[:, :, 3:], axis=1);
    features[:,2*statistical_features+2:3*statistical_features+2] = maxs
    
    mins = np.nanmin(patients_array[:, :, 3:], axis=1);
    features[:,3*statistical_features+2:4*statistical_features+2] = mins
    
    medians = np.nanmedian(patients_array[:, :, 3:], axis=1);
    features[:,4*statistical_features+2:5*statistical_features+2] = medians
    
    # Replacing the NaN values with the mean of all patients
    all_medians = np.nanmedian(features, axis=0)  # Calculate the mean of each features of all patients
    
    # find indexes where NaNs are and replace them by the column mean
    indexes = np.where(np.isnan(features))
    features[indexes] = np.take(all_medians, indexes[1])
    
    # imp = SimpleImputer(missing_values=np.nan, strategy='mean') # I put imputer here just because they mentioned it in the task description
    # features = imp.fit_transform(features)
        
    return features

In [3]:
# Loading the files
train_features = pd.read_csv("train_features.csv") # training X
train_labels = pd.read_csv("train_labels.csv")     # training Y
test_features = pd.read_csv("test_features.csv")   # testing X

In [4]:
# Preprocessing both train and test features: creating an array of patients, creating statistical features (mean, var, max..)
# for all features for each patient, getting rid of NaN values with filling them with means.
train_X = preprocess(train_features)
test_X = preprocess(test_features)

#Standardizing the features
scaler = StandardScaler()
train_X[:,2:] = scaler.fit_transform(train_X[:,2:])  # Skipping pid and age

test_X[:,2:] = scaler.fit_transform(test_X[:,2:])

# Exporting train_X into csv file for better observing
df = pd.DataFrame(train_X)
df.to_csv('train_X2.csv', index=False, float_format='%.3f')

  overwrite_input=overwrite_input)


We can ignore these warnings. It reports that somewhere in calculating mean/var.. of all values we get NaN for columns that contist only of NaN's. Afterwards we were dealing with these NaN values.

### Parts 1 & 2 (They're basically the same)

In [5]:
# Getting the data we need for subtasks 1 & 2
train_Y = np.array(train_labels.loc[: , "pid":"LABEL_Sepsis"])

In [6]:
model = HistGradientBoostingClassifier(random_state = 42)

In [7]:
no_columns = 12
predictions_part_1_2 = np.zeros([test_X.shape[0], no_columns])

for i in range(1, no_columns) :  # We are fitting columns (medical tests) one by one
    
    model.fit(train_X, train_Y[:, i])
    output_proba = model.predict_proba(test_X) 
    predictions_part_1_2[:, i] = output_proba[:,1]
    
    # ROC for analyzing
    print(roc_auc_score(train_Y[:, i], model.predict_proba(train_X)[:, 1]))

0.9699886081049165
0.9752133317094954
0.8916091407872057
0.8719068271179388
0.8511676330829121
0.8916043977088014
0.965048897529664
0.9136123635212411
0.9481187411385505
0.9933098589333478
0.9328472588356836


In [8]:
model.predict_proba(train_X)

array([[0.9697331 , 0.0302669 ],
       [0.97776384, 0.02223616],
       [0.96662617, 0.03337383],
       ...,
       [0.97403857, 0.02596143],
       [0.96337587, 0.03662413],
       [0.93770191, 0.06229809]])

In [9]:
train_Y

array([[1.000e+00, 1.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+01, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+02, 1.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [9.996e+03, 1.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [9.998e+03, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [9.999e+03, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

In [10]:
predictions_part_1_2[:, 0] = test_X[:, 0].astype(int) # 1st columns is just pid-s

In [11]:
predictions_part_1_2

array([[0.00000000e+00, 9.58154090e-01, 3.83518902e-01, ...,
        1.98527731e-01, 1.60926542e-03, 7.43454886e-02],
       [1.00010000e+04, 2.34072772e-01, 2.51222503e-02, ...,
        2.16825190e-02, 2.03836107e-02, 2.72148618e-02],
       [1.00030000e+04, 1.50609330e-02, 1.53320967e-02, ...,
        1.22769833e-02, 2.04152165e-02, 4.57730869e-02],
       ...,
       [9.99200000e+03, 3.78262592e-01, 2.28235214e-02, ...,
        1.24983809e-02, 4.33613761e-03, 4.94861161e-02],
       [9.99400000e+03, 9.76032622e-01, 2.79909621e-01, ...,
        4.22132366e-02, 2.23498158e-03, 5.82144119e-02],
       [9.99700000e+03, 7.64669575e-01, 1.09865006e-02, ...,
        1.37837371e-02, 1.33478952e-03, 5.71158379e-02]])

### Part 3

In [12]:
# Getting the data we need for subtask 3
train_Y = np.array(train_labels.loc[: , "LABEL_RRate":"LABEL_Heartrate"])

In [13]:
#regressor = SGDRegressor(max_iter = 1000)
regressor = HistGradientBoostingRegressor()

In [14]:
no_columns = 4
predictions_part_3 = np.zeros([test_X.shape[0], no_columns])
for i in range(no_columns) :
    regressor.fit(train_X, train_Y[:, i])
    output = regressor.predict(test_X)
    predictions_part_3[:, i] = output

### Write into csv

In [15]:
output = np.concatenate((predictions_part_1_2,predictions_part_3), axis = 1)

In [16]:
df = pd.DataFrame(output, columns = ['pid','LABEL_BaseExcess','LABEL_Fibrinogen','LABEL_AST','LABEL_Alkalinephos','LABEL_Bilirubin_total','LABEL_Lactate','LABEL_TroponinI','LABEL_SaO2','LABEL_Bilirubin_direct','LABEL_EtCO2','LABEL_Sepsis','LABEL_RRate','LABEL_ABPm','LABEL_SpO2','LABEL_Heartrate'])

df.to_csv('prediction.csv', index=False, float_format='%.3f')

In [17]:
predictions_part_1_2

array([[0.00000000e+00, 9.58154090e-01, 3.83518902e-01, ...,
        1.98527731e-01, 1.60926542e-03, 7.43454886e-02],
       [1.00010000e+04, 2.34072772e-01, 2.51222503e-02, ...,
        2.16825190e-02, 2.03836107e-02, 2.72148618e-02],
       [1.00030000e+04, 1.50609330e-02, 1.53320967e-02, ...,
        1.22769833e-02, 2.04152165e-02, 4.57730869e-02],
       ...,
       [9.99200000e+03, 3.78262592e-01, 2.28235214e-02, ...,
        1.24983809e-02, 4.33613761e-03, 4.94861161e-02],
       [9.99400000e+03, 9.76032622e-01, 2.79909621e-01, ...,
        4.22132366e-02, 2.23498158e-03, 5.82144119e-02],
       [9.99700000e+03, 7.64669575e-01, 1.09865006e-02, ...,
        1.37837371e-02, 1.33478952e-03, 5.71158379e-02]])