In [2]:
import pandas as pd
import sklearn
import numpy as np
from sklearn import *
from sklearn.svm import SVC
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict 
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
#from sklearn.experimental import enable_hist_gradient_boosting
#from sklearn.ensemble import HistGradientBoostingClassifier
#from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

ImportError: cannot import name 'HistGradientBoostingClassifier' from 'sklearn.ensemble' (C:\Users\paulm\Anaconda3\lib\site-packages\sklearn\ensemble\__init__.py)

In [3]:
def preprocess(data, number_of_hours=12):
    """Collapse the hourly rows of every patient into one row of summary statistics.

    The input is read positionally: column 0 is the patient id, column 2 is the
    age and every column from index 3 onward is a measurement (column 1 is not
    used; presumably the time stamp). Rows must be grouped per patient, with
    exactly ``number_of_hours`` consecutive rows for each patient.

    Parameters
    ----------
    data : pandas.DataFrame or 2-D array-like of numbers
        One row per patient and hour, laid out as described above.
    number_of_hours : int, default 12
        Number of consecutive rows that belong to one patient.

    Returns
    -------
    numpy.ndarray of shape (n_patients, 2 + 5 * n_measurements)
        Columns are ``[pid, age, means..., variances..., maxs..., mins...,
        medians...]``; each statistic block has one column per measurement and
        ignores NaNs. Statistics that are NaN (a patient without any value for a
        measurement) are replaced by the mean of that column over all patients.
        If no patient has a value, the column is filled with 0.0 so the result
        never contains NaN.

    Raises
    ------
    ValueError
        If ``data`` is not 2-D with at least 3 columns, if ``number_of_hours``
        is not positive, or if the number of rows is not a multiple of
        ``number_of_hours``.
    """
    values = np.asarray(data, dtype=float)
    if values.ndim != 2 or values.shape[1] < 3:
        raise ValueError("data must be 2-D with at least 3 columns (pid, time, age)")

    n_rows, n_columns = values.shape
    if number_of_hours <= 0:
        raise ValueError("number_of_hours must be a positive integer")
    # A truncating division would silently regroup the rows (e.g. 26 rows would
    # become 2 "patients" of 13 rows each), so reject such input explicitly.
    if n_rows % number_of_hours != 0:
        raise ValueError(
            "number of rows (%d) is not a multiple of number_of_hours (%d)"
            % (n_rows, number_of_hours)
        )

    number_of_patients = n_rows // number_of_hours
    statistical_features = n_columns - 3  # Number of columns - pid - age - time

    # One slab per patient: (patient, hour, column).
    patients_array = values.reshape(number_of_patients, number_of_hours, n_columns)
    measurements = patients_array[:, :, 3:]

    # Phi matrix: pid, age, then one block of columns per statistic.
    features = np.zeros([number_of_patients, 2 + 5 * statistical_features])
    features[:, 0] = patients_array[:, 0, 0]  # 1st column is Patient id
    features[:, 1] = patients_array[:, 0, 2]  # 2nd column is Age

    # NaN-aware statistics over the hour axis; the order defines the column layout.
    # All-NaN slices produce NaN (plus a RuntimeWarning) and are imputed below.
    aggregators = (np.nanmean, np.nanvar, np.nanmax, np.nanmin, np.nanmedian)
    for block, aggregate in enumerate(aggregators):
        start = 2 + block * statistical_features
        features[:, start:start + statistical_features] = aggregate(measurements, axis=1)

    # Replace the remaining NaNs with the mean of the column over all patients.
    all_means = np.nanmean(features, axis=0)
    # A column nobody has a value for has a NaN mean as well; use 0.0 instead.
    all_means = np.where(np.isnan(all_means), 0.0, all_means)

    nan_rows, nan_cols = np.where(np.isnan(features))
    features[nan_rows, nan_cols] = all_means[nan_cols]

    return features

In [None]:
# Read the three input tables from the working directory, in this order:
# training features (X), training labels (Y) and test features (X).
train_features, train_labels, test_features = (
    pd.read_csv(csv_name)
    for csv_name in ("train_features.csv", "train_labels.csv", "test_features.csv")
)

In [None]:
# Turn the raw hourly tables into one row per patient: per-measurement summary
# statistics (mean, var, max, min, median) with the NaN values filled by column means.
train_X = preprocess(train_features)
test_X = preprocess(test_features)

# Standardize the statistical features (columns 2 onward; pid and age are skipped).
# The scaler is fitted on the training data only and its mean/std are re-used for
# the test data, so both sets are on the scale the models are trained on.
# Re-fitting on the test set would standardize it with different statistics.
scaler = StandardScaler()
train_X[:, 2:] = scaler.fit_transform(train_X[:, 2:])
test_X[:, 2:] = scaler.transform(test_X[:, 2:])

# Export train_X to a csv file so it can be inspected more easily
df = pd.DataFrame(train_X)
df.to_csv('train_X2.csv', index=False, float_format='%.3f')

We can ignore these warnings. They report that, while computing the mean, variance, etc. of the values, we get NaN for columns that consist only of NaNs. These NaN values are dealt with afterwards.

### Parts 1 & 2 (They're basically the same)

In [None]:
# Targets for subtasks 1 & 2: every column of the label table from "pid"
# through "LABEL_Sepsis" (inclusive), as a plain array.
# no_columns is the width of that slice, pid included — presumably 12; verify
# against the column order of train_labels.csv.
no_columns = 12
train_Y = np.array(
    train_labels.loc[:, "pid":"LABEL_Sepsis"]
)

In [None]:
# Model selection for subtasks 1 & 2.
# Each label is evaluated on its own with 10-fold cross-validation, scored by
# ROC AUC because that is how the performance of the model should be evaluated.
# A score close to 1 is better (exactly 1 most likely means overfitting), so try
# different classifiers here and keep the one with the highest reasonable score.
#
# HistGradientBoostingClassifier is the other candidate, but its import is
# disabled at the top of the notebook, so GradientBoostingClassifier is used.
classifier = GradientBoostingClassifier()

# Column 0 of train_Y is the pid, hence the labels start at column 1.
for label_idx in range(1, no_columns):
    fold_scores = cross_val_score(
        classifier,
        train_X,
        train_Y[:, label_idx],
        cv=10,
        scoring='roc_auc',
    )
    # fold_scores holds one value per fold (10 of them); report their mean.
    print ('ROC score: ', fold_scores.mean())

In [None]:
# Predict the labels of the test set with the classifier chosen above.
# Column 0 is reserved for the pid and filled in separately, so the loop
# starts at column 1.
predictions_part_1_2 = np.zeros((test_X.shape[0], no_columns))

for label_idx in range(1, no_columns):
    # Re-fit on the full training set for this label (medical test) ...
    classifier.fit(train_X, train_Y[:, label_idx])
    # ... and keep the second probability column (presumably the positive
    # class for 0/1 labels) as the prediction.
    predictions_part_1_2[:, label_idx] = classifier.predict_proba(test_X)[:, 1]

In [None]:
# The first output column carries the patient ids, taken from column 0 of the
# preprocessed test features and cast to integers.
predictions_part_1_2[..., 0] = test_X[..., 0].astype(int)

### Part 3

In [None]:
# Targets for subtask 3: the label columns from "LABEL_RRate" through
# "LABEL_Heartrate" (inclusive), as a plain array.
# no_columns is the width of that slice — presumably the four regression targets;
# verify against the column order of train_labels.csv.
no_columns = 4
train_Y = np.array(
    train_labels.loc[:, "LABEL_RRate":"LABEL_Heartrate"]
)

In [None]:
# Model selection for subtask 3.
# We use cross-validation to evaluate different models, scored by R2 because
# that is how the performance of the model should be evaluated.
#
# GradientBoostingRegressor is used because it is the gradient-boosting regressor
# that is actually imported at the top of the notebook; the HistGradientBoosting*
# imports are disabled there, so referencing HistGradientBoostingRegressor would
# raise a NameError on a fresh kernel.
# Alternative to try: SGDRegressor(max_iter=1000)
regressor = GradientBoostingRegressor()

for i in range(no_columns):  # R2 scores with 5-fold cross-validation, one target at a time.
    score = cross_val_score(regressor, train_X, train_Y[:, i], cv=5, scoring='r2')
    print ('R2 score: ', score.mean())   # 'score' holds one value per fold; report the mean.

# R2 score should be close to 1 and ideally 1. (1 will probably mean overfitting though). So try to run this code for different
# regressors and choose the one that gives the highest (reasonable) R2 score.

In [None]:
# Predict the regression targets of the test set with the regressor chosen above:
# one fit on the full training set per target column.
predictions_part_3 = np.zeros((test_X.shape[0], no_columns))

for target_idx in range(no_columns):
    regressor.fit(train_X, train_Y[:, target_idx])
    predictions_part_3[:, target_idx] = regressor.predict(test_X)

### Write into csv

In [None]:
# Put the predictions of parts 1 & 2 (pid + classification labels) and part 3
# (regression targets) side by side, one row per test patient.
output = np.concatenate(
    [predictions_part_1_2, predictions_part_3],
    axis=1,
)

In [None]:
# Label the combined prediction matrix with the submission column names.
df = pd.DataFrame(
    output,
    columns=[
        'pid',
        'LABEL_BaseExcess',
        'LABEL_Fibrinogen',
        'LABEL_AST',
        'LABEL_Alkalinephos',
        'LABEL_Bilirubin_total',
        'LABEL_Lactate',
        'LABEL_TroponinI',
        'LABEL_SaO2',
        'LABEL_Bilirubin_direct',
        'LABEL_EtCO2',
        'LABEL_Sepsis',
        'LABEL_RRate',
        'LABEL_ABPm',
        'LABEL_SpO2',
        'LABEL_Heartrate',
    ],
)

# Write the same table twice, with three decimals: a plain csv for easier
# overview and a zip-compressed copy.
df.to_csv('prediction.csv', float_format='%.3f', index=False)
df.to_csv('prediction.zip', float_format='%.3f', index=False, compression='zip')