<a href="https://colab.research.google.com/github/pullelys/iml-project/blob/main/task_2/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Pin the exact TensorFlow / Keras releases used by this notebook: the import
# cell below relies on the standalone `keras` package (e.g.
# `keras.layers.normalization`), so the versions are fixed for reproducibility.
!pip install tensorflow==2.1.0
!pip install keras==2.3.1

Collecting tensorflow==2.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/85/d4/c0cd1057b331bc38b65478302114194bd8e1b9c2bbc06e300935c0e93d90/tensorflow-2.1.0-cp36-cp36m-manylinux2010_x86_64.whl (421.8MB)
[K     |████████████████████████████████| 421.8MB 21kB/s 
[?25hCollecting gast==0.2.2
  Downloading https://files.pythonhosted.org/packages/4e/35/11749bf99b2d4e3cceb4d55ca22590b0d7c2c62b9de38ac4a4a7f4687421/gast-0.2.2.tar.gz
Collecting keras-applications>=1.0.8
[?25l  Downloading https://files.pythonhosted.org/packages/71/e3/19762fdfc62877ae9102edf6342d71b28fbfd9dea3d2f96a882ce099b03f/Keras_Applications-1.0.8-py3-none-any.whl (50kB)
[K     |████████████████████████████████| 51kB 8.4MB/s 
[?25hCollecting tensorflow-estimator<2.2.0,>=2.1.0rc0
[?25l  Downloading https://files.pythonhosted.org/packages/18/90/b77c328a1304437ab1310b463e533fa7689f4bfc41549593056d812fab8e/tensorflow_estimator-2.1.0-py2.py3-none-any.whl (448kB)
[K     |████████████████████████████████| 450

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import time
import keras
from keras.models import Model
from keras.layers import Input, Activation, Dense, Dropout
from keras.layers.normalization import BatchNormalization
from sklearn.base import BaseEstimator, ClassifierMixin
from keras.constraints import maxnorm
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

Using TensorFlow backend.


In [None]:
# copy necessary files from Google Cloud Storage (GCS) to local disk
bucket_name = 'colab-bucket-86f9472c-3ef9-11eb-a0d2-0242ac1c0002'
files = ['train_labels.csv', 'train_features.csv', 'test_features.csv']
for file in files:
  !gsutil cp gs://{bucket_name}/{file} .

Copying gs://colab-bucket-86f9472c-3ef9-11eb-a0d2-0242ac1c0002/train_labels.csv...
/ [1 files][891.7 KiB/891.7 KiB]                                                
Operation completed over 1 objects/891.7 KiB.                                    
Copying gs://colab-bucket-86f9472c-3ef9-11eb-a0d2-0242ac1c0002/train_features.csv...
- [1 files][ 34.2 MiB/ 34.2 MiB]                                                
Operation completed over 1 objects/34.2 MiB.                                     
Copying gs://colab-bucket-86f9472c-3ef9-11eb-a0d2-0242ac1c0002/test_features.csv...
/ [1 files][ 22.8 MiB/ 22.8 MiB]                                                
Operation completed over 1 objects/22.8 MiB.                                     


In [None]:
# base name used later to build the submission file name
main_filename = 'main'

# load the raw tables; features are ordered by patient id and time,
# labels by patient id
X_train = pd.read_csv('train_features.csv').sort_values(by=['pid', 'Time'])
y_train = pd.read_csv('train_labels.csv').sort_values(by=['pid'])
X_test = pd.read_csv('test_features.csv').sort_values(by=['pid', 'Time'])

# label columns of the submission format, grouped by subtask
# (every column name carries the 'LABEL_' prefix)
subtask_1 = ['LABEL_' + test_name for test_name in (
    'BaseExcess', 'Fibrinogen', 'AST', 'Alkalinephos',
    'Bilirubin_total', 'Lactate', 'TroponinI', 'SaO2',
    'Bilirubin_direct', 'EtCO2',
)]
subtask_2 = ['LABEL_Sepsis']
subtask_3 = ['LABEL_' + vital_name for vital_name in ('RRate', 'ABPm', 'SpO2', 'Heartrate')]

# the predictions frame starts with one row per distinct test 'pid'
df_predictions = pd.DataFrame({'pid': X_test['pid'].unique()})

In [None]:
#### CLASSIFICATION: SUBTASK 1 & 2

# prepare features: keep every column from the third one onwards
# (assumes the first two columns are 'pid' and 'Time' -- TODO confirm against the CSV header)
relevant_features = X_train.columns[2:]

# feature engineering
# For each 'pid' and relevant column, we compute the following features:
# mean, min, max, difference between max and min, first available (i.e. not nan) observation,
# last available observation, difference between last and first,
# and the number of available (non-nan) observations -- pandas `count()` counts
# present values, not missing ones (the original note mentions 12 observations per 'pid').
# 'Age' only contributes its per-'pid' mean; it is dropped from all other statistics.
# Whenever there is no observation per 'pid' and relevant column,
# we impute the value with the mean of that column over the frame being transformed
# (i.e. the training mean for X_train and the test mean for X_test).
def clf_features(X, features=None):
    """Summarise the time series of every patient ('pid') into one feature row.

    For every column in ``features`` the per-patient mean is computed. For every
    column except 'Age' the following per-patient statistics are added: min,
    max, max - min, first and last non-NaN value, last - first, and the number
    of non-NaN observations (pandas ``count()``). Statistics that are NaN
    because a patient has no observation for a column are replaced by that
    column's mean over the whole of ``X``; the observation counts are never
    imputed.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a 'pid' column and every column listed in ``features``.
        Row order within a patient determines "first" and "last".
    features : sequence of str, optional
        Columns to summarise. When omitted, the module-level
        ``relevant_features`` is looked up at call time (the original
        behaviour). Passing it explicitly makes the result independent of
        later re-assignments of that global. 'Age' does not have to be present.

    Returns
    -------
    numpy.ndarray
        Array with one row per distinct 'pid' (ascending pid order, as produced
        by ``groupby``) and ``len(features) + 7 * k`` columns, where ``k`` is
        the number of non-'Age' features. Column blocks are ordered
        mean, min, max, max-min, first, last, last-first, count.
    """
    if features is None:
        features = relevant_features
    features = list(features)
    # 'Age' only contributes its mean (presumably because it is constant per
    # patient -- TODO confirm); all other statistics skip it.
    varying = [col for col in features if col != 'Age']

    column_means = X[features].mean()
    grouped = X.groupby('pid')

    def _per_pid(how, cols, impute=True):
        # one row per pid, positional index so the blocks line up in concat
        stat = getattr(grouped[cols], how)().reset_index(drop=True)
        if impute:
            stat = stat.fillna({col: column_means[col] for col in cols})
        return stat

    pid_mean = _per_pid('mean', features)
    pid_min = _per_pid('min', varying)
    pid_max = _per_pid('max', varying)
    pid_first = _per_pid('first', varying)
    pid_last = _per_pid('last', varying)
    # number of available observations; a genuine 0 must not be imputed
    pid_count = _per_pid('count', varying, impute=False)

    # differences are taken after imputation, so fully missing columns give 0
    pid_range = pid_max - pid_min
    pid_trend = pid_last - pid_first

    X_final = pd.concat([pid_mean, pid_min, pid_max, pid_range, pid_first,
                         pid_last, pid_trend, pid_count], axis=1).values
    return X_final

# build the classification design matrices (one row per 'pid')
X_train_clf = clf_features(X_train)
X_test_clf = clf_features(X_test)

# define function to create an ANN classifier model
def create_model(input_dim=1, output_dim=1, n1_units=100, n2_units=100, n3_units=100, 
                 activation='relu', optimizer='Adam', visible_drop_rate=0.2, hidden_drop_rate=0.5, 
                 init_mode='glorot_uniform', maxnorm_value=3):
    """Build and compile a three-hidden-layer Keras network with sigmoid heads.

    The input passes through dropout (``visible_drop_rate``) and then through
    three stages of Dense -> BatchNormalization -> Activation -> Dropout
    (``hidden_drop_rate``) with ``n1_units``, ``n2_units`` and ``n3_units``
    units. Every Dense kernel uses ``init_mode`` as initializer and a max-norm
    constraint of ``maxnorm_value``. ``output_dim`` separate one-unit sigmoid
    layers are attached to the last stage. The model is compiled with
    binary cross-entropy loss, the given ``optimizer`` and an AUC metric
    named 'roc_auc', and returned.
    """
    inputs = Input(shape=(input_dim,))
    tensor = Dropout(visible_drop_rate)(inputs)

    # the three hidden stages are identical apart from their width
    for width in (n1_units, n2_units, n3_units):
        tensor = Dense(width, kernel_initializer=init_mode,
                       kernel_constraint=maxnorm(maxnorm_value))(tensor)
        tensor = BatchNormalization()(tensor)
        tensor = Activation(activation)(tensor)
        tensor = Dropout(hidden_drop_rate)(tensor)

    # one independent sigmoid output per target
    heads = [Dense(1, activation='sigmoid', kernel_initializer=init_mode)(tensor)
             for _ in range(output_dim)]

    net = Model(inputs=inputs, outputs=heads)
    net.compile(loss='binary_crossentropy', optimizer=optimizer,
                metrics=[keras.metrics.AUC(name='roc_auc')])
    return net


# create custom sklearn classifier s.t. Keras functional API can be used in Pipeline and GridSearchCV
class CustomClassifier(BaseEstimator, ClassifierMixin):
    """scikit-learn style wrapper around the Keras model built by ``create_model``.

    All constructor arguments are stored unchanged as attributes of the same
    name so the estimator can be cloned and tuned by ``Pipeline`` /
    ``GridSearchCV``. The architecture arguments are forwarded to
    ``create_model``; ``batch_size``, ``epochs`` and ``class_weight`` are the
    defaults forwarded to the Keras ``fit`` call.

    Attributes set by ``fit``:
        output_dim_ : number of target columns (1 for a 1-d target).
        model_      : the compiled Keras model.
        history_    : the object returned by the Keras ``fit`` call.
    """

    def __init__(self, n1_units=100, n2_units=100, n3_units=100, activation='sigmoid', 
                 optimizer='Adam', visible_drop_rate=0.2, hidden_drop_rate=0.5, 
                 init_mode='glorot_uniform', maxnorm_value=3,
                 batch_size=None, epochs=1, class_weight=None):
        self.n1_units = n1_units
        self.n2_units = n2_units
        self.n3_units = n3_units
        self.activation = activation
        self.optimizer = optimizer
        self.visible_drop_rate = visible_drop_rate
        self.hidden_drop_rate = hidden_drop_rate
        self.maxnorm_value = maxnorm_value
        self.init_mode = init_mode
        # model.fit parameters
        self.batch_size = batch_size
        self.epochs = epochs
        self.class_weight = class_weight

    def fit(self, X, y, batch_size=None, epochs=1, verbose=0, 
            validation_data=None, class_weight=None):
        """Create a fresh model and train it on ``X`` / ``y``.

        ``y`` may be 1-d (single target) or 2-d with one column per target;
        each column is passed to its own output of the model.
        ``batch_size``, ``epochs`` and ``class_weight`` given here take
        priority over the constructor values. They count as "not given" when
        left at their defaults (``None``, ``1``, ``None``), so an explicit
        ``epochs=1`` cannot override a different constructor value.

        Returns ``self``.
        """
        # accept any array-like target (ndarray, DataFrame, list, ...)
        y = np.asarray(y)

        # determine output dimension
        self.output_dim_ = 1 if y.ndim == 1 else y.shape[1]

        # reshape target into 2d-array
        y_reshaped = y.reshape(y.shape[0], self.output_dim_)

        # create model
        self.model_ = create_model(input_dim=X.shape[1], output_dim=self.output_dim_, 
                                   n1_units=self.n1_units, n2_units=self.n2_units, 
                                   n3_units=self.n3_units, activation=self.activation, 
                                   optimizer=self.optimizer, visible_drop_rate=self.visible_drop_rate, 
                                   hidden_drop_rate=self.hidden_drop_rate, init_mode=self.init_mode, 
                                   maxnorm_value=self.maxnorm_value)

        # fit parameters entered in fit method
        # have priority over the same parameters entered in __init__
        fit_batch_size = batch_size if batch_size is not None else self.batch_size
        # compare by value: `is not 1` tested object identity, which is
        # implementation-dependent and a SyntaxWarning on Python >= 3.8
        fit_epochs = epochs if epochs != 1 else self.epochs
        fit_class_weight = class_weight if class_weight is not None else self.class_weight

        # fit model and save history in self.history_ attribute
        self.history_ = self.model_.fit(
            X, [y_reshaped[:, i] for i in range(self.output_dim_)], 
            batch_size=fit_batch_size, epochs=fit_epochs, verbose=verbose, 
            validation_data=validation_data, class_weight=fit_class_weight
        )
        # return classifier
        return self

    def predict(self, X):
        """Return the model outputs for ``X``.

        With a single output the Keras prediction is returned as is; with
        several outputs the per-output predictions are concatenated along
        axis 1, giving one column per target.
        """
        if self.output_dim_ == 1:
            predictions = self.model_.predict(X, verbose=0)
        else:
            predictions = np.concatenate(self.model_.predict(X, verbose=0), axis=1)
        return predictions

    # same as predict but predict_proba needs to be defined in order to be able
    # to use scoring='roc_auc' in GridSearchCV
    def predict_proba(self, X):
        """Alias of ``predict``; the outputs are already sigmoid activations."""
        return self.predict(X)


# classification pipeline: standardise the features, then fit the ANN wrapper
steps = [
    ('scaler', StandardScaler()),
    ('ANN', CustomClassifier()),
]
pipeline = Pipeline(steps=steps)

# compute class_weight='balanced' as in sklearn.utils.class_weight.compute_class_weight
def balanced_class_weight(col_name):
    """Return {class value: weight} for the label column ``col_name`` of ``y_train``.

    Each weight is n_samples / (n_classes * class_count), i.e. the 'balanced'
    heuristic: rarer classes receive proportionally larger weights.
    """
    labels = y_train[col_name]
    per_class_count = labels.value_counts()
    weights = len(labels) / (labels.nunique() * per_class_count)
    return dict(weights)

# one class-weight dict per classification target, in model-output order
class_weight = [balanced_class_weight(target) for target in subtask_1+subtask_2]

# hyper-parameters shared by every candidate architecture
shared_grid = dict(ANN__activation=['relu'], ANN__optimizer=['Adam'],
                   ANN__visible_drop_rate=[0.3, 0.4], ANN__hidden_drop_rate=[0.4, 0.5],
                   ANN__init_mode=['uniform'], ANN__maxnorm_value=[3],
                   ANN__epochs=[15], ANN__batch_size=[256], ANN__class_weight=[class_weight])

# (n1_units, n2_units, n3_units) of the candidate architectures
layer_widths = [(600, 3000, 200), (800, 1500, 300), (1500, 432, 432)]

# define parameter choice for param_grid in GridSearchCV: one sub-grid per architecture
param_grid = [dict(shared_grid, ANN__n1_units=[n1], ANN__n2_units=[n2], ANN__n3_units=[n3])
              for n1, n2, n3 in layer_widths]

# define classifier
classifier = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    n_jobs=-1,
    cv=5,
    verbose=50,
    scoring='roc_auc',
)

print("# perform classifier grid search")
start_time = time.time()
classifier.fit(X_train_clf, y_train[subtask_1+subtask_2].values)
run_time = time.time()-start_time
# split the elapsed seconds into hours / minutes / seconds for the report
hours = run_time//3600
minutes = (run_time-hours*3600)//60
seconds = run_time%60
print('# perform classifier grid search: runtime: {a:.0f} h {b:.0f} min {c:.2f} s'.format(a=hours, b=minutes, c=seconds))

print('# make predictions')
df_predictions[subtask_1+subtask_2] = pd.DataFrame(classifier.predict(X_test_clf))

print('# summarize GridSearchCV results')
cv_results = (
    pd.DataFrame(classifier.cv_results_)
    .sort_values(by=['mean_test_score'], ascending=False)
    .loc[:, ['mean_test_score', 'std_test_score', 'params']]
)
# best candidate first: mean score, its standard deviation and the parameters
for mean, stdev, param in zip(cv_results['mean_test_score'],
                              cv_results['std_test_score'],
                              cv_results['params']):
    print("{:f} ({:f}) with: {}".format(mean, stdev, param))
    print()

# perform classifier grid search
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   55.4s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   55.6s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: D



[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done  19 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done  22 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done  23 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done  27 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done  31 tasks      | elapsed: 12.6min
[Paralle

In [None]:
#### REGRESSION: SUBTASK 3

# prepare features: a fixed set of columns plus the subtask-3 target names
# with their 'LABEL_' prefix stripped (RRate, ABPm, SpO2, Heartrate).
# NOTE: this rebinds the module-level name `relevant_features` that the
# classification cell defined earlier. Anything that reads this global at call
# time (e.g. re-running the classification feature cell after this one) will
# see this shorter list from here on.
relevant_features = ['Age', 'Temp', 'ABPd', 'ABPs', 'pH', 'Glucose', 'Hgb']+[col[len('LABEL_'):] for col in subtask_3]

# same feature engineering as for classification above
def regr_features(X, features=None):
    """Summarise the time series of every patient ('pid') into one feature row.

    For every column in ``features`` the per-patient mean is computed. For every
    column except 'Age' the per-patient min, max, max - min, first and last
    non-NaN value, last - first, and the number of non-NaN observations
    (pandas ``count()``) are added. Statistics that are NaN because a patient
    has no observation for a column are replaced by that column's mean over
    the whole of ``X``; the observation counts are never imputed.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a 'pid' column and every column listed in ``features``.
        Row order within a patient determines "first" and "last".
    features : sequence of str, optional
        Columns to summarise. When omitted, the module-level
        ``relevant_features`` is looked up at call time (the original
        behaviour). 'Age' does not have to be present.

    Returns
    -------
    numpy.ndarray
        One row per distinct 'pid' (ascending pid order, as produced by
        ``groupby``) and ``len(features) + 7 * k`` columns, ``k`` being the
        number of non-'Age' features. Column blocks are ordered
        mean, min, max, max-min, first, last, last-first, count.
    """
    if features is None:
        features = relevant_features
    features = list(features)
    # 'Age' is only represented by its mean; every other statistic skips it
    dynamic = [col for col in features if col != 'Age']

    overall_mean = X[features].mean()
    by_pid = X.groupby('pid')

    def _stat(how, cols, impute=True):
        # one row per pid with a positional index so the blocks align in concat
        table = getattr(by_pid[cols], how)().reset_index(drop=True)
        if impute:
            table = table.fillna({col: overall_mean[col] for col in cols})
        return table

    blocks = {how: _stat(how, dynamic) for how in ('min', 'max', 'first', 'last')}
    pid_mean = _stat('mean', features)
    # number of available observations; a genuine 0 must not be imputed
    pid_count = _stat('count', dynamic, impute=False)

    # differences are taken after imputation, so fully missing columns give 0
    X_regr = pd.concat([pid_mean, blocks['min'], blocks['max'],
                        blocks['max'] - blocks['min'],
                        blocks['first'], blocks['last'],
                        blocks['last'] - blocks['first'],
                        pid_count], axis=1).values
    return X_regr

# build the regression design matrices (one row per 'pid')
X_train_regr = regr_features(X_train)
X_test_regr = regr_features(X_test)

# create regressor pipeline; the 'regr' step is a placeholder that the grid replaces
steps = [
    ('scaler', StandardScaler()),
    ('regr', Ridge()),
]
pipeline = Pipeline(steps=steps)

# define parameter choice
# candidates for Ridge: regularisation strengths 10^-1 ... 10^3
alpha = [10**i for i in range(-1, 4)]
# candidates for RandomForestRegressor
max_features = ['sqrt']
n_estimators = [300, 400, 600, 700]

param_grid = [
    {'regr': [Ridge()], 'regr__alpha': alpha},
    {'regr': [RandomForestRegressor()],
     'regr__max_features': max_features,
     'regr__n_estimators': n_estimators},
]

# create regressor
regressor = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    n_jobs=-1,
    cv=5,
    verbose=50,
    scoring='r2',
)

print('# perform regression')
start_time = time.time()
best_scores = []
# the same grid search is run independently for every regression target
for col_name in subtask_3:
    regressor.fit(X_train_regr, y_train[col_name].values)
    best_scores.append(regressor.best_score_)
    df_predictions[col_name] = regressor.predict(X_test_regr)
    print('########### {}: summarize GridSearchCV results'.format(col_name))
    cv_results = (
        pd.DataFrame(regressor.cv_results_)
        .sort_values(by=['mean_test_score'], ascending=False)
        .loc[:, ['mean_test_score', 'std_test_score', 'params']]
    )
    for mean, stdev, param in zip(cv_results['mean_test_score'],
                                  cv_results['std_test_score'],
                                  cv_results['params']):
        print("{:f} ({:f}) with:".format(mean, stdev))
        print(param)
        print()

print('########### Mean r2-score: {}'.format(np.mean(best_scores)))
run_time = time.time()-start_time
# split the elapsed seconds into hours / minutes / seconds for the report
hours = run_time//3600
minutes = (run_time-hours*3600)//60
seconds = run_time%60
print('# perform regression: runtime: {a:.0f} h {b:.0f} min {c:.2f} s'.format(a=hours, b=minutes, c=seconds))

# perform regression
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1528s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    0.8s
[Parallel(n_jobs



[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  31 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  35 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done  36 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done  39 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done  43 out of  45 | elapsed:  8.3min remaining:   23.1s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  9.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  45 



[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  31 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  35 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  36 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done  39 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done  43 out of  45 | elapsed:  7.8min remaining:   21.9s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  8.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  45 



[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  31 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done  35 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done  36 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done  39 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done  43 out of  45 | elapsed:  8.5min remaining:   23.7s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  9.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  9.7min finished
########### LABEL_SpO2:



[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.9s
[Paralle

In [None]:
# df_predictions holds the final result; write it as a zipped CSV with the
# columns in submission order ('pid', then subtasks 1, 2 and 3)
filename = 'Submission_{}.zip'.format(main_filename)
submission_columns = ['pid']+subtask_1+subtask_2+subtask_3
df_predictions[submission_columns].to_csv(
    filename,
    index=False,
    float_format='%.5f',
    compression='zip',
)