In [1]:
import pandas as pd
import sklearn
import numpy as np
from sklearn import *
from sklearn.svm import SVC
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict 
from sklearn.linear_model import LinearRegression

In [2]:
def preprocess(data, number_of_hours=12):
    """Collapse the hourly records of each patient into one row of summary statistics.

    The input is read as consecutive blocks of ``number_of_hours`` rows, one
    block per patient. Column 0 is taken as the patient id, column 2 as the
    age, and every column from index 3 onwards as a measured feature
    (column 1, the time stamp, is not used).

    Parameters
    ----------
    data : pandas.DataFrame or array-like
        Numeric table with ``n_patients * number_of_hours`` rows and
        ``3 + n_features`` columns.
    number_of_hours : int, default 12
        Number of consecutive rows that belong to one patient.

    Returns
    -------
    numpy.ndarray of shape (n_patients, 2 + 5 * n_features)
        Columns are: pid, age, then five blocks of ``n_features`` columns
        each holding the per-patient mean, variance, max, min and median
        (NaN measurements are ignored when computing them).

    Raises
    ------
    ValueError
        If ``number_of_hours`` is not positive, or if the number of rows is
        zero or not a multiple of ``number_of_hours`` (the rows could then
        not be grouped into whole patients).

    Notes
    -----
    A patient with no valid measurement for a feature produces NaN statistics
    (NumPy emits ``RuntimeWarning`` for these all-NaN slices). Such entries
    are replaced by the mean of that statistic over the patients in ``data``.
    If no patient at all has a value for a feature, the column is filled
    with 0.0 so that the result never contains NaN.
    The imputation only uses the rows passed in ``data``.
    """
    if number_of_hours <= 0:
        raise ValueError("number_of_hours must be a positive integer")

    values = np.asarray(data, dtype=float)
    n_rows, n_columns = values.shape
    if n_rows == 0 or n_rows % number_of_hours != 0:
        # Without this check a truncated division would silently merge or
        # misalign the rows of different patients.
        raise ValueError(
            "number of rows (%d) must be a non-zero multiple of number_of_hours (%d)"
            % (n_rows, number_of_hours)
        )
    number_of_patients = n_rows // number_of_hours

    # One slab of `number_of_hours` rows per patient: (patient, hour, column).
    patients_array = values.reshape(number_of_patients, number_of_hours, n_columns)
    measurements = patients_array[:, :, 3:]  # drop pid, time and age

    # Feature matrix layout: pid, age, mean, var, max, min, median.
    statistics = (np.nanmean, np.nanvar, np.nanmax, np.nanmin, np.nanmedian)
    features = np.hstack(
        [patients_array[:, 0, [0, 2]]]
        + [statistic(measurements, axis=1) for statistic in statistics]
    )

    # Replace the remaining NaN values by the mean of their column over all patients.
    all_means = np.nanmean(features, axis=0)
    # A column that is NaN for every patient has no mean; fall back to 0.0.
    all_means = np.where(np.isnan(all_means), 0.0, all_means)

    nan_rows, nan_cols = np.nonzero(np.isnan(features))
    features[nan_rows, nan_cols] = all_means[nan_cols]

    return features

In [3]:
# Read the three input tables: training X, training Y and testing X (in that order).
train_features, train_labels, test_features = (
    pd.read_csv(csv_name)
    for csv_name in ("train_features.csv", "train_labels.csv", "test_features.csv")
)

In [4]:
# Turn the raw train and test tables into one row per patient: per-feature
# summary statistics (mean, var, max, ...) with NaN values filled by column means.
train_X, test_X = (preprocess(raw_table) for raw_table in (train_features, test_features))

# Exporting train_X into csv file for better observing
#df = pd.DataFrame(train_X)
#df.to_csv('train_X.csv', index=False, float_format='%.3f')

  means = np.nanmean(patients_array[:, :, 3:], axis=1) # rest of the columns are means, var... of all values
  variances = np.nanvar(patients_array[:, :, 3:], axis=1);
  maxs = np.nanmax(patients_array[:, :, 3:], axis=1);
  mins = np.nanmin(patients_array[:, :, 3:], axis=1);
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,


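We can ignore these warnings. They report that, while computing the mean/variance/etc. of the values, NumPy encountered columns that consist only of NaNs and therefore returned NaN for them. These NaN values are dealt with afterwards, inside `preprocess`, by replacing them with the column mean over all patients.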

### Parts 1 & 2 (They're basically the same)

In [5]:
# Labels for subtasks 1 & 2: every column from "pid" up to and including
# "LABEL_Sepsis" (column 0 is therefore the pid, not a label).
# NOTE(review): `train_Y` is reassigned in the Part 3 section below; re-run this
# cell before refitting the classifiers if Part 3 has already been executed.
train_Y = np.array(train_labels.loc[: , "pid":"LABEL_Sepsis"])

In [9]:
# Classifier used for every label of Parts 1 & 2; the fixed random_state keeps runs reproducible.
model = RandomForestClassifier(random_state = 42)

In [None]:
no_columns = 12
predictions_part_1_2 = np.zeros((test_X.shape[0], no_columns))

# Column 0 is not filled here, so the label columns start at index 1.
# The same model object is refitted on each label column in turn.
for label_idx in range(1, no_columns):
    model.fit(train_X, train_Y[:, label_idx])
    # Keep the second column of predict_proba (presumably the positive class).
    predictions_part_1_2[:, label_idx] = model.predict_proba(test_X)[:, 1]

In [None]:
# The first column of the prediction matrix carries the patient ids,
# copied from the first column of the preprocessed test features.
predictions_part_1_2[:, 0] = test_X[:, 0]

### Part 3

In [35]:
# Labels for subtask 3: the columns from "LABEL_RRate" up to and including
# "LABEL_Heartrate".
# NOTE(review): this overwrites the `train_Y` used for Parts 1 & 2 above.
train_Y = np.array(train_labels.loc[: , "LABEL_RRate":"LABEL_Heartrate"])

In [2]:
# Regressor used for every target of Part 3.
# Alternative kept for reference: regressor = SGDRegressor(max_iter = 1000)
regressor = LinearRegression()

NameError: name 'LinearRegression' is not defined

In [24]:
no_columns = 4
predictions_part_3 = np.zeros((test_X.shape[0], no_columns))

# Fit the regressor on one target column at a time and store its test predictions.
for target_idx in range(no_columns):
    regressor.fit(train_X, train_Y[:, target_idx])
    predictions_part_3[:, target_idx] = regressor.predict(test_X)

### Write into csv

In [31]:
# Put the Part 1 & 2 columns and the Part 3 columns side by side.
output = np.hstack((predictions_part_1_2, predictions_part_3))

In [35]:
# Header of the submission file: pid, the Part 1 & 2 labels, then the Part 3 labels.
prediction_columns = [
    'pid',
    'LABEL_BaseExcess',
    'LABEL_Fibrinogen',
    'LABEL_AST',
    'LABEL_Alkalinephos',
    'LABEL_Bilirubin_total',
    'LABEL_Lactate',
    'LABEL_TroponinI',
    'LABEL_SaO2',
    'LABEL_Bilirubin_direct',
    'LABEL_EtCO2',
    'LABEL_Sepsis',
    'LABEL_RRate',
    'LABEL_ABPm',
    'LABEL_SpO2',
    'LABEL_Heartrate',
]
df = pd.DataFrame(output, columns=prediction_columns)

# Write all values with three decimals and without the index column.
df.to_csv('prediction.csv', index=False, float_format='%.3f')