In [20]:
import pandas as pd
import sklearn
import numpy as np
from sklearn import *
from sklearn.preprocessing import Imputer
from sklearn.svm import SVC
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import roc_curve as rocc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict as CVP
from sklearn.linear_model import LinearRegression

In [2]:
# Loading the files
train_features = pd.read_csv("train_features.csv") # training X
train_labels = pd.read_csv("train_labels.csv") # training Y
test_features = pd.read_csv("test_features.csv") # testing X

In [3]:
def preprocess(data):
    # Transforming train features into an array where each element is one patient
    featureaary = np.array(data)
    number_of_hours = 12
    patients = np.vsplit(featureaary , int(data.shape[0]/number_of_hours))
    patients_array = np.array(patients)
    
    # We are creating a new matrix (Phi) that contains statistical features for every patient: pid, age, mean, var, max, min,median
    features = np.zeros([int(data.shape[0]/number_of_hours), 2+5*34]) # no of patients, no of features* no of columns names(age, calcium, bolirubin...)
    # columns are: pid, age, mean(of all featuers), var, max, min, median
    
    features[:,0] = patients_array[:,0,0] #Patient id
    features[:,1] = patients_array[:,0,2] # Age

    means = np.nanmean(patients_array[:, :, 3:], axis=1)
    statistical_features = int(data.shape[1])-3  # Number of columns - pid - age - time
    
    features[:,2:statistical_features+2] = means
    
    variances = np.nanvar(patients_array[:, :, 3:], axis=1);
    features[:,statistical_features+2:2*statistical_features+2] = variances
    
    maxs = np.nanmax(patients_array[:, :, 3:], axis=1);
    features[:,2*statistical_features+2:3*statistical_features+2] = maxs
    
    mins = np.nanmin(patients_array[:, :, 3:], axis=1);
    features[:,3*statistical_features+2:4*statistical_features+2] = mins
    
    medians = np.nanmedian(patients_array[:, :, 3:], axis=1);
    features[:,4*statistical_features+2:5*statistical_features+2] = medians
    
    #Replacing the NaN values with the mean of all patients
    # Calculate the mean of each features of all patients
    all_means = np.nanmean(features, axis=0)
    
    # find indexes where NaNs are and replace them by the column mean
    indexes = np.where(np.isnan(features))
    features[indexes] = np.take(all_means, indexes[1])
        
    return features

In [4]:
train_X = preprocess(train_features)

  from ipykernel import kernelapp as app
  r = func(a, **kwargs)


We can ignore these warnings as the result still make sense. We get NaN for columns that contist only of NaN's.

now, we have a nice $\Phi$ matrix, we need to take care of the remaining NaN's

In [5]:
test_X = preprocess(test_features)

  from ipykernel import kernelapp as app


Looks good, we finally have usable features to train the model!

### Part 1 & 2 (They're basically the same)

In [6]:
# Getting the data we need for subtask 1 & 2
train_Y = np.array(train_labels.loc[: , "pid":"LABEL_Sepsis"])

In [7]:
model = RFC(random_state = 42)

In [8]:
predictions_part_1 = np.zeros([12664,12])
for i in range(1, 12) :
    model.fit(train_X, train_Y[:, i])
    output_proba = model.predict_proba(test_X)
    predictions_part_1[:, i] = output_proba[:,1]



In [9]:
predictions_part_1[:, 0] = test_X[:, 0]
predictions_part_1

array([[0.0000e+00, 6.0000e-01, 7.0000e-01, ..., 1.0000e-01, 6.0000e-01,
        1.0000e-01],
       [1.0001e+04, 5.0000e-01, 5.0000e-01, ..., 1.0000e-01, 5.0000e-01,
        0.0000e+00],
       [1.0003e+04, 1.0000e-01, 4.0000e-01, ..., 2.0000e-01, 4.0000e-01,
        0.0000e+00],
       ...,
       [9.9920e+03, 6.0000e-01, 2.0000e-01, ..., 1.0000e-01, 5.0000e-01,
        0.0000e+00],
       [9.9940e+03, 8.0000e-01, 7.0000e-01, ..., 3.0000e-01, 5.0000e-01,
        3.0000e-01],
       [9.9970e+03, 8.0000e-01, 2.0000e-01, ..., 1.0000e-01, 5.0000e-01,
        0.0000e+00]])

PART 1 & 2 FINISHED YAY 

### Part 3

In [22]:
#regressor = SGDRegressor(max_iter = 1000)
regressor = LinearRegression()

In [23]:
train_Y = np.array(train_labels.loc[: , "LABEL_RRate":"LABEL_Heartrate"])

In [24]:
predictions_part_3 = np.zeros([12664,4])
for i in range(4) :
    regressor.fit(train_X, train_Y[:, i])
    output = regressor.predict(test_X)
    predictions_part_3[:, i] = output

In [25]:
predictions_part_3

array([[ 12.66736603,  83.67812794,  99.61592843,  75.09890837],
       [ 18.05277743,  89.60932034,  94.65189006, 102.88920332],
       [ 18.42808751,  79.13767478,  97.54655417,  89.90192853],
       ...,
       [ 18.95520558,  68.61558042,  97.08469495,  83.00656884],
       [ 14.91720718,  85.45004304,  98.41375264,  97.11100004],
       [ 18.09709982,  79.0646861 ,  98.42689539,  85.60912834]])

### Write into csv

In [31]:
yay = np.concatenate((predictions_part_1,predictions_part_3), axis = 1)

In [32]:
yay.shape

(12664, 16)

In [35]:
df = pd.DataFrame(yay, columns = ['pid','LABEL_BaseExcess','LABEL_Fibrinogen','LABEL_AST','LABEL_Alkalinephos','LABEL_Bilirubin_total','LABEL_Lactate','LABEL_TroponinI','LABEL_SaO2','LABEL_Bilirubin_direct','LABEL_EtCO2','LABEL_Sepsis','LABEL_RRate','LABEL_ABPm','LABEL_SpO2','LABEL_Heartrate'])

df.to_csv('prediction.csv', index=False, float_format='%.3f')