In [167]:
import numpy as np
import pandas as pd
from sklearn import svm, linear_model
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error as rmse
import time
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn import feature_selection
from joblib import dump, load

pid = pd.read_csv('test_features.csv').iloc[:,0:1] # extract pid for submission
pid = pid.values.tolist()
pid = np.array(pid[0::12])
header = pd.read_csv('sample.csv', header = None).loc[0,:] # extract test names for submission
header = list(header)

train_features = pd.read_csv('train_features.csv').iloc[:, 1:] # skip pids
test_features = pd.read_csv('test_features.csv').iloc[:, 1:] # skip pids

train_labels_1 = pd.read_csv('train_labels.csv').loc[:,'LABEL_BaseExcess':'LABEL_Sepsis']
train_labels_3 = pd.read_csv('train_labels.csv').loc[:,'LABEL_RRate':'LABEL_Heartrate']

#subtak 1
Svm = False
Mlp = False
GradientBoostingClassification = True

#subtask 3
normalridge = False
kernelridge = False #(peggiore) lol
GradientBoostingRegressor = True

## PRE PROCESSING

### Imputation

In [168]:
def clean_up(X_nan):
    i = 0
    for col in X_nan.transpose():
        i += 1
        num_non_nan = np.count_nonzero(~np.isnan(col))
        if num_non_nan/col.size < 0.99:
            print('colonna ', i, 'ha meno dell 99% di dati significativi')
            print('Dimensioni della matrice:', X_nan.shape)
            X_nan = np.delete(X_nan, i, 1)
    return X_nan

In [169]:
def sk_imputation(X_nan, method):
    num_pazienti = int(X_nan.shape[0]/12)
    X = np.empty((num_pazienti, X_nan.shape[1]))

    for rows in np.arange(0,X_nan.shape[0],12):
        X[int(rows/12),:] = np.nanmean(X_nan[rows:rows+11, :], axis=0)
    imputer = SimpleImputer(missing_values=np.nan, strategy=method, fill_value=0)
    X = imputer.fit_transform(X)
    return(X)

In [170]:
def sk_imputation_iterative(X_nan):
    num_pazienti = int(X_nan.shape[0]/12)
    X = np.empty((num_pazienti, X_nan.shape[1]*2))
    for rows in np.arange(0,X_nan.shape[0],12):
        X[int(rows/12),:] = np.array(list(zip(np.nanmean(X_nan[int(rows):int(rows+11), :],axis=0),np.nanvar(X_nan[int(rows):int(rows+11), :],axis=0)))).flatten()
        #X[int(rows/12),:] = np.nanmean(X_nan[rows:rows+11, :], axis=0)
    imp_mean = IterativeImputer(random_state=0, max_iter = 100, initial_strategy='most_frequent', verbose = 2)
    imp_mean.fit(X)
    X = imp_mean.transform(X)
    return(X)

### Validation split

In [171]:
def validation(X_train, y_label, do_validation = False):
    # function to split X_train and y_label for validation and hyperparameter tuning
    if do_validation:
        nvalid = 0.2 # 0 means no validation set (ratio!), 0.2 = 20% on validation and 80% on training
        x_train, x_valid, y_train, y_valid = train_test_split(X_train, y_label, test_size=nvalid, random_state=12345)
    else:
        x_train = X_train
        y_train = y_label

    return(x_train, y_train)

### Data normalization

In [172]:
X_train = np.array(train_features)
X_test = np.array(test_features)

Simple = True
if Simple:
    X_train = sk_imputation(X_train, 'constant')
    X_test = sk_imputation(X_test, 'constant')
else:
    X_train = sk_imputation_iterative(X_train)
    X_test = sk_imputation_iterative(X_test)

NormalizeData = True
if NormalizeData :
    scaler = StandardScaler()
    #scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)

#print(X_train)

  X[int(rows/12),:] = np.nanmean(X_nan[rows:rows+11, :], axis=0)
  X[int(rows/12),:] = np.nanmean(X_nan[rows:rows+11, :], axis=0)


## SUB-TASK 1 SVM

In [173]:
# cell to visualize correletion, uncomment below
x_train, y_train = X_train, np.array(train_labels_1)
tot_medical_test = np.array(train_labels_1).shape[1]
print("tot_medical_test",tot_medical_test)
df = pd.DataFrame(data=np.append(x_train,np.asmatrix(y_train),axis = 1))
corrMatrix = df.corr()
#plt.figure(figsize=(35, 35))
#sn.heatmap(corrMatrix, annot=True)

tot_medical_test 11


In [174]:
def get_valid_indices(original_mat, new_mat):
    valid_indices=[]
    array_to_index = new_mat[0,:]
    array_to_parse = original_mat[0,:]
    for elem in array_to_index:
        var = np.where(array_to_parse==elem)
        valid_indices.append(var[0][0])
    #print(valid_indices)
    return valid_indices

In [175]:
# feature selection with various statistics...
x_train, y_train, y_train_task3 = X_train, np.array(train_labels_1), np.array(train_labels_3)
#print(x_train.shape)
#correlations = feature_selection.chi2(x_train, y_train)
valid_indices = []
for y_considered in y_train.T:
#y_considered = y_train[:,0]
    x_selected_f_reg = feature_selection.SelectKBest(score_func=feature_selection.f_regression, k=20).fit_transform(x_train,y_considered)

    x_selected_anova =feature_selection.SelectKBest(score_func=feature_selection.f_classif, k=29).fit_transform(x_train,y_considered)

    x_selected_false_pos = feature_selection.SelectFpr(feature_selection.f_regression, alpha=0.001).fit_transform(x_train, y_considered)

    indices = get_valid_indices(x_train,x_selected_anova) # current indices of selected feature
    valid_indices.append(indices)

valid_indices_task3 = []
for y_considered in y_train_task3.T:

    x_selected_f_reg = feature_selection.SelectKBest(score_func=feature_selection.f_regression, k=20).fit_transform(x_train,y_considered)

    x_selected_anova =feature_selection.SelectKBest(score_func=feature_selection.f_classif, k=29).fit_transform(x_train,y_considered)

    x_selected_false_pos = feature_selection.SelectFpr(feature_selection.f_regression, alpha=0.001).fit_transform(x_train, y_considered)

    indices = get_valid_indices(x_train,x_selected_anova) # current indices of selected feature
    valid_indices_task3.append(indices)


# Sub Task 1

In [176]:
if Svm:
    clf_1 = {}
    #output_1 = np.empty(tot_medical_test)
    clf_1[0] = svm.SVC(probability=True)
    clf_1[0].fit(x_train, y_train[:,0])
    res = clf_1[0].predict_proba(X_test)
    print(res.shape)
    output_1 = np.asmatrix(res[:,1]).transpose()
    print('output_1 dimentions: ', output_1.shape)
    for medical_test in range(1,tot_medical_test):
        print('medical test: ', medical_test)
        start = time.time()
        clf_1[medical_test] = svm.SVC(probability=True)
        clf_1[medical_test].fit(x_train, y_train[:,medical_test])
        res = clf_1[medical_test].predict_proba(X_test)
        print(res.shape)
        print('output_1 dimentions: ', output_1.shape)
        output_1 = np.append(output_1, np.asmatrix(res[:,1]).transpose(), axis = 1)
        print(time.time()-start)

In [177]:
def mlp_classification(X_train, X_test, y_train):
    clf = MLPClassifier(random_state=1, max_iter=1000).fit(X_train, y_train)
    outupt = clf.predict_proba(X_test)
    scores = clf.score(X_train,y_train)
    return outupt
if Mlp:
    output_1 = []
    for medical_test in range(tot_medical_test):
        print('--- test numero: ', medical_test, ' ---')
        start = time.time()
        output = mlp_classification(X_train,X_test,y_train[:, medical_test])
        output_1.append(output[:,1])
        print(f"output current step {output} with dimension {np.shape(output_1)}")
        print("time for a fitting", time.time()-start)
        output_1 = np.array(output_1).transpose()

In [178]:
if GradientBoostingClassification:

    score_vec = []
    # first prediction
    print("\n start with number : ",1)
    clf = HistGradientBoostingClassifier(loss = 'binary_crossentropy', max_iter=10**10, l2_regularization=1, verbose=0, warm_start=True, tol=10**-10, learning_rate=0.001).fit(X_train[:,valid_indices[0]], y_train[:,0])
    dump(clf, 'classific_model_0.joblib')
    print("\n finished with number : ",1)
    res = clf.predict_proba(X_test[:,valid_indices[0]])
    scores = clf.score(X_train[:,valid_indices[0]], y_train[:,0])
    score_vec.append(scores)
    output_1 = np.asmatrix(res[:,1]).transpose()

    for medical_test in range(1,tot_medical_test):

        modelname = "classific_model_" + str(medical_test) + ".joblib"
        print("\n start with model: ", modelname)

        clf = HistGradientBoostingClassifier(loss = 'binary_crossentropy', max_iter=10**10, l2_regularization=1, verbose=0, warm_start=True, tol=10**-10, learning_rate=0.001).fit(X_train[:,valid_indices[medical_test]], y_train[:,medical_test])

        dump(clf, modelname)

        res = clf.predict_proba(X_test[:,valid_indices[medical_test]])
        scores = clf.score(X_train[:,valid_indices[medical_test]], y_train[:,medical_test])

        score_vec.append(scores)
        output_1 = np.append(output_1, np.asmatrix(res[:,1]).transpose(), axis = 1)

        print("\n finished with number : ", medical_test)

        #print('- output_1 dimentions: ', np.shape(output_1))
        #print('processing time:    ',time.time()-start)
    output_1 = np.array(output_1)
    #print('- output_1 dimentions: ', output_1.shape)
    print('scores for training : ', score_vec)




 start with number :  1

 finished with number :  1

 start with model:  classific_model_1.joblib

 finished with number :  1

 start with model:  classific_model_2.joblib

 finished with number :  2

 start with model:  classific_model_3.joblib

 finished with number :  3

 start with model:  classific_model_4.joblib

 finished with number :  4

 start with model:  classific_model_5.joblib

 finished with number :  5

 start with model:  classific_model_6.joblib

 finished with number :  6

 start with model:  classific_model_7.joblib

 finished with number :  7

 start with model:  classific_model_8.joblib

 finished with number :  8

 start with model:  classific_model_9.joblib

 finished with number :  9

 start with model:  classific_model_10.joblib

 finished with number :  10
scores for training :  [0.8892866543827322, 0.943564095814688, 0.8101605685706765, 0.8116346406949198, 0.8025796262174256, 0.8468017899447223, 0.9347196630692287, 0.8503290339563043, 0.9694130034219531, 0.

## SUB-TASK 3

In [179]:
if normalridge:
    vital_signs = 4
    y_3 = np.array(train_labels_3)
    pred_label_vec = []
    for sign in range(vital_signs):
        lambda_vec = np.array([0.1, 1, 10, 100, 200, 500, 1000])  #### lambda vector
        number_folds = 10  #### Cross-validation folds

        # 1. Iteratively split the date in K = 10 folds
        # K-fold cross-validation through sklearn
        kfolds_class = KFold(number_folds, shuffle=True)
        RMSE_4_lambda = []

        # 2. Ridge regression with iteratively different lambdas
        for lbd in lambda_vec:
            RMSE_list = []
            regression_class = linear_model.Ridge(alpha=lbd, solver='svd')
            for train_index, test_index in kfolds_class.split(X_train):
                x_train, x_validation = X_train[train_index], X_train[test_index]
                y_tr, y_val = y_3[train_index, sign], y_3[test_index, sign]

                regression_class.fit(x_train, y_tr)
                pred_label = regression_class.predict(x_validation)

                RMSE_list.append(rmse(pred_label, y_val)**0.5)

            RMSE_list = np.array(RMSE_list)
            RMSE_4_lambda.append(np.average(RMSE_list))

        RMSE_4_lambda = np.array(RMSE_4_lambda)

        best_idx = np.argmin(RMSE_4_lambda)
        best_lambda = lambda_vec[best_idx]

        regression_class = linear_model.Ridge(alpha=best_lambda, solver='svd')
        regression_class.fit(X_train, y_3[:,sign])
        pred_label = regression_class.predict(X_test)
        pred_label_vec.append(pred_label)

    output_3 = np.array(pred_label_vec).transpose()
    print("all rmse", RMSE_4_lambda)
    print("best lambda", best_lambda)



In [180]:
if kernelridge:
    vital_signs = 4
    y_3 = np.array(train_labels_3)
    pred_label_vec = []
    for sign in range(vital_signs):
        print('--- vital sign', sign+1, 'out of', vital_signs)
        krr = KernelRidge(alpha=1.0, kernel = 'polynomial')
        krr.fit(X_train, y_3[:,sign])
        pred_label = regression_class.predict(X_test)
        pred_label_vec.append(pred_label)

    pred_label_vec = np.array(pred_label_vec)
    output_3 = np.array(pred_label_vec).transpose()
    print(output_3.shape)

In [190]:
if GradientBoostingRegressor:
    vital_signs = 4
    y_3 = np.array(train_labels_3)
    output_3 = []
    scores_vec = []
    #for sign in range(vital_signs):
    for sign in range(0,vital_signs):
    # Loop for training all or specific model
        modelname = "regression_model_" + str(sign) + ".joblib"
        print("\n start with model : ", modelname)

        est = HistGradientBoostingRegressor(max_iter=10**10, l2_regularization=1, loss="poisson", warm_start=True, verbose=0, tol=10**-10, learning_rate=0.001).fit(X_train[:,valid_indices_task3[sign]], y_3[:,sign])

        dump(est, modelname)
        print("\n finished training with number : ", sign)

    for sign in range(vital_signs):
    # Load model and perform inference
        est = load('regression_model_' + str(sign) + '.joblib')
        output = est.predict(X_test[:,valid_indices_task3[sign]])
        output_3.append(output)
        scores = est.score(X_train[:,valid_indices_task3[sign]], y_3[:,sign])
        scores_vec.append(scores)
        print("\n finished prediction : ", sign)

    output_3 = np.array(output_3).transpose()
    print("\n scores for each sign", scores_vec)

    # [0.47423892473321194, 0.6447733667120648, 0.3979460554973725, 0.643170029603557]
    #  [0.45289998264792863, 0.6474615590943178, 0.38271032990489495, 0.6514824627486163]


 start with model :  regression_model_0.joblib

 finished training with number :  0

 finished prediction :  0

 finished prediction :  1

 finished prediction :  2

 finished prediction :  3

 scores for each sign [0.4657236733007133, 0.6474615590943178, 0.4159599334998936, 0.6514824627486163]


## Submission

In [192]:
output_tot = pid
#print(header.shape)
output_tot = np.append(output_tot, output_1, axis = 1)
output_tot = np.append(output_tot, output_3, axis = 1)
#output_tot = np.concatenate((header, output_tot), axis=0)
df = pd.DataFrame(output_tot)
df.to_csv('new_submission.zip', index=False, float_format='%.6f', compression='zip',  header=header)