# Task 2: Medical dataset

In [437]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression, Ridge, HuberRegressor, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.kernel_approximation import RBFSampler, Nystroem
from sklearn.pipeline import Pipeline
import sklearn.metrics as metrics

import utils
import time

pd.options.display.max_columns = 40

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


---
## 0. Preliminary Analysis

In [205]:
# Read the three raw CSV tables (train features, train labels, test features)
# in one pass; the order of the paths matches the order of the targets.
feat_train, labl_train, feat_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./train_features.csv",
        "./train_labels.csv",
        "./test_features.csv",
    )
)

In [97]:
# Compare the predictions stored in recall_train.zip with the training labels.
# NOTE(review): utils.get_score lives in the project-local `utils` module; its
# exact metric is not visible from this notebook - confirm there.
utils.get_score("./train_labels.csv", "./recall_train.zip")

[0.880140104158019, 0.7682702066333779, 0.7163509452862231, 0.7200754850470517, 0.7163416600616637, 0.7564771589791436, 0.7676717431220972, 0.8042015344518387, 0.7284277753577496, 0.9077601581932911]
[0.6519184425430589, 0.7291738267644755, 0.5417486006038639, 0.7896502307334095]
0.7765716771290455 0.6566563580952569 0.6781227751612019


0.7037836034618348

In [206]:
# Apply the same flattening helper to the train and test feature tables.
feat_train_reformed, feat_test_reformed = map(
    utils.patient_feat_flatten, (feat_train, feat_test)
)

In [None]:
# Fill gaps in the flattened training features with utils.fill_interp.
# NOTE(review): the helper is project-local; presumably it interpolates along
# the time axis - confirm in utils.
feat_train_filled = utils.fill_interp(feat_train_reformed)
# Alternative: reload a previously cached result instead of recomputing it.
# feat_train_filled = pd.read_csv("./intermediate.zip", header=None, names=feat_train_reformed.columns)

In [237]:
# Cache the filled features as a zipped CSV. The header row is omitted; the
# index is still written because `index` is left at its default.
feat_train_filled.to_csv("./intermediate.zip", header=False, compression="zip")

In [238]:
# Replace the remaining NaNs with zeros in a new frame; feat_train_filled
# itself is left untouched.
feat_train_fill0 = feat_train_filled.fillna(value=0.0)

---
## 1. Training

In [340]:
# Reload the cached filled features; the file has no header row, so the column
# labels are taken from the flattened training frame.
feat_train_filled = pd.read_csv(
    "./intermediate.zip",
    header=None,
    names=feat_train_reformed.columns,
)
# Divide each level-0 column group by the median of its non-NaN entries.
feat_train_filled = feat_train_filled.groupby(level=0, axis=1).apply(
    lambda group: group / np.nanmedian(group.to_numpy().ravel())
)
# Zero-fill whatever is still missing after scaling.
feat_train_fill0 = feat_train_filled.fillna(value=0.0)

In [341]:
# Training design matrix: the zero-filled features with a trailing column of
# ones (one row per row of feat_train_reformed).
feat_train_mat = np.hstack(
    (feat_train_fill0.to_numpy(), np.ones((len(feat_train_reformed), 1)))
)

In [424]:
# Fit the random-feature approximation of an RBF kernel on the training
# matrix, then map that same matrix into the 10000-dimensional feature space.
# random_state is fixed so the sampled features are reproducible.
rbf_kernel = RBFSampler(n_components=10000, gamma=2, random_state=1).fit(feat_train_mat)
X_rbf_feat = rbf_kernel.transform(feat_train_mat)

In [425]:
# Display the RBF feature matrix computed from the training data.
X_rbf_feat

array([[ 0.00124752, -0.00560517, -0.01202498, ..., -0.00682438,
         0.00972497, -0.00853319],
       [-0.01294071,  0.01386927,  0.01289027, ...,  0.0123179 ,
         0.0019333 , -0.01055929],
       [-0.0102816 ,  0.00391926, -0.00992195, ..., -0.01391045,
         0.00951419, -0.01131782],
       ...,
       [-0.00202312,  0.00287168,  0.00664406, ...,  0.01040662,
         0.00135352, -0.00219292],
       [-0.01303557, -0.00146097, -0.00506871, ...,  0.01258362,
        -0.00579549, -0.01278685],
       [-0.00729411, -0.00982529, -0.01187047, ...,  0.01411375,
        -0.01183319, -0.00734779]])

In [434]:
# Map the test matrix through the already-fitted sampler and display it.
# NOTE(review): feat_test_mat is only assigned in a later cell of this
# notebook, so this cell fails on a fresh top-to-bottom run.
rbf_kernel.transform(feat_test_mat)

array([[ 0.01158714, -0.01357454,  0.00763057, ..., -0.00773793,
         0.00022349, -0.00511733],
       [ 0.01413437,  0.01396206,  0.00843097, ...,  0.01198258,
         0.01410292, -0.00505409],
       [-0.01108782, -0.00990463,  0.01413618, ...,  0.01410968,
         0.01140916,  0.01239361],
       ...,
       [ 0.00422473,  0.0122155 ,  0.01070024, ..., -0.00549259,
         0.00658475,  0.01164897],
       [-0.00915065, -0.00669379,  0.01282676, ...,  0.00559994,
         0.0096538 ,  0.01343743],
       [ 0.00738116, -0.00975196,  0.01136088, ..., -0.00343837,
         0.01059531,  0.01412997]])

### Logistic regression

In [383]:
# Fit two SGD-trained logistic regressions on label column 11 of labl_train
# (presumably LABEL_Sepsis - confirm against the label file) and time each fit
# on its own. fit_intercept=False because feat_train_mat already carries a
# constant column of ones.
# NOTE(review): newer scikit-learn releases spell this loss "log_loss" instead
# of "log"; adjust if the installed version rejects it.

# perf_counter() is a monotonic clock meant for measuring elapsed intervals.
tstart = time.perf_counter()

# logit_lin = LogisticRegression(C=1, fit_intercept=False, solver="sag", max_iter=1000).fit(feat_train_mat, labl_train.iloc[:, 1])
logit_lin = SGDClassifier(loss="log", fit_intercept=False, max_iter=100, alpha=1, tol=1e-7).fit(feat_train_mat, labl_train.iloc[:, 11])

tend_lin = time.perf_counter()
print("Linear classifier training time: {}".format(tend_lin - tstart))

# Start a fresh timer so the second figure covers the nonlinear fit only,
# not the linear fit plus the nonlinear fit.
tstart_nonlin = time.perf_counter()

# logit_nonlin = LogisticRegression(C=1, fit_intercept=False, solver="sag", max_iter=500).fit(X_rbf_feat, labl_train.iloc[:, 11])
logit_nonlin = SGDClassifier(loss="log", fit_intercept=False, max_iter=500, tol=1e-7).fit(X_rbf_feat, labl_train.iloc[:, 11])

tend_nonlin = time.perf_counter()
print("Nonlinear kernels training time: {}".format(tend_nonlin - tstart_nonlin))

Linear classifier training time: 0.9513750076293945
Nonlinear kernels training time: 193.39767217636108


In [384]:
# For each classifier keep the second predict_proba column (the probability
# assigned to classes_[1]) on the data it was trained on.
prob_lin, prob_nonlin = (
    clf.predict_proba(design)[:, 1]
    for clf, design in ((logit_lin, feat_train_mat), (logit_nonlin, X_rbf_feat))
)
(prob_lin, prob_nonlin)

(array([0.09374422, 0.05322627, 0.05382918, ..., 0.07817172, 0.05483041,
        0.05807976]),
 array([0.45822555, 0.56198689, 0.42468296, ..., 0.47526155, 0.45917428,
        0.4713796 ]))

In [386]:
# Training-set ROC AUC for (linear, RBF-feature) logistic models, as a tuple.
tuple(
    metrics.roc_auc_score(labl_train.iloc[:, 11], scores)
    for scores in (prob_lin, prob_nonlin)
)

(0.5178642040247159, 0.8580173933788627)

### Support-Vector-Machine classification

In [363]:
# Fit one linear SVM per feature representation (raw matrix first, RBF
# features second) on label column 11; no intercept is fitted.
svm_lin, svm_nonlin = (
    LinearSVC(fit_intercept=False).fit(design, labl_train.iloc[:, 11])
    for design in (feat_train_mat, X_rbf_feat)
)



In [364]:
# Signed margins of the linear SVM on the training matrix.
func_lin = svm_lin.decision_function(feat_train_mat)
# Squash the margins into (0, 1) with the logistic sigmoid. The map is
# monotonic, so the ranking of samples is unchanged.
prob_svm_lin = 1.0 / (1.0 + np.exp(np.negative(func_lin)))

In [367]:
# Training-set ROC AUC of the sigmoid-squashed linear-SVM margins.
metrics.roc_auc_score(
    y_true=labl_train.iloc[:, 11],
    y_score=prob_svm_lin,
)

0.6381671930792757

In [52]:
# Fit a Lasso regressor for the LABEL_RRate column.
# NOTE(review): `Lasso` is not among the names imported from
# sklearn.linear_model in this notebook's import cell, and `feat_na_filled` is
# not assigned in any visible cell - this cell relies on stale kernel state.
vital_predictor = Lasso(alpha=1, fit_intercept=True).fit(feat_na_filled, labl_train["LABEL_RRate"])

In [53]:
# Map the training R^2 of the LABEL_RRate regressor onto [0.5, 1]: negative
# R^2 values are clipped to 0 before the affine rescaling.
# NOTE(review): `feat_na_filled` is not assigned in any visible cell.
0.5 + 0.5*np.maximum(0, metrics.r2_score(labl_train["LABEL_RRate"], vital_predictor.predict(feat_na_filled)))

0.636415255250034

In [None]:
# Single estimator that chains the random-feature RBF map with an SGD-trained
# logistic regression, so the transform and the classifier are fitted together.
# NOTE(review): n_components=1 yields a single random feature; the standalone
# sampler in this notebook uses 10000 - confirm which value is intended.
logistic_rbf = Pipeline([("rbf_approx", RBFSampler(gamma=2, n_components=1, random_state=1)), 
                         ("sgd_log", SGDClassifier(loss="log", fit_intercept=False, max_iter=500))])
# The fit call was left unterminated (a syntax error). It is completed with the
# same design matrix and label column that feed the standalone
# RBFSampler + SGDClassifier path; the trailing semicolon hides the repr.
logistic_rbf.fit(feat_train_mat, labl_train.iloc[:, 11]);

---

## Checking Results

In [429]:
# Load the previously written test predictions; this frame is later updated
# in place (LABEL_Sepsis column) and written back to the same file.
result_prev = pd.read_csv("./recall_test_rbf.zip")
result_prev

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,0,0.378,0.157,0.406,0.400,0.405,0.391,0.078,0.264,0.116,0.109,0.418,17.386,78.240,97.199,75.166
1,3,0.135,0.066,0.268,0.277,0.287,0.177,0.262,0.183,0.055,0.049,0.448,17.237,80.472,96.642,82.850
2,5,0.138,0.049,0.229,0.222,0.225,0.093,0.145,0.114,0.035,0.028,0.540,19.670,73.858,96.935,75.697
3,7,0.856,0.994,1.000,1.000,1.000,0.930,0.244,0.913,0.736,0.049,0.590,18.233,85.132,96.347,96.413
4,9,0.283,0.061,0.177,0.161,0.182,0.163,0.012,0.092,0.012,0.002,0.522,19.558,83.456,96.473,88.830
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12659,31647,0.145,0.069,0.226,0.221,0.238,0.096,0.030,0.090,0.032,0.010,0.516,14.917,79.032,98.058,77.399
12660,31649,0.546,0.032,0.178,0.201,0.188,0.374,0.110,0.241,0.014,0.025,0.444,18.366,82.265,95.621,93.069
12661,31651,0.748,0.059,0.104,0.088,0.088,0.340,0.016,0.379,0.023,0.012,0.559,18.458,68.656,96.456,86.196
12662,31652,0.061,0.032,0.301,0.297,0.269,0.108,0.090,0.127,0.039,0.023,0.488,19.323,89.328,96.938,112.622


In [438]:
# Display the zero-filled test features.
# NOTE(review): feat_test_fill0 is not assigned in any visible cell of this
# notebook; it must come from stale kernel state or a removed cell.
feat_test_fill0

Measure,EtCO2,EtCO2,EtCO2,EtCO2,EtCO2,EtCO2,EtCO2,EtCO2,EtCO2,EtCO2,EtCO2,EtCO2,PTT,PTT,PTT,PTT,PTT,PTT,PTT,PTT,...,ABPs,ABPs,ABPs,ABPs,ABPs,ABPs,ABPs,pH,pH,pH,pH,pH,pH,pH,pH,pH,pH,pH,pH,Age
Time,0,1,2,3,4,5,6,7,8,9,10,11,0,1,2,3,4,5,6,7,...,5,6,7,8,9,10,11,0,1,2,3,4,5,6,7,8,9,10,11,0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.2,44.2,43.06,41.92,40.78,39.64,38.5,38.5,...,144.0,136.5,129.0,121.0,120.0,120.0,120.0,7.34,7.34,7.340,7.352,7.364,7.376,7.388,7.40,7.40,7.40,7.40,7.40,39.0
10001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,...,110.0,112.0,108.0,115.0,122.0,117.0,117.0,0.00,0.00,0.000,0.000,0.000,0.000,0.000,0.00,0.00,0.00,0.00,0.00,62.0
10003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,...,124.0,124.0,118.0,126.0,146.0,127.0,114.0,7.52,7.52,7.520,7.400,7.400,7.400,7.400,7.40,7.40,7.40,7.40,7.40,72.0
10004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,...,110.0,93.0,104.0,99.0,89.0,91.0,115.0,0.00,0.00,0.000,0.000,0.000,0.000,0.000,0.00,0.00,0.00,0.00,0.00,44.0
10005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39.9,39.9,39.90,39.90,39.90,39.90,39.9,39.9,...,134.0,114.0,117.0,124.5,132.0,129.0,121.0,0.00,0.00,0.000,0.000,0.000,0.000,0.000,0.00,0.00,0.00,0.00,0.00,88.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.4,26.4,26.40,26.40,26.40,26.40,26.4,26.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.000,0.000,0.000,0.000,0.000,0.00,0.00,0.00,0.00,0.00,43.0
9991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.00,50.00,50.00,50.00,50.0,50.0,...,152.0,143.0,149.0,136.5,124.0,125.5,127.0,7.33,7.33,7.330,7.330,7.350,7.370,7.390,7.39,7.39,7.39,7.39,7.39,62.0
9992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41.3,41.3,41.30,41.30,41.30,41.30,41.3,41.3,...,112.0,99.0,90.0,112.0,94.0,107.0,93.0,7.38,7.38,7.385,7.390,7.395,7.400,7.405,7.41,7.41,7.41,7.41,7.41,51.0
9994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,150.0,150.00,109.65,69.30,69.30,69.3,69.3,...,120.0,104.0,84.0,113.0,124.0,139.0,138.0,7.35,7.37,7.390,7.420,7.450,7.485,7.520,7.54,7.50,7.52,7.54,7.54,52.0


In [451]:
# Squared Euclidean norm of the second training row in RBF feature space.
# If the random features approximate the kernel well this should be close to
# k(x, x) = 1.
np.dot(X_rbf_feat[1], X_rbf_feat[1])

0.9918226790121505

In [449]:
# Gaussian of the squared distance between the first two rows of X_rbf_feat,
# with the exponent scaled by 1/4.
# NOTE(review): the distance is taken between the already-transformed rows,
# not between the raw rows of feat_train_mat - confirm this is the intended
# sanity check.
np.exp(-0.25 * sum(np.square(np.abs(X_rbf_feat[0] - X_rbf_feat[1]))))

0.6036127985948873

In [426]:
# Build the test design matrix like the training one: zero-filled features
# followed by a constant column of ones.
feat_test_mat = np.hstack(
    (feat_test_fill0.to_numpy(), np.ones((len(feat_test_reformed), 1)))
)
# Transform only: the sampler keeps the random features fitted on training data.
X_rbf_feat_test = rbf_kernel.transform(feat_test_mat)

# Second predict_proba column for every test row.
prob_test = logit_nonlin.predict_proba(X_rbf_feat_test)[:, 1]
# Reorder the rows by ascending index of feat_test_fill0 before storing them.
# NOTE(review): assumes result_prev rows follow that same sorted order.
prob_sorted = prob_test.take(feat_test_fill0.index.argsort())
result_prev["LABEL_Sepsis"] = prob_sorted

In [423]:
# Write the updated predictions back as a zipped CSV, without the index and
# with values formatted to three decimals.
result_prev.to_csv("./recall_test_rbf.zip", compression="zip", index=False, float_format='%.3f')

In [404]:
# Compare the predictions stored in recall_train_rbf.zip with the training
# labels (utils.get_score is project-local; see that module for the metric).
utils.get_score("./train_labels.csv", "./recall_train_rbf.zip")

[0.880140104158019, 0.7682702066333779, 0.7163509452862231, 0.7200754850470517, 0.7163416600616637, 0.7564771589791436, 0.7676717431220972, 0.8042015344518387, 0.7284277753577496, 0.9077601581932911]
[0.6519184425430589, 0.7291738267644755, 0.5417486006038639, 0.7896502307334095]
0.7765716771290455 0.8580173933788627 0.6781227751612019


0.77090394855637

In [435]:
# Display the RBF feature matrix computed for the test data.
X_rbf_feat_test

array([[ 0.01158714, -0.01357454,  0.00763057, ..., -0.00773793,
         0.00022349, -0.00511733],
       [ 0.01413437,  0.01396206,  0.00843097, ...,  0.01198258,
         0.01410292, -0.00505409],
       [-0.01108782, -0.00990463,  0.01413618, ...,  0.01410968,
         0.01140916,  0.01239361],
       ...,
       [ 0.00422473,  0.0122155 ,  0.01070024, ..., -0.00549259,
         0.00658475,  0.01164897],
       [-0.00915065, -0.00669379,  0.01282676, ...,  0.00559994,
         0.0096538 ,  0.01343743],
       [ 0.00738116, -0.00975196,  0.01136088, ..., -0.00343837,
         0.01059531,  0.01412997]])

In [436]:
# Read the submission file back from disk to inspect what was written.
pd.read_csv("./recall_test_rbf.zip")

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,0,0.378,0.157,0.406,0.400,0.405,0.391,0.078,0.264,0.116,0.109,0.418,17.386,78.240,97.199,75.166
1,3,0.135,0.066,0.268,0.277,0.287,0.177,0.262,0.183,0.055,0.049,0.448,17.237,80.472,96.642,82.850
2,5,0.138,0.049,0.229,0.222,0.225,0.093,0.145,0.114,0.035,0.028,0.540,19.670,73.858,96.935,75.697
3,7,0.856,0.994,1.000,1.000,1.000,0.930,0.244,0.913,0.736,0.049,0.590,18.233,85.132,96.347,96.413
4,9,0.283,0.061,0.177,0.161,0.182,0.163,0.012,0.092,0.012,0.002,0.522,19.558,83.456,96.473,88.830
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12659,31647,0.145,0.069,0.226,0.221,0.238,0.096,0.030,0.090,0.032,0.010,0.516,14.917,79.032,98.058,77.399
12660,31649,0.546,0.032,0.178,0.201,0.188,0.374,0.110,0.241,0.014,0.025,0.444,18.366,82.265,95.621,93.069
12661,31651,0.748,0.059,0.104,0.088,0.088,0.340,0.016,0.379,0.023,0.012,0.559,18.458,68.656,96.456,86.196
12662,31652,0.061,0.032,0.301,0.297,0.269,0.108,0.090,0.127,0.039,0.023,0.488,19.323,89.328,96.938,112.622


In [420]:
# Score logit_nonlin on features from a freshly fitted sampler that sees the
# training matrix without its last column (the appended ones).
# NOTE(review): the classifier was trained on features from a different
# sampler fit, which presumably explains the near-chance AUC shown below.
metrics.roc_auc_score(
    y_true=labl_train.iloc[:, 11],
    y_score=logit_nonlin.predict_proba(
        RBFSampler(gamma=2, n_components=10000, random_state=1).fit_transform(
            feat_train_mat[:, :-1]
        )
    )[:, 1],
)

0.4954871513440357

In [417]:
# Display the reordered test probabilities.
prob_sorted

array([[0.58192112, 0.41807888],
       [0.55239956, 0.44760044],
       [0.46012484, 0.53987516],
       ...,
       [0.44097041, 0.55902959],
       [0.51190013, 0.48809987],
       [0.57553523, 0.42446477]])