In [1]:
import os
import warnings
import sys

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn

import logging

# Configure root logging at WARNING level and create this notebook's logger.
logging.basicConfig(
    level=logging.WARNING,
)
logger = logging.getLogger(__name__)


def eval_metrics(actual, pred):
    """Compute RMSE, MAE and R^2 for predictions against reference values.

    Args:
        actual: Reference (ground-truth) target values.
        pred: Predicted values to compare against ``actual``.

    Returns:
        A 3-tuple ``(rmse, mae, r2)``: the square root of the mean squared
        error, the mean absolute error, and the R^2 score, in that order.
    """
    root_mean_sq_err = np.sqrt(mean_squared_error(actual, pred))
    return (
        root_mean_sq_err,
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

In [2]:
# Move up to the parent directory so the relative 'toy_data/...' path resolves.
# Guarded so that re-running this cell (or starting the kernel in the directory
# that already contains 'toy_data') does not climb one level further each time.
if not os.path.isdir('toy_data'):
    os.chdir('..')

In [3]:
# The recorded output shows a leading 'Unnamed: 0' column, i.e. the CSV has an
# unnamed first column (presumably a row index written out when the file was
# saved -- confirm). Read it as the index rather than as a data column.
df = pd.read_csv('toy_data/toydata.csv', index_col=0)

In [4]:
# Show the loaded frame as the cell output (the recorded output lists the
# first and last rows with the middle elided).
df

Unnamed: 0,t_idx,time_bin,uid,age,temperature,heart_rate,systolic_blood_pressure,diastolic_blood_pressure,resp_rate,glucose,...,paco2,chloride,troponin,ptt,lactate,blood_urea_nitrogen,magnesium,y,event_relative_time,event_order
0,0,3558,uva_1880,63.763062,60.0,60.121516,100.0,80.0,23.318647,150.0,...,26.8,117.0,0.07,48.9,0.5,106.0,2.0,0,-2865,1
1,1,3559,uva_1880,63.763062,60.0,60.121516,100.0,80.0,23.318647,150.0,...,26.8,117.0,0.07,48.9,0.5,106.0,2.0,0,-2850,1
2,2,3560,uva_1880,63.763062,60.0,60.121516,100.0,80.0,23.318647,150.0,...,26.8,117.0,0.07,48.9,0.5,106.0,2.0,0,-2835,1
3,3,3561,uva_1880,63.763062,60.0,60.121516,100.0,80.0,23.318647,150.0,...,26.8,117.0,0.07,48.9,0.5,106.0,2.0,0,-2820,1
4,4,3562,uva_1880,63.763062,60.0,60.121516,100.0,80.0,23.318647,150.0,...,26.8,117.0,0.07,48.9,0.5,106.0,2.0,0,-2805,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395452,188,25691,mimic_89645,60.000000,,82.000000,100.0,55.0,27.000000,124.0,...,,103.0,,,,11.0,2.1,0,-45,6
395453,189,25692,mimic_89645,60.000000,,82.000000,100.0,55.0,27.000000,124.0,...,,103.0,,,,11.0,2.1,0,-30,6
395454,190,25693,mimic_89645,60.000000,,82.000000,100.0,55.0,27.000000,124.0,...,,103.0,,,,11.0,2.1,0,-15,6
395455,191,25694,mimic_89645,60.000000,,82.000000,100.0,55.0,27.000000,124.0,...,,103.0,,,,11.0,2.1,0,0,6


In [48]:
# Collapse the per-row records into one row per 'uid':
#   * each listed measurement column is averaged over that uid's rows;
#   * 'y' becomes 1 when the uid's 'y' values sum to more than zero, else 0.
mean_columns = [
    'age',
    'temperature',
    'heart_rate',
    'systolic_blood_pressure',
    'diastolic_blood_pressure',
    'resp_rate',
]
agg_spec = {column: 'mean' for column in mean_columns}
agg_spec['y'] = lambda labels: int(sum(labels) > 0)

train = df.groupby('uid').agg(agg_spec)

# Separate the label column from the feature columns.
y_train = train['y']
X_train = train.drop(columns=['y'])


In [49]:
from sklearn.impute import SimpleImputer

In [50]:
# Fill missing feature values using the 'mean' imputation strategy; the
# imputer's output replaces the feature table under the same name.
mean_imputer = SimpleImputer(strategy='mean')
X_train: np.ndarray = mean_imputer.fit_transform(X_train)

In [51]:
# Build the label array from the aggregated frame rather than from `y_train`
# itself: the self-referential form raises AttributeError when the cell is run
# a second time, because `y_train` is then already a NumPy array.
y_train = train['y'].to_numpy()

In [63]:
from sklearn.linear_model import LogisticRegressionCV

In [64]:
# Logistic-regression estimator with built-in cross-validation (cv=5).
cv_folds = 5
regr = LogisticRegressionCV(cv=cv_folds)

In [65]:
# Fit the estimator on the imputed features and per-uid labels; the fitted
# estimator is the cell's displayed value.
regr.fit(X=X_train, y=y_train)

LogisticRegressionCV(cv=5)

In [66]:
# (rmse, mae, r2) of the model's predictions on the data it was fitted on.
# NOTE(review): these are regression metrics applied to class predictions from
# a logistic-regression model, and they are computed in-sample -- consider
# classification metrics on a held-out split instead.
train_predictions = regr.predict(X_train)
eval_metrics(y_train, train_predictions)

(0.2991669443391063, 0.08950086058519793, -0.09829867674858206)

In [67]:
# Estimator's built-in score on the same data it was fitted on (in-sample).
regr.score(X=X_train, y=y_train)

0.9104991394148021