# Modeling
___

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor
from sklearn import clone
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import time

Importing data

In [2]:
X_train = pd.read_csv('../cleaned_data/X_train.csv')
y_train = pd.read_csv('../cleaned_data/y_train.csv')

X_test = pd.read_csv('../cleaned_data/X_test.csv')
y_test = pd.read_csv('../cleaned_data/y_test.csv')

X_val = pd.read_csv('../cleaned_data/X_val.csv')
y_val = pd.read_csv('../cleaned_data/y_val.csv')

Below I encountered a problem where y wasnt a 1d array, which it should be for the machine learn training, I will therefore use ravel to chenge the array from 2d to 1d. 

In [3]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

y_train = y_train.values.ravel()
y_test = y_test.values.ravel()
y_val = y_val.values.ravel()

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

X_train shape: (4637, 60)
y_train shape: (4637, 1)
X_test shape: (1546, 60)
y_test shape: (1546, 1)
X_val shape: (1546, 60)
y_val shape: (1546, 1)
X_train shape: (4637, 60)
y_train shape: (4637,)
X_test shape: (1546, 60)
y_test shape: (1546,)
X_val shape: (1546, 60)
y_val shape: (1546,)


#### What needs to be imputed
___

First we will check what we need to impute

In [4]:
nan_counts = X_train.isnull().sum()
nan_percentage = (nan_counts / len(X_train)) * 100
nan_summary = pd.DataFrame({
    'NaN Count': nan_counts,
    'Percentage': nan_percentage
})

nan_summary = nan_summary[nan_summary['NaN Count'] > 0]

print(nan_summary)

                               NaN Count  Percentage
utdanning                            845   18.222989
hvite_blodlegemer                    112    2.415355
lungefunksjon                       1204   25.965064
serumalbumin                        1725   37.200776
bilirubin                           1321   28.488247
kreatinin                             32    0.690101
blod_ph                             1182   25.490619
glukose                             2304   49.687298
blodurea_nitrogen                   2219   47.854216
urinmengde                          2476   53.396593
adl_pasient                         2885   62.216951
adl_stedfortreder                   1462   31.529006
lege_overlevelsesestimat_2mnd        840   18.115161
lege_overlevelsesestimat_6mnd        832   17.942635
dnr_dag                             4002   86.305801


#### Baseline model
___

We will use use a baseline model to compare the RMSE values, for this I will use a DummyRegressor, I will try two differnt impuation techniques, median and mean. 

In [5]:
mean_imputer = SimpleImputer(strategy='mean')
median_imputer = SimpleImputer(strategy='median')

mean_pipeline = Pipeline(steps=[('imputer', mean_imputer), ('scaler', StandardScaler()), ('regressor', DummyRegressor())])
mean_pipeline.fit(X_train, y_train)
y_pred_mean = mean_pipeline.predict(X_val)
baseline_rmse_mean = (root_mean_squared_error(y_val, y_pred_mean))
print(f"Baseline RMSE (Mean Imputer): {baseline_rmse_mean:.2f}")

median_pipeline = Pipeline(steps=[('imputer', median_imputer), ('scaler', StandardScaler()), ('regressor', DummyRegressor())])
median_pipeline.fit(X_train, y_train)
y_pred_median = median_pipeline.predict(X_val)
baseline_rmse_median = (root_mean_squared_error(y_val, y_pred_median))
print(f"Baseline RMSE (Median Imputer): {baseline_rmse_median:.2f}")

Baseline RMSE (Mean Imputer): 20.83
Baseline RMSE (Median Imputer): 20.83


#### Imputers and Machine Learning Models
___

In [None]:
imputers = {
    'mean': SimpleImputer(strategy='mean'),
    'median': SimpleImputer(strategy='median'),
    'mode': SimpleImputer(strategy='most_frequent'),
    'knn5': KNNImputer(n_neighbors=5),
    'knn10': KNNImputer(n_neighbors=10)
}

models = {
    'Linear': LinearRegression(),
    'ElasticNet0.5': ElasticNet(alpha=0.5),
    'ElasticNet1': ElasticNet(alpha=1),
    'KNN5': KNeighborsRegressor(n_neighbors=5),
    'KNN10': KNeighborsRegressor(n_neighbors=10),
    'FullRandForest': RandomForestRegressor(),
    'SmallRandForest': RandomForestRegressor(n_estimators=50, max_depth=10),
    'SVR_poly_C1': SVR(kernel='poly', degree=2, C=1),
    'SVR_poly_C10': SVR(kernel='poly', degree=2, C=10),
    'SVR_rbf_C1': SVR(kernel='rbf', C=1, gamma='scale'),
    'SVR_rbf_C10': SVR(kernel='rbf', C=10, gamma='auto')
}

cont_features = ['blodtrykk', 'hvite_blodlegemer', 'hjertefrekvens', 'respirasjonsfrekvens', 'kroppstemperatur', 'lungefunksjon', 'serumalbumin', 'bilirubin', 'kreatinin', 'natrium', 'blod_ph', 'glukose', 'blodurea_nitrogen', 'urinmengde', 'koma_score', 'fysiologisk_score', 'apache_fysiologisk_score', 'overlevelsesestimat_2mnd', 'overlevelsesestimat_6mnd']
cat_features = [feat for feat in X_train.columns if feat not in cont_features]

#### Defining HyperParameters Tuning Space
___

In [7]:
param_grid = {
    'cont_imputer': list(imputers.keys()),
    'cat_imputer': list(imputers.keys()),
    'model': list(models.keys()),
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'learning_rate': [0.01, 0.1, 1],
    'C': [0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1],
    'alpha': [0.1, 1, 10],
    'l1_ratio': [0.1, 0.5, 0.9] 
}

#### Pipeline
___

We will create a pipeline and objective function for the tuning of hyperParameters, we test each cont_imputation and cat_imputation method together with each meachine models to find the best combination. 

In [8]:
results = []

def create_pipeline(cont_imputer, cat_imputer, model):
    preprocess = ColumnTransformer(transformers=[
        ('num', Pipeline(steps=[('impute', clone(cont_imputer)), ('scaler', StandardScaler())]), X_train.select_dtypes(include=['int64', 'float64']).columns),
        ('cat', Pipeline(steps=[('impute', clone(cat_imputer))]), X_train.select_dtypes(include=['object']).columns)
    ])
    
    pipeline = Pipeline([
        ('preprocess', preprocess),
        ('model', model)
    ])
    
    return pipeline

def objective(params):
    start_time = time.time()
    pipeline = create_pipeline(
        imputers[params['cont_imputer']],
        imputers[params['cat_imputer']],
        models[params['model']]
    )
    
    print(f"Running model: {params['model']} with imputers {params['cont_imputer']}, {params['cat_imputer']}")
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    rmse = (root_mean_squared_error(y_val, y_pred))
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Model completed in {elapsed_time:.2f} seconds. RMSE: {rmse}")

    results.append({'params': params, 'rmse': rmse})
    
    return rmse

for cont_imputer in param_grid['cont_imputer']:
    for cat_imputer in param_grid['cat_imputer']:
        for model in param_grid['model']:
            params = {
                'cont_imputer': cont_imputer,
                'cat_imputer': cat_imputer,
                'model': model
            }
            objective(params)

results.sort(key=lambda x: x['rmse'])

best_result = results[0]
best_cont_imputer = imputers[best_result['params']['cont_imputer']]
best_cat_imputer = imputers[best_result['params']['cat_imputer']]
best_model = models[best_result['params']['model']]

best_pipeline = create_pipeline(best_cont_imputer, best_cat_imputer, best_model)
best_pipeline.fit(X_train, y_train)
y_pred_test = best_pipeline.predict(X_test)
test_rmse = (root_mean_squared_error(y_test, y_pred_test))
print("Test RMSE:", test_rmse)

Running model: Linear with imputers mean, mean


Model completed in 0.07 seconds. RMSE: 20.293455398177354
Running model: Polynomial2 with imputers mean, mean
Model completed in 0.02 seconds. RMSE: 20.293455398177354
Running model: ElasticNet0.5 with imputers mean, mean
Model completed in 0.02 seconds. RMSE: 20.24223606970142
Running model: ElasticNet1 with imputers mean, mean
Model completed in 0.02 seconds. RMSE: 20.332287576709536
Running model: KNN5 with imputers mean, mean
Model completed in 0.13 seconds. RMSE: 20.634906182715483
Running model: KNN10 with imputers mean, mean
Model completed in 0.03 seconds. RMSE: 19.98432697663162
Running model: FullRandForest with imputers mean, mean
Model completed in 6.23 seconds. RMSE: 19.059763924059492
Running model: SmallRandForest with imputers mean, mean
Model completed in 1.50 seconds. RMSE: 18.975986527127194
Running model: SVR_poly_C1 with imputers mean, mean
Model completed in 0.86 seconds. RMSE: 21.07376654840994
Running model: SVR_poly_C10 with imputers mean, mean
Model completed 

Top ten models

In [9]:
print("Top 10 Best Models and their RMSEs:")
for i, result in enumerate(results[:10]):
    print(f"Rank {i+1}: Parameters - {result['params']}, RMSE - {result['rmse']:.2f}")

print("Test RMSE of the Best Model:", test_rmse)

Top 10 Best Models and their RMSEs:
Rank 1: Parameters - {'cont_imputer': 'mean', 'cat_imputer': 'knn10', 'model': 'SmallRandForest'}, RMSE - 18.85
Rank 2: Parameters - {'cont_imputer': 'mean', 'cat_imputer': 'median', 'model': 'SmallRandForest'}, RMSE - 18.92
Rank 3: Parameters - {'cont_imputer': 'mean', 'cat_imputer': 'mode', 'model': 'SmallRandForest'}, RMSE - 18.95
Rank 4: Parameters - {'cont_imputer': 'knn10', 'cat_imputer': 'knn10', 'model': 'SmallRandForest'}, RMSE - 18.96
Rank 5: Parameters - {'cont_imputer': 'mean', 'cat_imputer': 'mean', 'model': 'SmallRandForest'}, RMSE - 18.98
Rank 6: Parameters - {'cont_imputer': 'mean', 'cat_imputer': 'knn5', 'model': 'SmallRandForest'}, RMSE - 19.00
Rank 7: Parameters - {'cont_imputer': 'mean', 'cat_imputer': 'mean', 'model': 'FullRandForest'}, RMSE - 19.06
Rank 8: Parameters - {'cont_imputer': 'mode', 'cat_imputer': 'knn10', 'model': 'SmallRandForest'}, RMSE - 19.09
Rank 9: Parameters - {'cont_imputer': 'mean', 'cat_imputer': 'knn10', '

Best pipeline

In [10]:
best_pipeline