In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

sns.set_theme()

from lab2 import Helper

In [2]:
df1 = pd.read_csv('Data/df1.csv')
df2 = pd.read_csv('Data/df2.csv')

display(df1.head(3), df2.head(3))

Unnamed: 0,id,age,cholesterol,gluc,smoke,alco,active,cardio,gender_2,bmi-feature_obese (class I),bmi-feature_obese (class II),bmi-feature_obese (class III),bmi-feature_overweight,bp-feature_healthy,bp-feature_hypertension crises,bp-feature_stage 1 hypertension,bp-feature_stage 2 hypertension
0,0,18393,1,1,0,0,1,0,1,0,0,0,0,0,0,1,0
1,1,20228,3,1,0,0,1,1,0,1,0,0,0,0,0,0,1
2,2,18857,3,1,0,0,0,1,0,0,0,0,0,0,0,1,0


Unnamed: 0,id,age,bmi,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,gender_2
0,0,18393,21,110,80,1,1,0,0,1,0,1
1,1,20228,34,140,90,3,1,0,0,1,1,0
2,2,18857,23,130,70,3,1,0,0,0,1,0


In [3]:
# Drop id as its not useful
df1 = df1.drop('id', axis=1)
df2 = df2.drop('id', axis=1)

### 2.4 - Välja modell

Chosen models:
* Logistic regression
* decision tree
* random forest

##### train|validation|test split

In [4]:
df1_x, df1_y = df1.drop('cardio', axis=1), df1['cardio']
df2_x, df2_y = df2.drop('cardio', axis=1), df2['cardio']

display(
    df1_x.head(1),
    df2_x.head(1)
)

Unnamed: 0,age,cholesterol,gluc,smoke,alco,active,gender_2,bmi-feature_obese (class I),bmi-feature_obese (class II),bmi-feature_obese (class III),bmi-feature_overweight,bp-feature_healthy,bp-feature_hypertension crises,bp-feature_stage 1 hypertension,bp-feature_stage 2 hypertension
0,18393,1,1,0,0,1,1,0,0,0,0,0,0,1,0


Unnamed: 0,age,bmi,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,gender_2
0,18393,21,110,80,1,1,0,0,1,1


In [5]:
df1_x_train, df1_x_val, df1_x_test, df1_y_train, df1_y_val, df1_y_test = Helper.train_val_test_split(df1_x, df1_y, split_size=0.2, rand_state=42)
df2_x_train, df2_x_val, df2_x_test, df2_y_train, df2_y_val, df2_y_test = Helper.train_val_test_split(df2_x, df2_y, split_size=0.2, rand_state=42)

display(
    (df1_x_train.shape, df1_y_train.shape, df1_x_val.shape, df1_y_val.shape, df1_x_test.shape, df1_y_test.shape),
    (df2_x_train.shape, df2_y_train.shape, df2_x_val.shape, df2_y_val.shape, df2_x_test.shape, df2_y_test.shape),
)

((40918, 15), (40918,), (13640, 15), (13640,), (13640, 15), (13640,))

((40918, 10), (40918,), (13640, 10), (13640,), (13640, 10), (13640,))

In [6]:
df1_x_train, df1_x_val = Helper.scaler('minmax', df1_x_train, df1_x_val)
df1_x_train.mean(), df1_x_train.std(), df1_x_val.mean(), df1_x_val.std()

(0.24469073517266024,
 0.4111901139656925,
 0.24653791612942894,
 0.4122246262036306)

In [None]:
"""
Logistic regression
    Parameters
        penalty: elasticnet
        solver: saga

    Hyperparamters
        max_iter
        l1_ratio
    

Grid Search parameters
    scoring
    refit
    cv
"""

In [None]:
model_data = {
    'logistic_regression': {
        'model': LogisticRegression(),
        'search space': {
            # Parameters
            'penalty': ['elasticnet'],
            'solver': ['saga'],
            
            # Hyperparamaters
            "max_iter" : [10, 100],
            "l1_ratio" : [0.1, 0.3]
        }
    }
}

In [28]:
# dict_keys(['explained_variance', 'r2', 'max_error', 'matthews_corrcoef', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'positive_likelihood_ratio', 'neg_negative_likelihood_ratio', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

dict_keys(['explained_variance', 'r2', 'max_error', 'matthews_corrcoef', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'positive_likelihood_ratio', 'neg_negative_likelihood_ratio', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weig

In [26]:
# The holy grail. behold! the loop of all loops
for scaler_name in ['standard', 'minmax']:
    
    df1_x_train_scaled, df1_x_val_scaled = Helper.scaler(scaler_name, df1_x_train, df1_x_val)

    for model_name, data in model_data.items():
        
        GS = GridSearchCV(
            estimator = data['model'],
            param_grid = data['search space'],
            n_jobs=2,
            scoring = ["r2", "neg_root_mean_squared_error"],
            refit = "r2",
            cv = 5,
            verbose = 0,
        )
        
        GS.fit(df1_x_train_scaled,  df1_y_train)

fff = pd.DataFrame(GS.cv_results_)

fff



LogisticRegression(l1_ratio=0.3, max_iter=10, penalty='elasticnet',
                   solver='saga')
LogisticRegression(l1_ratio=0.1, penalty='elasticnet', solver='saga')


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_l1_ratio,param_max_iter,param_penalty,param_solver,params,split0_test_r2,...,std_test_r2,rank_test_r2,split0_test_neg_root_mean_squared_error,split1_test_neg_root_mean_squared_error,split2_test_neg_root_mean_squared_error,split3_test_neg_root_mean_squared_error,split4_test_neg_root_mean_squared_error,mean_test_neg_root_mean_squared_error,std_test_neg_root_mean_squared_error,rank_test_neg_root_mean_squared_error
0,0.427915,0.027508,0.007198,0.00655,0.1,10,elasticnet,saga,"{'l1_ratio': 0.1, 'max_iter': 10, 'penalty': '...",-0.218056,...,0.027573,4,-0.551812,-0.554683,-0.554903,-0.547511,-0.537941,-0.54937,0.006307,4
1,0.523862,0.075146,0.001799,0.000747,0.1,100,elasticnet,saga,"{'l1_ratio': 0.1, 'max_iter': 100, 'penalty': ...",-0.214146,...,0.026647,1,-0.550926,-0.554243,-0.554683,-0.547399,-0.538054,-0.549061,0.006097,1
2,0.271159,0.015539,0.001801,0.000401,0.3,10,elasticnet,saga,"{'l1_ratio': 0.3, 'max_iter': 10, 'penalty': '...",-0.215124,...,0.026859,3,-0.551147,-0.554463,-0.555014,-0.547176,-0.538282,-0.549216,0.006142,3
3,0.489393,0.042469,0.002001,0.000632,0.3,100,elasticnet,saga,"{'l1_ratio': 0.3, 'max_iter': 100, 'penalty': ...",-0.214146,...,0.026381,2,-0.550926,-0.554243,-0.554573,-0.547399,-0.538168,-0.549062,0.006035,2


In [9]:
model = LogisticRegression()

search_space = {
    # Parameters
    'penalty': ['elasticnet'],
    'solver': ['saga'],
    
    # Hyperparamaters
    "max_iter" : [10, 100],
    "l1_ratio" : [0.1, 0.3]
}

GS = GridSearchCV(
    estimator = model,
    param_grid = search_space,
    scoring = ["r2", "neg_root_mean_squared_error"], #sklearn.metrics.SCORERS.keys()
    refit = "r2",
    cv = 5,
    verbose = 4,
)

GS.fit(df1_x_train,  df1_y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




[CV 1/5] END l1_ratio=0.1, max_iter=10, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.551) r2: (test=-0.213) total time=   0.1s




[CV 2/5] END l1_ratio=0.1, max_iter=10, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.554) r2: (test=-0.229) total time=   0.1s




[CV 3/5] END l1_ratio=0.1, max_iter=10, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.555) r2: (test=-0.231) total time=   0.1s
[CV 4/5] END l1_ratio=0.1, max_iter=10, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.547) r2: (test=-0.197) total time=   0.1s




[CV 5/5] END l1_ratio=0.1, max_iter=10, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.538) r2: (test=-0.158) total time=   0.1s
[CV 1/5] END l1_ratio=0.1, max_iter=100, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.551) r2: (test=-0.215) total time=   0.2s
[CV 2/5] END l1_ratio=0.1, max_iter=100, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.554) r2: (test=-0.229) total time=   0.2s
[CV 3/5] END l1_ratio=0.1, max_iter=100, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.555) r2: (test=-0.231) total time=   0.3s
[CV 4/5] END l1_ratio=0.1, max_iter=100, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.548) r2: (test=-0.199) total time=   0.3s
[CV 5/5] END l1_ratio=0.1, max_iter=100, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.538) r2: (test=-0.158) total time=   0.3s




[CV 1/5] END l1_ratio=0.3, max_iter=10, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.551) r2: (test=-0.215) total time=   0.1s




[CV 2/5] END l1_ratio=0.3, max_iter=10, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.554) r2: (test=-0.229) total time=   0.2s




[CV 3/5] END l1_ratio=0.3, max_iter=10, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.555) r2: (test=-0.230) total time=   0.1s




[CV 4/5] END l1_ratio=0.3, max_iter=10, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.547) r2: (test=-0.198) total time=   0.2s




[CV 5/5] END l1_ratio=0.3, max_iter=10, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.538) r2: (test=-0.159) total time=   0.1s
[CV 1/5] END l1_ratio=0.3, max_iter=100, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.551) r2: (test=-0.214) total time=   0.3s
[CV 2/5] END l1_ratio=0.3, max_iter=100, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.554) r2: (test=-0.229) total time=   0.3s
[CV 3/5] END l1_ratio=0.3, max_iter=100, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.555) r2: (test=-0.231) total time=   0.9s
[CV 4/5] END l1_ratio=0.3, max_iter=100, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.547) r2: (test=-0.199) total time=   0.3s
[CV 5/5] END l1_ratio=0.3, max_iter=100, penalty=elasticnet, solver=saga; neg_root_mean_squared_error: (test=-0.538) r2: (test=-0.159) total time=   0.3s


