In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

sns.set_theme()

from lab2 import Helper

In [2]:
df1 = pd.read_csv('Data/df1.csv')
df2 = pd.read_csv('Data/df2.csv')

display(df1.head(3), df2.head(3))

Unnamed: 0,id,age,cholesterol,gluc,smoke,alco,active,cardio,gender_2,bmi-feature_obese (class I),bmi-feature_obese (class II),bmi-feature_obese (class III),bmi-feature_overweight,bp-feature_healthy,bp-feature_hypertension crises,bp-feature_stage 1 hypertension,bp-feature_stage 2 hypertension
0,0,18393,1,1,0,0,1,0,1,0,0,0,0,0,0,1,0
1,1,20228,3,1,0,0,1,1,0,1,0,0,0,0,0,0,1
2,2,18857,3,1,0,0,0,1,0,0,0,0,0,0,0,1,0


Unnamed: 0,id,age,bmi,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,gender_2
0,0,18393,21,110,80,1,1,0,0,1,0,1
1,1,20228,34,140,90,3,1,0,0,1,1,0
2,2,18857,23,130,70,3,1,0,0,0,1,0


In [3]:
# Drop id as its not useful
df1 = df1.drop('id', axis=1)
df2 = df2.drop('id', axis=1)

### 2.4 - Välja modell

Chosen models:
* Logistic regression
* decision tree
* random forest

##### train|validation|test split

In [4]:
df1_x, df1_y = df1.drop('cardio', axis=1), df1['cardio']
df2_x, df2_y = df2.drop('cardio', axis=1), df2['cardio']

display(
    df1_x.head(1),
    df2_x.head(1)
)

Unnamed: 0,age,cholesterol,gluc,smoke,alco,active,gender_2,bmi-feature_obese (class I),bmi-feature_obese (class II),bmi-feature_obese (class III),bmi-feature_overweight,bp-feature_healthy,bp-feature_hypertension crises,bp-feature_stage 1 hypertension,bp-feature_stage 2 hypertension
0,18393,1,1,0,0,1,1,0,0,0,0,0,0,1,0


Unnamed: 0,age,bmi,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,gender_2
0,18393,21,110,80,1,1,0,0,1,1


In [5]:
df1_x_train, df1_x_val, df1_x_test, df1_y_train, df1_y_val, df1_y_test = Helper.train_val_test_split(df1_x, df1_y, split_size=0.2, rand_state=42)
df2_x_train, df2_x_val, df2_x_test, df2_y_train, df2_y_val, df2_y_test = Helper.train_val_test_split(df2_x, df2_y, split_size=0.2, rand_state=42)

display(
    (df1_x_train.shape, df1_y_train.shape, df1_x_val.shape, df1_y_val.shape, df1_x_test.shape, df1_y_test.shape),
    (df2_x_train.shape, df2_y_train.shape, df2_x_val.shape, df2_y_val.shape, df2_x_test.shape, df2_y_test.shape),
)

((40918, 15), (40918,), (13640, 15), (13640,), (13640, 15), (13640,))

((40918, 10), (40918,), (13640, 10), (13640,), (13640, 10), (13640,))

In [6]:
df1_x_train, df1_x_val = Helper.scaler('minmax', df1_x_train, df1_x_val)
df1_x_train.mean(), df1_x_train.std(), df1_x_val.mean(), df1_x_val.std()

(0.24469073517266024,
 0.4111901139656925,
 0.24653791612942894,
 0.4122246262036306)

In [None]:
"""
Logistic regression
    Parameters
        penalty: elasticnet
        solver: saga

    Hyperparamters
        max_iter [UNTUNED]
        l1_ratio [UNTUNED]
    

Grid Search parameters
    scoring
    cv [UNTUNED]
"""

In [None]:
# All the data needed for chosing the right model and doing hyperparameter tuning via GridSearchCV

model_data = {
    'logistic_regression': {
        'model': LogisticRegression(),
        'search space': {
            # Parameters
            'penalty': ['elasticnet'],
            'solver': ['saga'],
            
            # Hyperparamaters
            "max_iter" : [10, 100],
            "l1_ratio" : [0.1, 0.3]
        }
    }
}

In [43]:
models = {}

# Loop through and use the minmax and standard scaler
for scaler_name in ['minmax', 'standard']:
    
    # Scale the data
    df1_x_train_scaled, df1_x_val_scaled = Helper.scaler(scaler_name, df1_x_train, df1_x_val)

    # loop through all models. data = dict with model object and parameter info
    for model_name, data in model_data.items():
        
        # Search # find the best hyperparamters
        GS = GridSearchCV(
            estimator = data['model'],
            param_grid = data['search space'],
            n_jobs=2,
            scoring = 'accuracy',
            cv = 5,
            verbose = 0
        )
        
        # Save the model for later. Note: GS.fit fits and returns the model object.
        models[model_name] = GS.fit(df1_x_train_scaled,  df1_y_train)


# display(
#     GS.best_params_,
#     GS.best_score_
# )

#models.



array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [None]:
# använda GridSearchCV() och välja lämplig evalueringsmetric (accurancy)
# gör prediction på valideringsdata
# beräkna och spara evaluation score för ditt valda metric
# checka bästa parametrarna för respektive modell

In [45]:
models['logistic_regression'].predict(df1_x_val_scaled)

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)