# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Start

In [3]:
data_url = 'https://raw.githubusercontent.com/Hospital-Da-Luz-Learning-Health/MLCatolica24/main/Aula%208%20-%20Metrics%20%26%20Model%20Optimization/data/liver_disease.csv'
df = pd.read_csv(data_url, index_col = 'id')
df.head(3)
df.shape

Unnamed: 0_level_0,Age,Gender,BMI,AlcoholConsumption,Smoking,GeneticRisk,PhysicalActivity,Diabetes,Hypertension,LiverFunctionTest,Diagnosis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
572,75,1,32.960731,10.911642,0,0,8.930014,0,1,89.950963,1
125,20,1,30.298513,9.417347,0,0,1.484017,0,0,75.777573,1
1553,21,1,33.230869,1.685287,1,1,3.411027,0,0,69.226081,1


(804, 11)

In [4]:
X = df.drop(columns='Diagnosis')
y = df['Diagnosis']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42, stratify=y)

# Logistic Regression

Lookup the parameters in the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html). Or ask chatgpt to help you build a parameter grid for logistic regression from sklearn

In [6]:
param_grid = {
    'C': np.logspace(-4, 4, 20),
    'penalty': ['l1', 'l2'],
    'class_weight': ['balanced', None],
    'solver': ['liblinear']
}

In [7]:
grid = GridSearchCV(LogisticRegression(), # your model
                    param_grid, #the parameter grid
                    cv=3, #how many folds do you want in your cross-validation
                    scoring='f1', # what scoring metric do you want. More here: https://scikit-learn.org/stable/modules/model_evaluation.html
                    verbose=1, # to show the messages during training
                    n_jobs=-1 # how many cores to use in your computer
                    )

In [8]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 80 candidates, totalling 240 fits




In [9]:
results = pd.DataFrame(grid.cv_results_).sort_values('rank_test_score')
results.head(5)
results.shape

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
49,0.001018,4.6e-05,0.000704,1.9e-05,11.288379,balanced,l2,liblinear,"{'C': 11.288378916846883, 'class_weight': 'bal...",0.318182,0.352941,0.384615,0.351913,0.027131,1
45,0.002224,0.000794,0.001629,0.000628,4.281332,balanced,l2,liblinear,"{'C': 4.281332398719396, 'class_weight': 'bala...",0.341463,0.339623,0.37037,0.350485,0.014081,2
53,0.003179,0.000951,0.001791,0.000246,29.763514,balanced,l2,liblinear,"{'C': 29.763514416313132, 'class_weight': 'bal...",0.318182,0.36,0.37037,0.349517,0.022558,3
44,0.007166,0.000914,0.001409,0.000549,4.281332,balanced,l1,liblinear,"{'C': 4.281332398719396, 'class_weight': 'bala...",0.311111,0.36,0.37037,0.34716,0.02584,4
40,0.005963,0.001568,0.001041,3.7e-05,1.623777,balanced,l1,liblinear,"{'C': 1.623776739188721, 'class_weight': 'bala...",0.325581,0.326531,0.384615,0.345576,0.027608,5


(80, 15)

## Interpreting what features made the best models (optional)

In [10]:
features = results.filter(like='param_')
features = pd.get_dummies(features)
target = results['mean_test_score']

In [11]:
features.head(3)

Unnamed: 0,param_C,param_class_weight_balanced,param_penalty_l1,param_penalty_l2,param_solver_liblinear
49,11.288379,True,False,True,True
45,4.281332,True,False,True,True
53,29.763514,True,False,True,True


In [12]:
target.head(3)

49    0.351913
45    0.350485
53    0.349517
Name: mean_test_score, dtype: float64

### Using Linear Regression

In [13]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(features, target)

pd.Series(data=model.coef_, index=features.columns).sort_values(ascending=False).to_frame('Coeficient Value')

Unnamed: 0,Coeficient Value
param_class_weight_balanced,0.142681
param_penalty_l1,0.001231
param_C,2.1e-05
param_solver_liblinear,0.0
param_penalty_l2,-0.001231


Seems that only the hyperparameter `class_weight` set as `balanced` linearly affects the model (in this case, positively)

### Using Decision Tree

In [14]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor(max_depth=3)

model.fit(features, target)

pd.Series(data=model.feature_importances_, index=features.columns).sort_values(ascending=False).to_frame('Feature Importance')

Unnamed: 0,Feature Importance
param_C,0.698007
param_class_weight_balanced,0.301993
param_penalty_l1,0.0
param_penalty_l2,0.0
param_solver_liblinear,0.0


# Decision Tree

Lookup the parameters in the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html). Or ask chatgpt to help you build a parameter grid for logistic regression from sklearn

In [15]:
from sklearn.tree import DecisionTreeClassifier

In [16]:
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [17]:
grid = GridSearchCV(DecisionTreeClassifier(), # your model
                    param_grid, #the parameter grid
                    cv=3, #how many folds do you want in your cross-validation
                    scoring='f1', # what scoring metric do you want. More here: https://scikit-learn.org/stable/modules/model_evaluation.html
                    verbose=1, # to show the messages during training
                    n_jobs=-1 # how many cores to use in your computer
                    )

In [18]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


108 fits failed out of a total of 324.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/simaonovais/miniconda3/envs/MLCatolica24/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/simaonovais/miniconda3/envs/MLCatolica24/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/simaonovais/miniconda3/envs/MLCatolica24/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/simaonovais/miniconda3/envs/MLCatolica

In [19]:
results = pd.DataFrame(grid.cv_results_).sort_values('rank_test_score')
results.head(5)
results.shape

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
76,0.000636,6.9e-05,0.000857,9.7e-05,7,log2,2,5,"{'max_depth': 7, 'max_features': 'log2', 'min_...",0.363636,0.428571,0.285714,0.359307,0.058401,1
19,0.0066,0.008519,0.005671,0.007024,3,log2,1,5,"{'max_depth': 3, 'max_features': 'log2', 'min_...",0.210526,0.428571,0.363636,0.334245,0.091411,2
46,0.000571,8e-06,0.000695,7e-06,5,log2,1,5,"{'max_depth': 5, 'max_features': 'log2', 'min_...",0.190476,0.347826,0.285714,0.274672,0.064711,3
96,0.000817,5.9e-05,0.000812,2.1e-05,9,sqrt,4,2,"{'max_depth': 9, 'max_features': 'sqrt', 'min_...",0.285714,0.166667,0.363636,0.272006,0.080995,4
99,0.000754,3.3e-05,0.000782,4.3e-05,9,log2,1,2,"{'max_depth': 9, 'max_features': 'log2', 'min_...",0.166667,0.363636,0.285714,0.272006,0.080995,4


(108, 15)

# Picking the best model

In [20]:
best_model = grid.best_estimator_

Using that model to estimate performance on a test set

In [21]:
y_pred = best_model.predict(X_test)

f1 = best_model.score(X_test, y_test)

In [22]:
f1

0.9397163120567376

# RandomSearch

When your dataset is too large, searching a very large hyperparameter space becomes unfeasable

In [23]:
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [24]:
# init random search
grid = RandomizedSearchCV(DecisionTreeClassifier(), # your model
                    param_grid, #the parameter grid
                    cv=3, #how many folds do you want in your cross-validation
                    scoring='f1', # what scoring metric do you want. More here: https://scikit-learn.org/stable/modules/model_evaluation.html
                    verbose=1, # to show the messages during training
                    n_jobs=-1, # how many cores to use in your computer
                    n_iter=20, # how many iterations do you want to do,
                    random_state=40
                    )

In [25]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


18 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/simaonovais/miniconda3/envs/MLCatolica24/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/simaonovais/miniconda3/envs/MLCatolica24/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/simaonovais/miniconda3/envs/MLCatolica24/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/simaonovais/miniconda3/envs/MLCatolica24/

In [26]:
results = pd.DataFrame(grid.cv_results_).sort_values('rank_test_score')
results.head(5)
results.shape

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
17,0.001122,0.000486,0.001406,0.000364,2,4,log2,5,"{'min_samples_split': 2, 'min_samples_leaf': 4...",0.352941,0.0,0.363636,0.238859,0.168955,1
3,0.001063,0.000332,0.001039,0.000198,2,2,sqrt,9,"{'min_samples_split': 2, 'min_samples_leaf': 2...",0.166667,0.4,0.0,0.188889,0.164054,2
0,0.000936,0.000101,0.001064,0.000263,2,4,sqrt,3,"{'min_samples_split': 2, 'min_samples_leaf': 4...",0.166667,0.0,0.363636,0.176768,0.148626,3
15,0.000635,3e-05,0.000745,2e-05,2,2,log2,5,"{'min_samples_split': 2, 'min_samples_leaf': 2...",0.0,0.333333,0.142857,0.15873,0.136545,4
14,0.000989,0.000306,0.000811,5.2e-05,2,1,log2,5,"{'min_samples_split': 2, 'min_samples_leaf': 1...",0.133333,0.117647,0.222222,0.157734,0.046047,5


(20, 15)