# Imports

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [10]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Start

In [11]:
data_url = 'https://raw.githubusercontent.com/Hospital-Da-Luz-Learning-Health/MLCatolica24/main/Aula%208%20-%20Metrics%20%26%20Model%20Optimization/data/liver_disease.csv'
df = pd.read_csv(data_url, index_col = 'id')
df.head(3)
df.shape

Unnamed: 0_level_0,Age,Gender,BMI,AlcoholConsumption,Smoking,GeneticRisk,PhysicalActivity,Diabetes,Hypertension,LiverFunctionTest,Diagnosis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
572,75,1,32.960731,10.911642,0,0,8.930014,0,1,89.950963,1
125,20,1,30.298513,9.417347,0,0,1.484017,0,0,75.777573,1
1553,21,1,33.230869,1.685287,1,1,3.411027,0,0,69.226081,1


(804, 11)

In [12]:
X = df.drop(columns='Diagnosis')
y = df['Diagnosis']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Logistic Regression

Lookup the parameters in the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html). Or ask chatgpt to help you build a parameter grid for logistic regression from sklearn

In [14]:
param_grid = {
    'C': np.logspace(-4, 4, 20),
    'penalty': ['l1', 'l2'],
    'class_weight': ['balanced', None],
    'solver': ['liblinear']
}

In [15]:
grid = GridSearchCV(LogisticRegression(), # your model
                    param_grid, #the parameter grid
                    cv=3, #how many folds do you want in your cross-validation
                    scoring='f1', # what scoring metric do you want. More here: https://scikit-learn.org/stable/modules/model_evaluation.html
                    verbose=1, # to show the messages during training
                    n_jobs=-1 # how many cores to use in your computer
                    )

In [16]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 80 candidates, totalling 240 fits


In [17]:
results = pd.DataFrame(grid.cv_results_).sort_values('rank_test_score')
results.head(5)
results.shape

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
41,0.003363,0.000818,0.002236,0.000524,1.623777,balanced,l2,liblinear,"{'C': 1.623776739188721, 'class_weight': 'bala...",0.315789,0.225806,0.511628,0.351075,0.119324,1
45,0.001859,0.000571,0.001853,0.000481,4.281332,balanced,l2,liblinear,"{'C': 4.281332398719396, 'class_weight': 'bala...",0.321429,0.175439,0.52381,0.340226,0.142842,2
49,0.005465,0.004546,0.002465,0.001449,11.288379,balanced,l2,liblinear,"{'C': 11.288378916846883, 'class_weight': 'bal...",0.296296,0.206897,0.511628,0.338274,0.127898,3
40,0.009724,0.004704,0.006869,0.00836,1.623777,balanced,l1,liblinear,"{'C': 1.623776739188721, 'class_weight': 'bala...",0.301887,0.206897,0.5,0.336261,0.122103,4
36,0.006769,0.00243,0.002873,0.001835,0.615848,balanced,l1,liblinear,"{'C': 0.615848211066026, 'class_weight': 'bala...",0.321429,0.175439,0.5,0.332289,0.132724,5


(80, 15)

## Interpreting what features made the best models (optional)

In [18]:
features = results.filter(like='param_')
features = pd.get_dummies(features)
target = results['mean_test_score']

In [19]:
features.head(3)

Unnamed: 0,param_C,param_class_weight_balanced,param_penalty_l1,param_penalty_l2,param_solver_liblinear
41,1.623777,True,False,True,True
45,4.281332,True,False,True,True
49,11.288379,True,False,True,True


In [20]:
target.head(3)

41    0.351075
45    0.340226
49    0.338274
Name: mean_test_score, dtype: float64

### Using Linear Regression

In [21]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(features, target)

pd.Series(data=model.coef_, index=features.columns).sort_values(ascending=False).to_frame('Coeficient Value')

Unnamed: 0,Coeficient Value
param_class_weight_balanced,0.169087
param_penalty_l2,0.002197
param_C,1.4e-05
param_solver_liblinear,0.0
param_penalty_l1,-0.002197


Seems that only the hyperparameter `class_weight` set as `balanced` linearly affects the model (in this case, positively)

### Using Decision Tree

In [22]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor(max_depth=3)

model.fit(features, target)

pd.Series(data=model.feature_importances_, index=features.columns).sort_values(ascending=False).to_frame('Feature Importance')

Unnamed: 0,Feature Importance
param_C,0.546751
param_class_weight_balanced,0.453249
param_penalty_l1,0.0
param_penalty_l2,0.0
param_solver_liblinear,0.0


# Decision Tree

Lookup the parameters in the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html). Or ask chatgpt to help you build a parameter grid for logistic regression from sklearn

In [23]:
from sklearn.tree import DecisionTreeClassifier

In [24]:
# parameter grid for decision tree
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [25]:
grid = GridSearchCV(DecisionTreeClassifier(), # your model
                    param_grid, #the parameter grid
                    cv=3, #how many folds do you want in your cross-validation
                    scoring='f1', # what scoring metric do you want. More here: https://scikit-learn.org/stable/modules/model_evaluation.html
                    verbose=1, # to show the messages during training
                    n_jobs=-1 # how many cores to use in your computer
                    )

In [26]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


108 fits failed out of a total of 324.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
73 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/simaonovais/miniconda3/envs/MLCatolica24/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/simaonovais/miniconda3/envs/MLCatolica24/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/simaonovais/miniconda3/envs/MLCatolica24/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/simaonovais/miniconda3/envs/MLCatolica

In [27]:
results = pd.DataFrame(grid.cv_results_).sort_values('rank_test_score')
results.head(5)
results.shape

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
107,0.000951,0.000102,0.000827,4e-05,9,log2,4,10,"{'max_depth': 9, 'max_features': 'log2', 'min_...",0.5,0.125,0.166667,0.263889,0.16782,1
68,0.001161,0.00046,0.001675,0.000642,7,sqrt,2,10,"{'max_depth': 7, 'max_features': 'sqrt', 'min_...",0.375,0.222222,0.111111,0.236111,0.108179,2
46,0.001237,0.000441,0.001504,0.000974,5,log2,1,5,"{'max_depth': 5, 'max_features': 'log2', 'min_...",0.375,0.2,0.117647,0.230882,0.107309,3
105,0.00115,0.00023,0.001404,0.000417,9,log2,4,2,"{'max_depth': 9, 'max_features': 'log2', 'min_...",0.352941,0.0,0.333333,0.228758,0.161954,4
50,0.001028,0.00045,0.000815,0.000102,5,log2,2,10,"{'max_depth': 5, 'max_features': 'log2', 'min_...",0.307692,0.125,0.25,0.227564,0.076252,5


(108, 15)

# Picking the best model

In [28]:
best_model = grid.best_estimator_

Using that model to estimate performance on a test set

In [29]:
y_pred = best_model.predict(X_test)

f1 = best_model.score(X_test, y_test)

In [30]:
f1

0.9503105590062112