# Imports

In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [3]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
# (this is why cells below can end with several bare expressions such as `.head()` and `.shape`).
InteractiveShell.ast_node_interactivity = "all"

# Start

In [4]:
# Liver-disease CSV hosted in the course repository; the `id` column becomes the row index.
data_url = (
    'https://raw.githubusercontent.com/Hospital-Da-Luz-Learning-Health/MLCatolica24/main/Aula%208%20-%20Metrics%20%26%20Model%20Optimization/data/liver_disease.csv'
)
df = pd.read_csv(filepath_or_buffer=data_url, index_col='id')
# Quick sanity check: first rows and overall dimensions.
df.head(n=3)
df.shape

Unnamed: 0_level_0,Age,Gender,BMI,AlcoholConsumption,Smoking,GeneticRisk,PhysicalActivity,Diabetes,Hypertension,LiverFunctionTest,Diagnosis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
572,75,1,32.960731,10.911642,0,0,8.930014,0,1,89.950963,1
125,20,1,30.298513,9.417347,0,0,1.484017,0,0,75.777573,1
1553,21,1,33.230869,1.685287,1,1,3.411027,0,0,69.226081,1


(804, 11)

In [23]:
# Separate the label column (`Diagnosis`) from the remaining predictor columns.
y = df['Diagnosis']
X = df.drop('Diagnosis', axis=1)

In [24]:
# Hold out 20% of the rows for testing; stratifying on `y` keeps the class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Logistic Regression

Look up the parameters in the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html), or ask ChatGPT to help you build a parameter grid for scikit-learn's logistic regression.

In [36]:
# Search space for the logistic regression:
# 20 log-spaced values of C between 1e-4 and 1e4, both penalties,
# with and without class balancing, always using the liblinear solver.
param_grid = dict(
    C=np.logspace(start=-4, stop=4, num=20),
    penalty=['l1', 'l2'],
    class_weight=['balanced', None],
    solver=['liblinear'],
)

In [37]:
# Exhaustive search over every combination in `param_grid`, scored by cross-validated F1.
grid = GridSearchCV(
    # Fixed seed: the liblinear solver shuffles the data internally, so without it
    # repeated runs of the search are not guaranteed to give identical scores.
    LogisticRegression(random_state=42),  # your model
    param_grid,  # the parameter grid
    cv=3,  # how many folds do you want in your cross-validation
    scoring='f1',  # what scoring metric do you want. More here: https://scikit-learn.org/stable/modules/model_evaluation.html
    verbose=1,  # to show the messages during training
    n_jobs=-1,  # how many cores to use in your computer (-1 means all of them)
)

In [38]:
# Run the whole search on the training split only; the test split is not touched here.
grid.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 80 candidates, totalling 240 fits


In [97]:
# One row per candidate from the grid search, best-ranked candidate first.
results = (
    pd.DataFrame(grid.cv_results_)
    .sort_values(by='rank_test_score')
)
results.head(n=5)
results.shape

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
41,0.001133,2.7e-05,0.000726,2.1e-05,1.623777,balanced,l2,liblinear,"{'C': 1.623776739188721, 'class_weight': 'bala...",0.315789,0.225806,0.511628,0.351075,0.119324,1
45,0.001901,0.001055,0.001052,0.000479,4.281332,balanced,l2,liblinear,"{'C': 4.281332398719396, 'class_weight': 'bala...",0.321429,0.175439,0.52381,0.340226,0.142842,2
49,0.00136,0.000258,0.001009,0.0003,11.288379,balanced,l2,liblinear,"{'C': 11.288378916846883, 'class_weight': 'bal...",0.296296,0.206897,0.511628,0.338274,0.127898,3
40,0.00563,0.00098,0.001482,0.000583,1.623777,balanced,l1,liblinear,"{'C': 1.623776739188721, 'class_weight': 'bala...",0.301887,0.206897,0.5,0.336261,0.122103,4
36,0.003883,0.00037,0.000938,0.000133,0.615848,balanced,l1,liblinear,"{'C': 0.615848211066026, 'class_weight': 'bala...",0.321429,0.175439,0.5,0.332289,0.132724,5


(80, 15)

## Interpreting which hyperparameters made the best models

In [93]:
# Design matrix for the meta-models below: one row per grid-search candidate,
# one column per searched hyperparameter (the `param_*` columns of `results`).
features = results.filter(like='param_')
# Some scikit-learn versions store every `param_*` column as object dtype. get_dummies
# one-hot-encodes object columns, which would turn C into one dummy per value,
# so force it to a single numeric column first.
features = features.astype({'param_C': float})
# Categorical hyperparameters become 0/1 indicator columns; a missing value
# (class_weight=None) gets no column of its own.
features = pd.get_dummies(features)
# What the meta-models try to explain: the mean cross-validated score of each candidate.
target = results['mean_test_score']

In [94]:
# Peek at the encoded hyperparameter columns.
features.head(n=3)

Unnamed: 0,param_C,param_class_weight_balanced,param_penalty_l1,param_penalty_l2,param_solver_liblinear
41,1.623777,True,False,True,True
45,4.281332,True,False,True,True
49,11.288379,True,False,True,True


In [95]:
# Peek at the matching mean cross-validated scores.
target.head(n=3)

41    0.351075
45    0.340226
49    0.338274
Name: mean_test_score, dtype: float64

### Using Linear Regression

In [98]:
# Linear meta-model: explain each candidate's mean CV score from its hyperparameters.
# (`LinearRegression` is imported in the imports cell at the top of the notebook.)
model = LinearRegression()

model.fit(features, target)

# One coefficient per hyperparameter column, largest first.
(
    pd.Series(data=model.coef_, index=features.columns)
    .sort_values(ascending=False)
    .to_frame('Coeficient Value')
)

Unnamed: 0,Coeficient Value
param_class_weight_balanced,0.169087
param_penalty_l2,0.002197
param_C,1.4e-05
param_solver_liblinear,0.0
param_penalty_l1,-0.002197


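Seems that only the hyperparameter `class_weight` set to `balanced` has a noticeable linear effect on the score (in this case, a positive one). Keep in mind that `C` enters this regression on its raw scale (from 1e-4 to 1e4), so its tiny coefficient is per unit of `C` and is not directly comparable with the coefficients of the 0/1 dummy columns.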

### Using Decision Tree

In [101]:
# Shallow tree meta-model: captures non-linear effects of the hyperparameters on the mean CV score.
# (`DecisionTreeRegressor` is imported in the imports cell at the top of the notebook.)
# Fixed seed: the tree permutes features at each split, so ties between equally good
# splits could otherwise be broken differently from run to run.
model = DecisionTreeRegressor(max_depth=3, random_state=42)

model.fit(features, target)

# Share of the tree's total impurity reduction attributed to each hyperparameter column.
(
    pd.Series(data=model.feature_importances_, index=features.columns)
    .sort_values(ascending=False)
    .to_frame('Feature Importance')
)

Unnamed: 0,Feature Importance
param_C,0.546751
param_class_weight_balanced,0.453249
param_penalty_l1,0.0
param_penalty_l2,0.0
param_solver_liblinear,0.0
