## Imports

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from skopt import BayesSearchCV

import logging
import warnings
warnings.filterwarnings("ignore")

## Dataset

In [18]:
data = pd.read_csv('../../data/external/bankloan_cleaned.csv')

In [19]:
data.head()

Unnamed: 0,age,experience,income,zip_code,family,cc_avg,education,mortgage,personal_loan,securities_account,cd_account,online,credit_card
0,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [20]:
data.columns

Index(['age', 'experience', 'income', 'zip_code', 'family', 'cc_avg',
       'education', 'mortgage', 'personal_loan', 'securities_account',
       'cd_account', 'online', 'credit_card'],
      dtype='object')

## Modelagem

In [21]:
X, y = data.drop(columns='credit_card'), data['credit_card']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42
)

In [22]:
mlflow.set_tracking_uri('http://127.0.0.1:8080')
mlflow.set_experiment('Modelo Protótipo')
mlflow.sklearn.autolog(silent=True)

In [23]:
def rand_search_cv(model, param_grid):
    with mlflow.start_run(run_name=f'RandomSearchCV_{model.__class__.__name__}'):
        rand_search = RandomizedSearchCV(model, 
                                         param_grid, 
                                         cv=5, 
                                         n_jobs=-1,
                                         verbose=1)
        rand_search.fit(X_train, y_train)
        
        best_model = rand_search.best_estimator_
        predictions = best_model.predict(X_test)
        accuracy = accuracy_score(predictions, y_test)

        mlflow.log_metric('accuracy', accuracy)
        logging.info(f'Melhores parâmetros: {rand_search.best_params_}')
        logging.info(f'Precisão (acurácia): {accuracy}')

In [24]:
def grid_search_cv(model, param_grid):
    with mlflow.start_run(run_name=f'GridSearchCV{model.__class__.__name__}'):
        grid_search = GridSearchCV(model, 
                                         param_grid, 
                                         cv=5, 
                                         n_jobs=-1,
                                         verbose=1)
        grid_search.fit(X_train, y_train)
        
        best_model = grid_search.best_estimator_
        predictions = best_model.predict(X_test)
        accuracy = accuracy_score(predictions, y_test)

        mlflow.log_metric('accuracy', accuracy)
        logging.info(f'Melhores parâmetros: {grid_search.best_params_}')
        logging.info(f'Precisão (acurácia): {accuracy}')

In [25]:
def bayesian_search_cv(model, param_grid):
    with mlflow.start_run(run_name=f'BayesSearchCV{model.__class__.__name__}'):
        bayesian_search = BayesSearchCV(model, 
                                         param_grid, 
                                         cv=5, 
                                         n_jobs=-1,
                                         verbose=1)
        bayesian_search.fit(X_train, y_train)
        
        best_model = bayesian_search.best_estimator_
        predictions = best_model.predict(X_test)
        accuracy = accuracy_score(predictions, y_test)

        mlflow.log_metric('accuracy', accuracy)
        logging.info(f'Melhores parâmetros: {bayesian_search.best_params_}')
        logging.info(f'Precisão (acurácia): {accuracy}')

In [26]:
param_grid_rf = {'n_estimators': [10, 50, 100],
                 'criterion': ['entropy', 'gini'],
                 'max_depth': [3, 5, 10, None]}

param_grid_gb = {'n_estimators': [10, 50, 100],
                 'learning_rate': [0.01, 0.1, 0.2],
                 'max_depth': [10, 20, 30]}

param_grid_knn = {'n_neighbors': [3, 5, 7, 9]}

## Random Search Cross Validation

In [27]:
rand_search_cv(RandomForestClassifier(), param_grid_rf)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


2024/11/10 18:02:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomSearchCV_RandomForestClassifier at: http://127.0.0.1:8080/#/experiments/907665342862090684/runs/0d468e8a42a24f18bae59bc9a766e59c.
2024/11/10 18:02:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/907665342862090684.


In [28]:
rand_search_cv(GradientBoostingClassifier(), param_grid_gb)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


2024/11/10 18:06:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomSearchCV_GradientBoostingClassifier at: http://127.0.0.1:8080/#/experiments/907665342862090684/runs/09ae1c7b6a874bf4af417dd6f6a95f9d.
2024/11/10 18:06:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/907665342862090684.


In [29]:
rand_search_cv(KNeighborsClassifier(), param_grid_knn)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


2024/11/10 18:06:32 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomSearchCV_KNeighborsClassifier at: http://127.0.0.1:8080/#/experiments/907665342862090684/runs/265526866ed04ce1a71d4fd3dc862944.
2024/11/10 18:06:32 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/907665342862090684.


## Grid Search Cross Validation

In [30]:
grid_search_cv(RandomForestClassifier(), param_grid_rf)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


2024/11/10 18:07:06 INFO mlflow.tracking._tracking_service.client: 🏃 View run GridSearchCVRandomForestClassifier at: http://127.0.0.1:8080/#/experiments/907665342862090684/runs/ac67d283576242248ef90e6de2ddbb17.
2024/11/10 18:07:06 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/907665342862090684.


In [31]:
grid_search_cv(GradientBoostingClassifier(), param_grid_gb)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


2024/11/10 18:14:05 INFO mlflow.tracking._tracking_service.client: 🏃 View run GridSearchCVGradientBoostingClassifier at: http://127.0.0.1:8080/#/experiments/907665342862090684/runs/33f6391456e849e491230de8c2842b8e.
2024/11/10 18:14:05 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/907665342862090684.


In [32]:
grid_search_cv(KNeighborsClassifier(), param_grid_knn)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


2024/11/10 18:14:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run GridSearchCVKNeighborsClassifier at: http://127.0.0.1:8080/#/experiments/907665342862090684/runs/532afabe6691499b99871976c5ac28b9.
2024/11/10 18:14:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/907665342862090684.


## Bayesian Search Cross Validation

In [33]:
bayesian_search_cv(RandomForestClassifier(), param_grid_rf)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

2024/11/10 18:26:27 INFO mlflow.tracking._tracking_service.client: 🏃 View run BayesSearchCVRandomForestClassifier at: http://127.0.0.1:8080/#/experiments/907665342862090684/runs/deb0617a58ae460e8e76b379c7ef38b2.
2024/11/10 18:26:27 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/907665342862090684.


In [34]:
bayesian_search_cv(GradientBoostingClassifier(), param_grid_gb)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

2024/11/10 18:51:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run BayesSearchCVGradientBoostingClassifier at: http://127.0.0.1:8080/#/experiments/907665342862090684/runs/f56044f8936a42188114cc1dc6399b76.
2024/11/10 18:51:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/907665342862090684.


In [35]:
bayesian_search_cv(KNeighborsClassifier(), param_grid_knn)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

2024/11/10 18:59:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run BayesSearchCVKNeighborsClassifier at: http://127.0.0.1:8080/#/experiments/907665342862090684/runs/60c62d22c17f4c9ba95a510af4bc00a8.
2024/11/10 18:59:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/907665342862090684.
