## Imports

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from skopt import BayesSearchCV

import logging
import warnings
warnings.filterwarnings("ignore")

## Dataset

In [29]:
# Preprocessed frame: the model input columns are selected from it further down.
data = pd.read_csv('../../data/preprocessed/preprocessed_data.csv')
# Feature-store frame: the `credit_card` target column is read from it further down.
data_target = pd.read_csv('../../data/feature_store/data_with_new_features.csv')

In [30]:
# Preview the first rows of the preprocessed frame.
data.head()

Unnamed: 0,age_bracket_name,personal_loan,education,securities_account,cd_account,online,age,experience,income,family,mortgage,mortgage_log,age_bracket_name_Baby boomers,age_bracket_name_Generation X,age_bracket_name_Generation Z,age_bracket_name_Millennials,education_1,education_2,education_3
0,Generation Z,0,1,1,0,0,25,1,49,4,0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,Generation X,0,1,1,0,0,45,19,34,3,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,Millennials,0,1,0,0,0,39,15,11,1,0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,Millennials,0,2,0,0,0,35,9,100,1,0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,Millennials,0,2,0,0,0,35,8,45,4,0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [31]:
# List the column names available in the preprocessed frame.
data.columns

Index(['age_bracket_name', 'personal_loan', 'education', 'securities_account',
       'cd_account', 'online', 'age', 'experience', 'income', 'family',
       'mortgage', 'mortgage_log', 'age_bracket_name_Baby boomers',
       'age_bracket_name_Generation X', 'age_bracket_name_Generation Z',
       'age_bracket_name_Millennials', 'education_1', 'education_2',
       'education_3'],
      dtype='object')

## Modelagem

In [32]:
# Columns of `data` that are fed to the models as inputs.
features = [
    'personal_loan',
    'education',
    'securities_account',
    'cd_account',
    'online',
    'age_bracket_name_Baby boomers',
    'age_bracket_name_Generation X',
    'age_bracket_name_Generation Z',
    'age_bracket_name_Millennials',
    'education_1',
    'education_2',
    'education_3',
]

# Inputs come from the preprocessed frame, the target from the feature-store frame.
# NOTE(review): this presumes both CSVs list the same rows in the same order — confirm.
X = data[features]
y = data_target['credit_card']

# Hold out 30% of the rows for testing; the split is seeded so it can be repeated.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [64]:
# Point MLflow at the tracking server listening on this local address.
mlflow.set_tracking_uri('http://127.0.0.1:5000')

In [66]:
# Select the experiment the following runs are recorded under
# (the recorded output shows MLflow creating it when it does not exist yet).
mlflow.set_experiment('prototype-model')

2024/11/10 20:07:41 INFO mlflow.tracking.fluent: Experiment with name 'prototype-model' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/841528861803618626', creation_time=1731280061473, experiment_id='841528861803618626', last_update_time=1731280061473, lifecycle_stage='active', name='prototype-model', tags={}>

In [67]:
def rand_search_cv(model, param_grid, random_state=42):
    """Tune ``model`` with ``RandomizedSearchCV`` and record the outcome in MLflow.

    The search is fitted on the notebook-level ``X_train``/``y_train`` using
    5-fold cross-validation. The refitted best estimator is then evaluated on
    the notebook-level ``X_test``/``y_test``.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune.
    param_grid : dict
        Mapping of parameter name to the candidate values to sample from.
    random_state : int or None, default=42
        Seed for the candidate sampling, so that the same candidates are drawn
        on every run. The estimator itself is not seeded here.

    Returns
    -------
    None
        The best parameters, the best cross-validation score and the hold-out
        accuracy are logged to the MLflow run opened by this function.
    """
    with mlflow.start_run(run_name=f'RandomSearchCV_{model.__class__.__name__}'):
        rand_search = RandomizedSearchCV(model,
                                         param_grid,
                                         cv=5,
                                         n_jobs=-1,
                                         verbose=1,
                                         random_state=random_state)
        rand_search.fit(X_train, y_train)

        best_model = rand_search.best_estimator_
        predictions = best_model.predict(X_test)
        # accuracy_score expects (y_true, y_pred).
        accuracy = accuracy_score(y_test, predictions)

        # Persist the winning configuration with the run: logging.info alone is
        # dropped unless the logging module has been configured for INFO.
        mlflow.log_params(rand_search.best_params_)
        mlflow.log_metric('best_cv_score', rand_search.best_score_)
        mlflow.log_metric('accuracy', accuracy)
        logging.info(f'Melhores parâmetros: {rand_search.best_params_}')
        logging.info(f'Precisão (acurácia): {accuracy}')

In [68]:
def grid_search_cv(model, param_grid):
    """Tune ``model`` with an exhaustive ``GridSearchCV`` and record the outcome in MLflow.

    The search is fitted on the notebook-level ``X_train``/``y_train`` using
    5-fold cross-validation. The refitted best estimator is then evaluated on
    the notebook-level ``X_test``/``y_test``.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune.
    param_grid : dict
        Mapping of parameter name to the list of values to try.

    Returns
    -------
    None
        The best parameters, the best cross-validation score and the hold-out
        accuracy are logged to the MLflow run opened by this function.
    """
    # NOTE(review): unlike rand_search_cv, this run name has no '_' separator;
    # it is kept unchanged so existing runs keep the same name.
    with mlflow.start_run(run_name=f'GridSearchCV{model.__class__.__name__}'):
        grid_search = GridSearchCV(model,
                                   param_grid,
                                   cv=5,
                                   n_jobs=-1,
                                   verbose=1)
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        predictions = best_model.predict(X_test)
        # accuracy_score expects (y_true, y_pred).
        accuracy = accuracy_score(y_test, predictions)

        # Persist the winning configuration with the run: logging.info alone is
        # dropped unless the logging module has been configured for INFO.
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metric('best_cv_score', grid_search.best_score_)
        mlflow.log_metric('accuracy', accuracy)
        logging.info(f'Melhores parâmetros: {grid_search.best_params_}')
        logging.info(f'Precisão (acurácia): {accuracy}')

In [69]:
def bayesian_search_cv(model, param_grid, random_state=42):
    """Tune ``model`` with scikit-optimize's ``BayesSearchCV`` and record the outcome in MLflow.

    The search is fitted on the notebook-level ``X_train``/``y_train`` using
    5-fold cross-validation. The refitted best estimator is then evaluated on
    the notebook-level ``X_test``/``y_test``.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune.
    param_grid : dict
        Mapping of parameter name to its search space.
    random_state : int or None, default=42
        Seed for the optimiser, so that the same sequence of candidates is
        explored on every run. The estimator itself is not seeded here.

    Returns
    -------
    None
        The best parameters, the best cross-validation score and the hold-out
        accuracy are logged to the MLflow run opened by this function.
    """
    # NOTE(review): unlike rand_search_cv, this run name has no '_' separator;
    # it is kept unchanged so existing runs keep the same name.
    with mlflow.start_run(run_name=f'BayesSearchCV{model.__class__.__name__}'):
        bayesian_search = BayesSearchCV(model,
                                        param_grid,
                                        cv=5,
                                        n_jobs=-1,
                                        verbose=1,
                                        random_state=random_state)
        bayesian_search.fit(X_train, y_train)

        best_model = bayesian_search.best_estimator_
        predictions = best_model.predict(X_test)
        # accuracy_score expects (y_true, y_pred).
        accuracy = accuracy_score(y_test, predictions)

        # Persist the winning configuration with the run: logging.info alone is
        # dropped unless the logging module has been configured for INFO.
        mlflow.log_params(bayesian_search.best_params_)
        mlflow.log_metric('best_cv_score', bayesian_search.best_score_)
        mlflow.log_metric('accuracy', accuracy)
        logging.info(f'Melhores parâmetros: {bayesian_search.best_params_}')
        logging.info(f'Precisão (acurácia): {accuracy}')

In [70]:
# Candidate hyper-parameter values handed to the search helpers, one grid per model.

# Random forest
param_grid_rf = dict(
    n_estimators=[10, 50, 100],
    criterion=['entropy', 'gini'],
    max_depth=[3, 5, 10, None],
)

# Gradient boosting
param_grid_gb = dict(
    n_estimators=[10, 50, 100],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[10, 20, 30],
)

# k-nearest neighbours
param_grid_knn = dict(n_neighbors=[3, 5, 7, 9])

## Random Search Cross Validation

In [71]:
# Randomized search over the random-forest grid.
rand_search_cv(model=RandomForestClassifier(), param_grid=param_grid_rf)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


2024/11/10 20:08:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomSearchCV_RandomForestClassifier at: http://127.0.0.1:5000/#/experiments/841528861803618626/runs/e1bf52f100404dd18cc6d19b2e187e67.
2024/11/10 20:08:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/841528861803618626.


In [72]:
# Randomized search over the gradient-boosting grid.
rand_search_cv(model=GradientBoostingClassifier(), param_grid=param_grid_gb)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


2024/11/10 20:08:24 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomSearchCV_GradientBoostingClassifier at: http://127.0.0.1:5000/#/experiments/841528861803618626/runs/27e8d7b305424920a579d6d954e52d0f.
2024/11/10 20:08:24 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/841528861803618626.


In [73]:
# Randomized search over the k-nearest-neighbours grid.
rand_search_cv(model=KNeighborsClassifier(), param_grid=param_grid_knn)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


2024/11/10 20:08:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomSearchCV_KNeighborsClassifier at: http://127.0.0.1:5000/#/experiments/841528861803618626/runs/50cbd4bd30794c9e80969f30f3729d91.
2024/11/10 20:08:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/841528861803618626.


## Grid Search Cross Validation

In [74]:
# Exhaustive grid search over the random-forest grid.
grid_search_cv(model=RandomForestClassifier(), param_grid=param_grid_rf)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


2024/11/10 20:08:37 INFO mlflow.tracking._tracking_service.client: 🏃 View run GridSearchCVRandomForestClassifier at: http://127.0.0.1:5000/#/experiments/841528861803618626/runs/94cf77d5652c4280a02715e53a63dae1.
2024/11/10 20:08:37 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/841528861803618626.


In [75]:
# Exhaustive grid search over the gradient-boosting grid.
grid_search_cv(model=GradientBoostingClassifier(), param_grid=param_grid_gb)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


2024/11/10 20:08:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run GridSearchCVGradientBoostingClassifier at: http://127.0.0.1:5000/#/experiments/841528861803618626/runs/6652d0f5a1f94a70ae8a4c79d7f55607.
2024/11/10 20:08:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/841528861803618626.


In [76]:
# Exhaustive grid search over the k-nearest-neighbours grid.
grid_search_cv(model=KNeighborsClassifier(), param_grid=param_grid_knn)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


2024/11/10 20:08:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run GridSearchCVKNeighborsClassifier at: http://127.0.0.1:5000/#/experiments/841528861803618626/runs/6b16ccd9a6e5468dbd5ef52b6ff5bf35.
2024/11/10 20:08:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/841528861803618626.


## Bayesian Search Cross Validation

In [77]:
# Bayesian search over the random-forest grid.
bayesian_search_cv(model=RandomForestClassifier(), param_grid=param_grid_rf)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

2024/11/10 20:13:37 INFO mlflow.tracking._tracking_service.client: 🏃 View run BayesSearchCVRandomForestClassifier at: http://127.0.0.1:5000/#/experiments/841528861803618626/runs/bf601a991cb344d799eaeae5e3234ea5.
2024/11/10 20:13:37 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/841528861803618626.


In [78]:
# Bayesian search over the gradient-boosting grid.
bayesian_search_cv(model=GradientBoostingClassifier(), param_grid=param_grid_gb)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

2024/11/10 20:18:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run BayesSearchCVGradientBoostingClassifier at: http://127.0.0.1:5000/#/experiments/841528861803618626/runs/a2a90afbca1f4f03af4733f2e12cfab1.
2024/11/10 20:18:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/841528861803618626.


In [79]:
# Bayesian search over the k-nearest-neighbours grid.
bayesian_search_cv(model=KNeighborsClassifier(), param_grid=param_grid_knn)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

2024/11/10 20:21:40 INFO mlflow.tracking._tracking_service.client: 🏃 View run BayesSearchCVKNeighborsClassifier at: http://127.0.0.1:5000/#/experiments/841528861803618626/runs/02f3d67e3be64b5fb999aa24b3845d8a.
2024/11/10 20:21:40 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/841528861803618626.
