# 0. Imports

## 0.1 Bibliotecas

In [1]:
from IPython.display import display, HTML

In [2]:
import sys

sys.path.insert(0, '../src/')

import joblib
from sklearn.pipeline              import Pipeline
from sklearn.preprocessing         import StandardScaler
from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine.imputation     import MeanMedianImputer
from feature_engine.wrappers       import SklearnTransformerWrapper
from sklearn.linear_model          import LogisticRegression

from utils.utils               import load_config_file
from data.dataload             import DataLoad
from data.datavalidation       import DataValidation
from data.datatranformation    import DataTransformation
from data.datapreprocessing    import DataPreprocess
from train.train               import TrainModel
from evaluation.classfier_eval import ModelEvaluation

## 0.2 Funções de Ajuda

## 0.3 Configurações do Jupyter

In [3]:
# Widen the notebook container so wide outputs fit on screen.
WIDESCREEN_CSS = "<style>.container { width:90% !important; }</style>"
display(HTML(WIDESCREEN_CSS))

# Maximum number of columns and rows pandas will display (currently disabled).
# pd.set_option('display.max_columns', 20)
# pd.set_option('display.max_rows', 60)

# 1. DataLoad

In [16]:
# Load the training data through the project's DataLoad helper.
# NOTE(review): 'train_dataset_name' looks like a config key rather than a
# file name — confirm in DataLoad.load_data.
dl = DataLoad()
df = dl.load_data('train_dataset_name')
# Display only the first row as a quick sanity check of the columns.
df.head(1)

Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0


# 2. DataValidation

In [17]:
# Run the project's validation step on the loaded frame.
# The call is the cell's last expression, so its return value is displayed.
dv = DataValidation()
dv.run(dataframe=df)

2024-01-05 14:54:28 [info     ] Validação Concluída           


True

# 3. DataTransformation

In [18]:
# Split the loaded frame into train and validation features/targets.
# NOTE(review): the split ratio and any random seed are decided inside
# DataTransformation — confirm there for reproducibility.
dt = DataTransformation(df)
X_train, X_val, y_train, y_val = dt.train_test_spliting()

# 4. Experimentation

In [7]:
import mlflow
from mlflow.tracking import MlflowClient

In [8]:
# Point the MLflow client at the tracking server and select the experiment.
# The bare last expression is kept so the cell still displays the Experiment.
TRACKING_URI = 'http://127.0.0.1:5000/'
EXPERIMENT_NAME = 'prob_learng'

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1704470502137, experiment_id='1', last_update_time=1704470502137, lifecycle_stage='active', name='prob_learng', tags={}>

In [32]:
# Baseline experiment: impute + standardise, then a default LogisticRegression.
# Everything is tracked in a single MLflow run; the `with` block closes the run
# on exit, so no explicit mlflow.end_run() is needed.
with mlflow.start_run(run_name='baseline'):
    mlflow.set_tag('model_name', 'lr_baseline')

    # 1. Pre-process the data.
    pipe = Pipeline([
        ('imputer', MeanMedianImputer(variables=load_config_file().get('vars_imputer'))),
        ('scaler', SklearnTransformerWrapper(StandardScaler())),
    ])
    preprocessador = DataPreprocess(pipe=pipe)
    preprocessador.train(X_train)

    # Write the transformed data to new names so the raw split (X_train / X_val)
    # stays intact: the cell can be re-run and later experiments can start from
    # the untransformed data.
    X_train_proc = preprocessador.tranform(X_train)
    X_val_proc = preprocessador.tranform(X_val)
    joblib.dump(preprocessador, '../models/preprocessador.joblib')

    # 1.1 Log the fitted pre-processor as an artifact.
    # log_artifact (singular) uploads one file; log_artifacts expects a directory.
    mlflow.log_artifact('../models/preprocessador.joblib')

    # 1.2 Log the pre-processing steps as run parameters (stored as their repr).
    mlflow.log_params(params={'imputer': pipe['imputer'],
                              'scaler' : pipe['scaler']})

    # 2.0 Cross-validated evaluation on the training data.
    model = LogisticRegression()
    model_eval = ModelEvaluation(model, X_train_proc, y_train)
    roc_auc_curve = model_eval.cross_val_eval()

    # 2.1 Log the mean cross-validation score.
    mlflow.log_metric('train_roc_auc', roc_auc_curve.mean())

    # 3.0 Fit the model on the full training split.
    model.fit(X_train_proc, y_train)

    # 4.0 Score the validation split and log the metric.
    # NOTE(review): this relies on model_eval.model being the estimator fitted
    # above — confirm in ModelEvaluation.
    y_pred = model_eval.model.predict_proba(X_val_proc)[:, 1]
    val_roc_curve = model_eval.evaluate_prediction(y_val, y_pred)
    mlflow.log_metric('valid_roc_curve', val_roc_curve)

    # 5.0 Log the trained model; pyfunc predictions use predict_proba.
    mlflow.sklearn.log_model(model,
                             'lr_model',
                             pyfunc_predict_fn='predict_proba')


## 4.1 Experiment 01

In [14]:
from mlflow.models import MetricThreshold
from sklearn.dummy import DummyClassifier
from mlflow.models import infer_signature

In [19]:
# Experiment 01: baseline pipeline plus equal-frequency discretisation, with
# MLflow model validation against a dummy classifier and a SHAP explanation.
# The `with` block closes the run on exit, so no explicit mlflow.end_run().
# NOTE(review): run_name and the 'model_name' tag are identical to the baseline
# cell, which makes the two runs hard to tell apart in the MLflow UI — consider
# giving this experiment its own name.
with mlflow.start_run(run_name='baseline'):
    mlflow.set_tag('model_name', 'lr_baseline')

    # 1. Pre-process the data (read the config once instead of once per step).
    config = load_config_file()
    pipe = Pipeline([
        ('imputer', MeanMedianImputer(variables=config.get('vars_imputer'))),
        ('discretizer', EqualFrequencyDiscretiser(variables=config.get('vars_discritiser'))),
        ('scaler', SklearnTransformerWrapper(StandardScaler())),
    ])
    preprocessador = DataPreprocess(pipe=pipe)
    preprocessador.train(X_train)

    # Write the transformed data to new names so the raw split (X_train / X_val)
    # stays intact and the cell can be re-run without re-splitting first.
    X_train_proc = preprocessador.tranform(X_train)
    X_val_proc = preprocessador.tranform(X_val)
    joblib.dump(preprocessador, '../models/preprocessador.joblib')

    # 1.1 Log the fitted pre-processor as an artifact.
    # log_artifact (singular) uploads one file; log_artifacts expects a directory.
    mlflow.log_artifact('../models/preprocessador.joblib')

    # 1.2 Log the pre-processing steps as run parameters (stored as their repr).
    mlflow.log_params(params={'imputer': pipe['imputer'],
                              'discretizer' : pipe['discretizer'],
                              'scaler' : pipe['scaler']})

    # 2.0 Cross-validated evaluation on the training data.
    model = LogisticRegression()
    model_eval = ModelEvaluation(model, X_train_proc, y_train, n_splits=5)
    roc_auc_curve = model_eval.cross_val_eval()

    # 2.1 Log the mean cross-validation score.
    mlflow.log_metric('train_roc_auc', roc_auc_curve.mean())

    # 3.0 Fit the model on the full training split.
    model.fit(X_train_proc, y_train)

    # 4.0 Score the validation split and log the metric.
    # NOTE(review): this relies on model_eval.model being the estimator fitted
    # above — confirm in ModelEvaluation.
    y_pred = model_eval.model.predict_proba(X_val_proc)[:, 1]
    val_roc_curve = model_eval.evaluate_prediction(y_val, y_pred)
    mlflow.log_metric('valid_roc_curve', val_roc_curve)

    # 5.0 Log the candidate model. The signature is inferred before the label
    # column is attached, and is shared with the dummy baseline so both logged
    # models declare the same input schema.
    signature = infer_signature(X_val_proc, y_val)
    candidate_model_uri = mlflow.sklearn.log_model(model,
                                                   'lr_model',
                                                   signature=signature).model_uri

    # Build the evaluation frame as a copy: assign() returns a new DataFrame,
    # so the feature matrix is not mutated by adding the target column.
    eval_data = X_val_proc.assign(label=y_val)

    # The candidate must reach 0.7 accuracy and beat the baseline by the given
    # absolute and relative margins.
    thresholds = {
        'accuracy_score': MetricThreshold(threshold=0.7,
                                          min_absolute_change=0.05,
                                          min_relative_change=0.05,
                                          greater_is_better=True)
    }

    # Uniform-random dummy classifier used as the comparison baseline.
    baseline_model = DummyClassifier(strategy='uniform').fit(X_train_proc, y_train)
    baseline_model_uri = mlflow.sklearn.log_model(baseline_model,
                                                  'baseline_model',
                                                  signature=signature).model_uri

    # MLflow's built-in evaluation: compares candidate vs. baseline on
    # eval_data and checks the thresholds above.
    mlflow.evaluate(candidate_model_uri,
                    eval_data,
                    targets = "label",
                    model_type = "classifier",
                    validation_thresholds = thresholds,
                    baseline_model = baseline_model_uri)

    # Model explainability with SHAP on the validation features only.
    # NOTE(review): this can be slow on a large validation set — consider
    # passing a sample of X_val_proc instead.
    mlflow.shap.log_explanation(model.predict,
                                X_val_proc)


Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.
Setuptools is replacing distutils.
Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.
Setuptools is replacing distutils.
Downloading artifacts: 100%|█████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 41.58it/s]
Downloading artifacts: 100%|██████

KeyboardInterrupt: 