# 0 - Imports

In [1]:
import sys
import joblib

# Retornar para a pasta 'src' para poder importar os módulos criados em 'data/', 'utils/', etc
sys.path.insert(0, '../src/')


# import yaml consig file
from utils.utils import load_config_file

# Data load step
from data.data_load import DataLoad 

# Data validation step
from data.data_validation import DataValidation 

# Data transformation step
from data.data_transformation import DataTransformation 

# Data preprocessing step
from data.data_preprocessing import DataPreprocessing

# pipeline
from sklearn.pipeline import Pipeline

# imputers and discretizers
from feature_engine.imputation import MeanMedianImputer
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser

# scaler
from sklearn.preprocessing import StandardScaler

# sklearn wrapper
from feature_engine.wrappers import SklearnTransformerWrapper

# import yaml consig file
from utils.utils import load_config_file

# Train model step
from train.train import ModelTraining

# Model validation step
from evaluation.classifier_evaluation import ModelEvaluating


from sklearn.linear_model import LogisticRegression

In [2]:
# Loading yaml file
yaml_file = load_config_file()

# 1 - Data loading

In [3]:
# Instantiating 'data_loader' object

data_loader = DataLoad()

# loading csv file as dataframe from yaml config file using 'load_data()' method 

df = data_loader.load_data('train_dataset_name')

df.head()

[2m2024-03-24 12:43:10[0m [[32m[1minfo     [0m] [1mStaring data loading with: train_dataset_name[0m


[2m2024-03-24 12:43:11[0m [[32m[1minfo     [0m] [1mData loaded successfully![0m


Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0


# 2 - Data checking

In [4]:
# Instantiating 'data_validator' object

data_validator = DataValidation()


# run data validation
data_validator.run(df)

[2m2024-03-24 12:43:11[0m [[32m[1minfo     [0m] [1mStarting data shape checking[0m
[2m2024-03-24 12:43:11[0m [[32m[1minfo     [0m] [1mData shape validation started![0m
[2m2024-03-24 12:43:11[0m [[32m[1minfo     [0m] [1mStarting column types validation[0m
[2m2024-03-24 12:43:11[0m [[32m[1minfo     [0m] [1mColumn validation passed![0m
[2m2024-03-24 12:43:11[0m [[32m[1minfo     [0m] [1mValidation step concluded![0m


True

# 3 - Data transformation (split)

In [5]:
# Instantiating 'data_transformer' object

data_transformer = DataTransformation(df)


# getting X_train, X_val, y_train, y_val splits
X_train, X_val, y_train, y_val = data_transformer.train_test_data_split()

[2m2024-03-24 12:43:11[0m [[32m[1minfo     [0m] [1mTarget 'target' variable found in the dataframe. Performing X, y split and train/val split[0m
[2m2024-03-24 12:43:11[0m [[32m[1minfo     [0m] [1mData split performed successfully![0m


# 4 - Experimentations (MLFlow)

In [6]:
import mlflow
from mlflow.tracking import MlflowClient

In [7]:
# Definir o servidor do MLFlow:
# fazer a ligação entre os códigos executados nesse notebook com a UI do MLFlow
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

#mlflow.set_tracking_uri("sqlite:///mlruns.db")

# Criar um novo experimento:
mlflow.set_experiment('prob_loan_exp') # definir o nome

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1711251631903, experiment_id='1', last_update_time=1711251631903, lifecycle_stage='active', name='prob_loan_exp', tags={}>

In [8]:
# Criar um experimento base (execução/run)

with mlflow.start_run(run_name='baseline_test'):

    mlflow.set_tag('model_name', 'lr_baseline')


    # 1.0 Preprocessar os dados: =============================================

    pipeline =  Pipeline(
                    [
                     ('imputer', MeanMedianImputer(variables=yaml_file['variables_imputer'])),
                                         
                     ('scaler', SklearnTransformerWrapper(StandardScaler()))
                                                                              ]
                      )
    
    # Instantiating 'data_preprocessor' object from DataPreprocessing class
    data_preprocessor = DataPreprocessing(pipeline)

    # Fitting pipeline
    data_preprocessor.fit_pipeline(X_train)


    # Transforming data
    
    X_train_processed = data_preprocessor.transform_pipeline(X_train) # X_train
    
    X_val_processed = data_preprocessor.transform_pipeline(X_val) # X_val

    joblib.dump(data_preprocessor, '../models/preprocessor.joblib')

    # ------------------------------------------------------------------------------


    # 1.1 logar um artefato (proprocessor)
    mlflow.log_artifact('../models/preprocessor.joblib')

    # 1.2 logar os parâmetros do proprocessador
    mlflow.log_params(params={'imputer':pipeline['imputer'],
                              'scaler':pipeline['scaler']})
    


    # 2.0 Treinamento do modelo e Cross validation: =============================================

    model_instance = LogisticRegression()

    # Instantiating 'model_trainer' object from DataPreprocessing class
    model_trainer = ModelTraining(X_train_processed, y_train)

    # Fitting model
    fitted_model = model_trainer.fit(model_instance)

    # Instantiating 'model_evaluator' object from DataPreprocessing class
    model_evaluator = ModelEvaluating(fitted_model, X_train_processed, y_train)


    # Getting metric scores
    scores = model_evaluator.cross_val_evaluation()

    # ------------------------------------------------------------------------------


    # 2.1 logar o resultado da métrica do modelo
    mlflow.log_metric('CV_roc_auc', scores.mean())



    # 3.0 Treinamento do modelo =============================================
    
    # Fitting model
    model_instance.fit(X_train_processed, y_train)


    # 4.0 Salvar metricas do X_val
    y_val_probas = model_instance.predict_proba(X_val_processed)[:,1]
    X_val_roc_auc = model_evaluator.evaluate_predictions(y_val, y_val_probas)

    # logar o resultado da métrica do modelo
    mlflow.log_metric('Val_roc_auc', X_val_roc_auc)



    # 5.0 Logar o modelo
    mlflow.sklearn.log_model(model_instance, 'lr_model', pyfunc_predict_fn='predict_proba')


    # end run
    mlflow.end_run()


[2m2024-03-24 11:53:10[0m [[32m[1minfo     [0m] [1mPipeline started...[0m
[2m2024-03-24 11:53:10[0m [[32m[1minfo     [0m] [1mPipeline fitting started...[0m
[2m2024-03-24 11:53:10[0m [[32m[1minfo     [0m] [1mPipeline fitting finished![0m
[2m2024-03-24 11:53:10[0m [[32m[1minfo     [0m] [1mData transformation with fitted pipeline started...[0m
[2m2024-03-24 11:53:10[0m [[32m[1minfo     [0m] [1mData transformation with fitted pipeline finished![0m
[2m2024-03-24 11:53:10[0m [[32m[1minfo     [0m] [1mData transformation with fitted pipeline started...[0m
[2m2024-03-24 11:53:10[0m [[32m[1minfo     [0m] [1mData transformation with fitted pipeline finished![0m
/home/guhduarte/Repos/MLFlow_Loan/project/models/Logistic_Regression.joblib
[2m2024-03-24 11:53:10[0m [[32m[1minfo     [0m] [1mCross validation started...[0m
[2m2024-03-24 11:53:12[0m [[32m[1minfo     [0m] [1mFitted model: LogisticRegression()[0m
[2m2024-03-24 11:53:12[0m [[

## 4.1 - Experimento 1

In [None]:
# No experimento anterior (run = 'baseline') foi realizado o fit do modelo sem o discretizer no pipeline
# No experimento abaixo será incluído o discretizer ao pipeline

In [11]:
# Importar módulo que salva o experimento apenas de atingir um threshold predefinido
from mlflow.models import MetricThreshold, infer_signature

# Importar classificador dummy para comparação
from sklearn.dummy import DummyClassifier

In [None]:
# Criar um experimento base (execução/run)

with mlflow.start_run(run_name='with_discretizer'):

    mlflow.set_tag('model_name', 'lr_discretizer')


    # 1.0 Preprocessar os dados: =============================================

    pipeline =  Pipeline(
                    [
                     ('imputer', MeanMedianImputer(variables=yaml_file['variables_imputer'])),


                     ('discretizer', EqualFrequencyDiscretiser(variables=yaml_file['variables_discretiser'])),                  
                     
                                        
                     ('scaler', SklearnTransformerWrapper(StandardScaler()))
                                                                              ]
                      )
    
    # Instantiating 'data_preprocessor' object from DataPreprocessing class
    data_preprocessor = DataPreprocessing(pipeline)

    # Fitting pipeline
    data_preprocessor.fit_pipeline(X_train)


    # Transforming data
    
    X_train_processed = data_preprocessor.transform_pipeline(X_train) # X_train
    
    X_val_processed = data_preprocessor.transform_pipeline(X_val) # X_val

    joblib.dump(data_preprocessor, '../models/preprocessor.joblib')

    # ------------------------------------------------------------------------------


    # 1.1 logar um artefato (proprocessor)
    mlflow.log_artifact('../models/preprocessor.joblib')

    # 1.2 logar os parâmetros do proprocessador
    mlflow.log_params(params={'imputer':pipeline['imputer'],
                              'discretizer':pipeline['discretizer'],
                              'scaler':pipeline['scaler']})
    


    # 2.0 Treinamento do modelo e Cross validation: =============================================

    model_instance = LogisticRegression()

    # Instantiating 'model_trainer' object from DataPreprocessing class
    model_trainer = ModelTraining(X_train_processed, y_train)

    # Fitting model
    fitted_model = model_trainer.fit(model_instance)

    # Instantiating 'model_evaluator' object from DataPreprocessing class
    model_evaluator = ModelEvaluating(fitted_model, X_train_processed, y_train)


    # Getting metric scores
    scores = model_evaluator.cross_val_evaluation()

    # ------------------------------------------------------------------------------


    # 2.1 logar o resultado da métrica do modelo
    mlflow.log_metric('CV_roc_auc', scores.mean())



    # 3.0 Treinamento do modelo =============================================
    
    # Fitting model
    model_instance.fit(X_train_processed, y_train)


    # 4.0 Salvar metricas do X_val
    y_val_probas = model_instance.predict_proba(X_val_processed)[:,1]
    X_val_roc_auc = model_evaluator.evaluate_predictions(y_val, y_val_probas)

    # logar o resultado da métrica do modelo
    mlflow.log_metric('Val_roc_auc', X_val_roc_auc)



    # 5.0 Logar o modelo candidato e capturar a uri para mlflow.evaluate
    model_candidate_uri = mlflow.sklearn.log_model(model_instance,
                                               'lr_model').model_uri


    # ===============================

    # Inferir assinatura dos dados (aparece o dataset no MLFlow)
    signature = infer_signature(X_val_processed, y_val)

    # Separar os dados para avaliação do MLFlow (p/ usar no método mlflow.evaluate()):
    eval_data = X_val_processed.copy() # criando uma cópia de X_val_processed
    eval_data['label'] = y_val # criando a coluna target


    # Definindo o threshold do experimento
    thresholds = {
        yaml_file.get('metric_to_evaluate'): MetricThreshold(
            
            threshold=yaml_file.get('metric_threshold'), # a métrica precisa ser maior que esse valor para que o experimento seja salvo
            
            min_absolute_change=yaml_file.get('metric_min_absolute_change'),
            
            min_relative_change=yaml_file.get('metric_min_relative_change'),
            
            greater_is_better=yaml_file.get('greater_is_better')
        )
    }

    # Instanciando o classificador dummy
    baseline_model_instance = DummyClassifier(strategy='uniform').fit(X_train_processed, y_train)

    
    # Logar o modelo dummy e capturar a uri para mlflow.evaluate
    baseline_model_instance_uri = mlflow.sklearn.log_model(baseline_model_instance,
                                                           'baseline_model',
                                                           signature=signature).model_uri
    


    # Iniciar avaliação:

    # processo responsável por avaliar/comparar modelos no MLFlow
    mlflow.evaluate(model_candidate_uri,
                    eval_data,
                    targets='label',
                    model_type=yaml_file.get('model_type'),
                    validation_thresholds=thresholds,
                    baseline_model=baseline_model_instance_uri)
    

    # Explicabilidade com SHAP
    mlflow.shap.log_explanation(model_instance.predict, X_val_processed)


    # end run
    mlflow.end_run()


In [21]:
0

0

# Rascunhos

In [21]:
# Obtendo o id do experimento

experiment_name = "prob_loan"
current_experiment=dict(mlflow.get_experiment_by_name(experiment_name))
experiment_id=current_experiment['experiment_id']
current_experiment

{'artifact_location': 'mlflow-artifacts:/1',
 'creation_time': 1711210130178,
 'experiment_id': '1',
 'last_update_time': 1711212335290,
 'lifecycle_stage': 'deleted',
 'name': 'prob_loan',
 'tags': {}}

In [22]:
# Obtendo o id do experimento

experiment_name = "prob_loan_exp"
current_experiment=dict(mlflow.get_experiment_by_name(experiment_name))
experiment_id=current_experiment['experiment_id']
current_experiment

{'artifact_location': 'mlflow-artifacts:/2',
 'creation_time': 1711212708474,
 'experiment_id': '2',
 'last_update_time': 1711212708474,
 'lifecycle_stage': 'active',
 'name': 'prob_loan_exp',
 'tags': {}}

In [None]:
# Deletar experimento

import mlflow
mlflow.delete_experiment(experiment_id=1)

RestException: RESOURCE_DOES_NOT_EXIST: No Experiment with id=1 exists

In [11]:
from mlflow.tracking import MlflowClient
client = MlflowClient()
all_experiments = client.search_experiments()

all_experiments

[<Experiment: artifact_location='mlflow-artifacts:/2', creation_time=1711212708474, experiment_id='2', last_update_time=1711212708474, lifecycle_stage='active', name='prob_loan_exp', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1711209927294, experiment_id='0', last_update_time=1711209927294, lifecycle_stage='active', name='Default', tags={}>]

In [14]:
import mlflow
logged_model = 'runs:/cab6bf40fbd04598b853dcdfc3a70580/lr_model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

In [15]:
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: lr_model
  flavor: mlflow.sklearn
  run_id: cab6bf40fbd04598b853dcdfc3a70580

In [77]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           12Gi       1.7Gi       9.7Gi       0.0Ki       993Mi        10Gi
Swap:         4.0Gi          0B       4.0Gi
