# 0 - Imports

In [23]:
import sys
import joblib

# Add the 'src' folder to the import path so the modules created in 'data/', 'utils/', etc. can be imported
sys.path.insert(0, '../src/')


# import yaml config file loader
from utils.utils import load_config_file

# Data load step
from data.data_load import DataLoad 

# Data validation step
from data.data_validation import DataValidation 

# Data transformation step
from data.data_transformation import DataTransformation 

# Data preprocessing step
from data.data_preprocessing import DataPreprocessing

# pipeline
from sklearn.pipeline import Pipeline

# imputers and discretizers
from feature_engine.imputation import MeanMedianImputer
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser

# scaler
from sklearn.preprocessing import StandardScaler

# sklearn wrapper
from feature_engine.wrappers import SklearnTransformerWrapper

# import yaml config file loader
from utils.utils import load_config_file

# Train model step
from train.train import ModelTraining

# Model validation step
from evaluation.classifier_evaluation import ModelEvaluating


from sklearn.linear_model import LogisticRegression

In [24]:
# Load the project configuration via the project's `load_config_file` helper.
# Later cells read the 'tracking_uri' and 'experiment_name' entries from it with `.get(...)`.
# NOTE(review): presumably this parses a YAML file (per the variable name); the file path is not visible here.
yaml_file = load_config_file()

# 1 - Data loading

In [25]:
# Create the loader object from the project's data.data_load module.

data_loader = DataLoad()

# Load the training data as a DataFrame. 'train_dataset_name' is passed as a
# lookup key, not a file name.
# NOTE(review): presumably the key is resolved to a CSV path through the YAML config — confirm in DataLoad.

df = data_loader.load_data('train_dataset_name')

# Show the first rows as the cell output for a quick visual check.
df.head()

[2m2024-03-24 19:46:04[0m [[32m[1minfo     [0m] [1mStaring data loading with: train_dataset_name[0m
[2m2024-03-24 19:46:05[0m [[32m[1minfo     [0m] [1mData loaded successfully![0m


Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0


# 2 - Data checking

In [26]:
# Create the validator object from the project's data.data_validation module.

data_validator = DataValidation()


# Run the validation on the loaded frame. The saved log output shows a data
# shape check followed by a column-type check; the call's return value
# (True in the saved output) is displayed as the cell result.
data_validator.run(df)

[2m2024-03-24 19:46:05[0m [[32m[1minfo     [0m] [1mStarting data shape checking[0m
[2m2024-03-24 19:46:05[0m [[32m[1minfo     [0m] [1mData shape validation started![0m
[2m2024-03-24 19:46:05[0m [[32m[1minfo     [0m] [1mStarting column types validation[0m
[2m2024-03-24 19:46:05[0m [[32m[1minfo     [0m] [1mColumn validation passed![0m
[2m2024-03-24 19:46:05[0m [[32m[1minfo     [0m] [1mValidation step concluded![0m


True

# 3 - Data transformation (split)

In [27]:
# Create the transformer object around the validated frame.

data_transformer = DataTransformation(df)


# Split into training and validation features/labels. The saved log output shows
# that the 'target' column is used for the X/y split.
# NOTE(review): split ratio and random seed are not visible here — confirm in DataTransformation.
X_train, X_val, y_train, y_val = data_transformer.train_test_data_split()

[2m2024-03-24 19:46:06[0m [[32m[1minfo     [0m] [1mTarget 'target' variable found in the dataframe. Performing X, y split and train/val split[0m
[2m2024-03-24 19:46:06[0m [[32m[1minfo     [0m] [1mData split performed successfully![0m


# 4 - Experimentations (MLFlow)

In [28]:
import mlflow
from mlflow.tracking import MlflowClient

In [34]:
# Point MLflow at the tracking server: this links the code run in this
# notebook to the MLflow UI.
# `.get()` returns None for a missing key, so check explicitly and fail here
# with a clear message instead of handing None to MLflow.
_tracking_uri = yaml_file.get('tracking_uri')
if not _tracking_uri:
    raise ValueError("'tracking_uri' is missing or empty in the config file")
mlflow.set_tracking_uri(_tracking_uri)

# Alternative local backend, kept for reference:
#mlflow.set_tracking_uri("sqlite:///mlruns.db")

# Select the experiment by name (the name comes from the config file).
# The returned Experiment is the cell output.
_experiment_name = yaml_file.get('experiment_name')
if not _experiment_name:
    raise ValueError("'experiment_name' is missing or empty in the config file")
mlflow.set_experiment(_experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1711251631903, experiment_id='1', last_update_time=1711251631903, lifecycle_stage='active', name='prob_loan_exp', tags={}>

## 4.1 - Select best model

In [30]:
# Code to select the best model

In [40]:
# Get the id of the experiment used in this project.

_experiment = mlflow.get_experiment_by_name(yaml_file.get('experiment_name'))

# get_experiment_by_name returns None when no experiment has that name.
# Fail with a clear message instead of the opaque TypeError from dict(None).
if _experiment is None:
    raise ValueError(
        f"MLflow experiment {yaml_file.get('experiment_name')!r} was not found "
        "on the configured tracking server"
    )

# Kept as a dict so any code that indexes `current_experiment` keeps working.
current_experiment = dict(_experiment)

experiment_id = current_experiment['experiment_id']

# Display the id as the cell output.
experiment_id

'1'

In [44]:
# Fetch the runs table shown in the MLflow UI as a DataFrame.
# `experiment_ids` is documented as a list of ids, so wrap the single id.

mlflow_runs = mlflow.search_runs(experiment_ids=[experiment_id])

# Column used to rank the runs (presumably the cross-validated ROC AUC
# logged by the training step).
_metric_col = 'metrics.CV_roc_auc'

# With no runs, or no run that logged the metric, the column is absent or all
# NaN, and idxmax/.loc would fail with an unhelpful error. Check it up front.
if _metric_col not in mlflow_runs.columns:
    raise ValueError(
        f"No run in experiment {experiment_id!r} logged {_metric_col!r}"
    )

_scored_runs = mlflow_runs.dropna(subset=[_metric_col])
if _scored_runs.empty:
    raise ValueError(
        f"No run in experiment {experiment_id!r} has a value for {_metric_col!r}"
    )

# Run id of the row with the highest metric value.
best_run_id = _scored_runs.loc[_scored_runs[_metric_col].idxmax(), 'run_id']

best_run_id

'31a6bfe14ee947629e40fb9427df8b1a'

# Drafts (Rascunhos)

In [21]:
# Draft: get the experiment id for a hard-coded experiment name.
# NOTE(review): this reassigns `current_experiment` and `experiment_id`, the same
# names section 4.1 sets from the config file; running it after 4.1 changes which
# experiment later cells refer to.
# NOTE(review): get_experiment_by_name returns None for an unknown name, in which
# case dict(...) raises TypeError.

experiment_name = "prob_loan"
current_experiment=dict(mlflow.get_experiment_by_name(experiment_name))
experiment_id=current_experiment['experiment_id']
# Display the full experiment record (the saved output shows lifecycle_stage 'deleted').
current_experiment

{'artifact_location': 'mlflow-artifacts:/1',
 'creation_time': 1711210130178,
 'experiment_id': '1',
 'last_update_time': 1711212335290,
 'lifecycle_stage': 'deleted',
 'name': 'prob_loan',
 'tags': {}}

In [None]:
# Delete experiment (destructive).

import mlflow

# Guarded so that "Restart & Run All" cannot delete an experiment by accident.
# Set the flag to True deliberately, and check which experiment currently holds
# this id on the tracking server, before running the cell.
DELETE_EXPERIMENT = False

if DELETE_EXPERIMENT:
    # Experiment ids are strings in the MLflow API.
    mlflow.delete_experiment(experiment_id='1')

In [19]:
# Draft: list the experiments known to the tracking server.
from mlflow.tracking import MlflowClient
# Client created with default arguments.
# NOTE(review): presumably it picks up the tracking URI set earlier via mlflow.set_tracking_uri — confirm.
client = MlflowClient()
all_experiments = client.search_experiments()

# Display the list of Experiment objects as the cell output.
all_experiments

[<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1711251631903, experiment_id='1', last_update_time=1711251631903, lifecycle_stage='active', name='prob_loan_exp', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1711250708036, experiment_id='0', last_update_time=1711250708036, lifecycle_stage='active', name='Default', tags={}>]

In [14]:
# Draft: load a previously logged model by its run URI.
import mlflow
# URI format is 'runs:/<run_id>/<artifact_path>'. The run id is hard-coded and is
# not the `best_run_id` shown in the saved output of section 4.1.
logged_model = 'runs:/cab6bf40fbd04598b853dcdfc3a70580/lr_model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

In [15]:
# Display the loaded model's summary (artifact path, flavor, run id).
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: lr_model
  flavor: mlflow.sklearn
  run_id: cab6bf40fbd04598b853dcdfc3a70580

In [23]:
# Draft: get the MLflow UI runs table as a DataFrame.
# NOTE(review): the experiment id '1' is hard-coded here rather than taken from
# `experiment_id`; the documented argument type is a list of ids.

runs = mlflow.search_runs('1')

# Display the (truncated) runs table as the cell output.
runs

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.Val_roc_auc,metrics.CV_roc_auc,metrics.true_positives,metrics.log_loss,...,params.solver,params.max_iter,params.warm_start,tags.mlflow.runName,tags.mlflow.log-model.history,tags.mlflow.user,tags.mlflow.datasets,tags.model_name,tags.mlflow.source.name,tags.mlflow.source.type
0,11e3062b60574348ad33716682e13ae2,1,FINISHED,mlflow-artifacts:/1/11e3062b60574348ad33716682...,2024-03-24 19:54:13.664000+00:00,2024-03-24 19:54:23.257000+00:00,0.853043,0.850116,2346.0,0.637780,...,lbfgs,630,False,with_discretizer_hyperopt,"[{""run_id"": ""11e3062b60574348ad33716682e13ae2""...",guhduarte,"[{""name"":""2ebb0c36f4b380f19dac359abbc730ef"",""h...",lr_discretizer_hyperopt,/home/guhduarte/.conda/envs/MLFlow_Loan/lib/py...,LOCAL
1,4d0cf33873d848ae8c0ee7ee654b472e,1,FINISHED,mlflow-artifacts:/1/4d0cf33873d848ae8c0ee7ee65...,2024-03-24 19:54:03.497000+00:00,2024-03-24 19:54:13.608000+00:00,0.853221,0.850117,2345.0,0.637640,...,lbfgs,777,False,with_discretizer_hyperopt,"[{""run_id"": ""4d0cf33873d848ae8c0ee7ee654b472e""...",guhduarte,"[{""name"":""2ebb0c36f4b380f19dac359abbc730ef"",""h...",lr_discretizer_hyperopt,/home/guhduarte/.conda/envs/MLFlow_Loan/lib/py...,LOCAL
2,021043078a9a411187f01c12dd316f29,1,FINISHED,mlflow-artifacts:/1/021043078a9a411187f01c12dd...,2024-03-24 19:53:52.908000+00:00,2024-03-24 19:54:03.444000+00:00,0.851385,0.847558,2360.0,0.640328,...,lbfgs,818,False,with_discretizer_hyperopt,"[{""run_id"": ""021043078a9a411187f01c12dd316f29""...",guhduarte,"[{""name"":""2ebb0c36f4b380f19dac359abbc730ef"",""h...",lr_discretizer_hyperopt,/home/guhduarte/.conda/envs/MLFlow_Loan/lib/py...,LOCAL
3,32a05e75ae9b4cb6bfc4d1db4f9f26a5,1,FINISHED,mlflow-artifacts:/1/32a05e75ae9b4cb6bfc4d1db4f...,2024-03-24 19:53:41.737000+00:00,2024-03-24 19:53:52.858000+00:00,0.838979,0.835011,2875.0,0.675392,...,newton-cg,531,False,with_discretizer_hyperopt,"[{""run_id"": ""32a05e75ae9b4cb6bfc4d1db4f9f26a5""...",guhduarte,"[{""name"":""2ebb0c36f4b380f19dac359abbc730ef"",""h...",lr_discretizer_hyperopt,/home/guhduarte/.conda/envs/MLFlow_Loan/lib/py...,LOCAL
4,abfbce710d2949a8af6c7cb09c04e992,1,FINISHED,mlflow-artifacts:/1/abfbce710d2949a8af6c7cb09c...,2024-03-24 19:53:30.831000+00:00,2024-03-24 19:53:41.690000+00:00,0.798475,0.791855,132.0,0.204519,...,lbfgs,799,False,with_discretizer_hyperopt,"[{""run_id"": ""abfbce710d2949a8af6c7cb09c04e992""...",guhduarte,"[{""name"":""2ebb0c36f4b380f19dac359abbc730ef"",""h...",lr_discretizer_hyperopt,/home/guhduarte/.conda/envs/MLFlow_Loan/lib/py...,LOCAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,8b7d4484b2514f149797aa09c475c385,1,FINISHED,mlflow-artifacts:/1/8b7d4484b2514f149797aa09c4...,2024-03-24 19:38:50.058000+00:00,2024-03-24 19:38:58.436000+00:00,0.799371,0.792985,111.0,0.204775,...,lbfgs,637,True,with_discretizer_hyperopt,"[{""run_id"": ""8b7d4484b2514f149797aa09c475c385""...",guhduarte,"[{""name"":""2ebb0c36f4b380f19dac359abbc730ef"",""h...",lr_discretizer_hyperopt,/home/guhduarte/.conda/envs/MLFlow_Loan/lib/py...,LOCAL
98,0e8678dff9e14f70af4f697f10b8a70a,1,FINISHED,mlflow-artifacts:/1/0e8678dff9e14f70af4f697f10...,2024-03-24 19:38:40.859000+00:00,2024-03-24 19:38:50.017000+00:00,0.839000,0.834942,2875.0,0.675320,...,liblinear,267,False,with_discretizer_hyperopt,"[{""run_id"": ""0e8678dff9e14f70af4f697f10b8a70a""...",guhduarte,"[{""name"":""2ebb0c36f4b380f19dac359abbc730ef"",""h...",lr_discretizer_hyperopt,/home/guhduarte/.conda/envs/MLFlow_Loan/lib/py...,LOCAL
99,f5292869c43346babd23b7ea1f6afd00,1,FINISHED,mlflow-artifacts:/1/f5292869c43346babd23b7ea1f...,2024-03-24 19:38:30.422000+00:00,2024-03-24 19:38:40.815000+00:00,0.798674,0.792134,126.0,0.204535,...,newton-cg,330,True,with_discretizer_hyperopt,"[{""run_id"": ""f5292869c43346babd23b7ea1f6afd00""...",guhduarte,"[{""name"":""2ebb0c36f4b380f19dac359abbc730ef"",""h...",lr_discretizer_hyperopt,/home/guhduarte/.conda/envs/MLFlow_Loan/lib/py...,LOCAL
100,3e058eeb1ba145b3bf9eab7ccc11c0b8,1,FINISHED,mlflow-artifacts:/1/3e058eeb1ba145b3bf9eab7ccc...,2024-03-24 03:47:39.350000+00:00,2024-03-24 08:42:59.413000+00:00,0.798512,0.791916,132.0,0.204514,...,,,,with_discretizer,"[{""run_id"": ""3e058eeb1ba145b3bf9eab7ccc11c0b8""...",guhduarte,"[{""name"":""2ebb0c36f4b380f19dac359abbc730ef"",""h...",lr_discretizer,/home/guhduarte/.conda/envs/MLFlow_Loan/lib/py...,LOCAL


In [17]:
# Draft: get the experiment id from the experiment name.

from mlflow.tracking import MlflowClient

client = MlflowClient()

# NOTE(review): get_experiment_by_name returns None for an unknown name, in which
# case the attribute access raises AttributeError. The name is hard-coded here.
exp_id = client.get_experiment_by_name("prob_loan_exp").experiment_id

# Display the id as the cell output.
exp_id

'1'

In [77]:
# Shell escape: show the machine's memory usage in human-readable units.
!free -h

              total        used        free      shared  buff/cache   available
Mem:           12Gi       1.7Gi       9.7Gi       0.0Ki       993Mi        10Gi
Swap:         4.0Gi          0B       4.0Gi
