# 0.0. Imports

In [1]:
import sys
import joblib 

sys.path.insert(0, '../src/')

from sklearn.pipeline import Pipeline
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler

from data.data_load import DataLoad
from data.data_validation import DataValidation
from data.data_transformation import DataTransformation
from utils.utils import load_config_file
from data.data_preprocess import DataPreprocess
from train.train import TrainModels
from sklearn.linear_model import LogisticRegression
from evaluation.classifier_eval import ModelEvaluation

# 1.0. Data load

In [2]:
# Load the dataset referenced by the name 'train_dataset_name' through the project's DataLoad helper.
# NOTE(review): how that name is resolved to a file is defined inside DataLoad, not in this notebook.
dl = DataLoad()
df = dl.load_data('train_dataset_name')

[2m2023-12-04 20:09:29[0m [[32m[1minfo     [0m] [1mComecando a carga dos dados com o nome: train_dataset_name[0m


# 2.0. Data validation

In [3]:
# Run the project's validation step on the loaded frame and keep its result.
# NOTE(review): `is_valid` is not checked in the cells shown here — confirm whether the
# notebook should stop when validation fails.
dv = DataValidation()
is_valid = dv.run(df)

[2m2023-12-04 20:09:30[0m [[32m[1minfo     [0m] [1mValidacao iniciou[0m
[2m2023-12-04 20:09:30[0m [[32m[1minfo     [0m] [1mValidation columns passed...[0m
[2m2023-12-04 20:09:30[0m [[32m[1minfo     [0m] [1mValidacao com sucesso.[0m


# 3.0. Data transformation

In [4]:
# Build the transformer from the loaded frame and unpack its four outputs into the
# train/validation features and targets used by the experiment cells below.
dt = DataTransformation(df)
X_train, X_valid, y_train, y_valid = dt.train_test_spliting()

# 4.0. Experiments

In [5]:
import mlflow 
from mlflow.tracking import MlflowClient


* 'schema_extra' has been renamed to 'json_schema_extra'


In [6]:
# Point MLflow at the tracking server on localhost:5000 and make 'prob_loan' the active experiment.
# The server must already be running; the last expression displays the selected experiment.
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('prob_loan')

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1701729825978, experiment_id='1', last_update_time=1701729825978, lifecycle_stage='active', name='prob_loan', tags={}>

## 4.1. Hyperparameters

In [8]:
from mlflow.models import MetricThreshold
from sklearn.dummy import DummyClassifier
from mlflow.models import infer_signature

In [9]:
from hyperopt import fmin, tpe, hp, STATUS_OK

In [12]:
# Preprocessing pipeline: impute -> equal-frequency discretisation -> standard scaling.
# The variable lists for the first two steps come from the project config file.
pipe = Pipeline(
    steps=[
        (
            'imputer',
            MeanMedianImputer(variables=load_config_file().get('vars_imputer')),
        ),
        (
            'discretizer',
            EqualFrequencyDiscretiser(variables=load_config_file().get('vars_discretizer')),
        ),
        (
            'scaler',
            SklearnTransformerWrapper(StandardScaler()),
        ),
    ]
)

In [14]:
def objective(params):
    """Hyperopt objective: run one LogisticRegression trial inside an MLflow run.

    Reads ``pipe``, ``X_train``, ``X_valid``, ``y_train`` and ``y_valid`` from the
    notebook namespace.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``LogisticRegression``; they are also logged
        as parameters of the MLflow run.

    Returns
    -------
    dict
        ``{'loss': <negated mean of the cross-validation scores>, 'status': STATUS_OK}``.
        The mean is negated because ``fmin`` minimises the loss.
    """
    # The context manager closes the run when the block exits (including on error),
    # so no explicit mlflow.end_run() call is needed.
    with mlflow.start_run(run_name='with_discretizer_hyperopt'):
        mlflow.set_tag('model_name', 'lr_hyperopt')
        mlflow.log_params(params)

        # 1.0. train the preprocessor on the training split and transform both splits
        preprocessador = DataPreprocess(pipe)
        preprocessador.train(X_train)

        X_train_processed = preprocessador.transform(X_train)
        X_valid_processed = preprocessador.transform(X_valid)
        joblib.dump(preprocessador, '../models/preprocess.joblib')

        ## 1.1. log the preprocessor as a run artifact
        mlflow.log_artifact('../models/preprocess.joblib')

        ## 1.2. log the preprocessing steps as run parameters
        mlflow.log_params(params={'imputer': pipe['imputer'],
                                  'discretizer': pipe['discretizer'],
                                  'scaler': pipe['scaler']})

        # 2.0. cross-validate the candidate on the processed training split
        model = LogisticRegression(**params)
        model_eval = ModelEvaluation(model,
                                     X_train_processed,
                                     y_train,
                                     n_splits=5)
        roc_auc_scores = model_eval.cross_val_evaluate()
        mean_cv_roc_auc = roc_auc_scores.mean()
        ## 2.1. log the cross-validation performance
        mlflow.log_metric('train_roc_auc', mean_cv_roc_auc)

        # 3.0. fit the model on the whole processed training split
        model.fit(X_train_processed, y_train)

        # 4.0. score the estimator fitted above (the same object that is logged below)
        #      on the validation split
        y_val_preds = model.predict_proba(X_valid_processed)[:, 1]
        val_roc_auc = model_eval.evaluate_predictions(y_valid, y_val_preds)

        mlflow.log_metric('valid_roc_auc', val_roc_auc)

        # 5.0. log the candidate model
        # NOTE(review): the candidate is logged without a signature while the baseline
        # below gets one — confirm this is intended.
        candidate_model_uri = mlflow.sklearn.log_model(model,
                                                       'lr_model').model_uri

        # 6.0. validate the candidate against a dummy baseline with mlflow.evaluate
        signature = infer_signature(X_valid_processed, y_valid)

        # Work on a copy so that adding the target column does not mutate the
        # processed validation features.
        eval_data = X_valid_processed.copy()
        eval_data['label'] = y_valid

        thresholds = {
            "accuracy_score": MetricThreshold(
                # NOTE(review): an earlier comment here said the accuracy must be > 0.7,
                # but the configured threshold is 0.1 — confirm which is intended.
                threshold=0.1,
                min_absolute_change=0.05,
                min_relative_change=0.05,
                greater_is_better=True
            )
        }

        baseline_model = DummyClassifier(strategy='uniform').fit(X_train_processed, y_train)
        baseline_model_uri = mlflow.sklearn.log_model(baseline_model,
                                                      "baseline_model",
                                                      signature=signature).model_uri

        # evaluate the logged candidate against the thresholds and the baseline
        mlflow.evaluate(candidate_model_uri,
                        eval_data,
                        targets="label",
                        model_type="classifier",
                        validation_thresholds=thresholds,
                        baseline_model=baseline_model_uri)

        return {'loss': -mean_cv_roc_auc,
                'status': STATUS_OK}

In [11]:
# Hyperopt search space for LogisticRegression; every key is passed to the estimator as a keyword argument.
search_space = {
    # categorical switches
    'warm_start': hp.choice('warm_start', [True, False]),
    'fit_intercept': hp.choice('fit_intercept', [True, False]),
    # continuous ranges: stopping tolerance and inverse regularisation strength
    'tol': hp.uniform('tol', 0.00001, 0.0001),
    'C': hp.uniform('C', 0.05, 3),
    # optimiser and iteration budget (any integer in [100, 1000))
    'solver': hp.choice('solver', ['newton-cg', 'lbfgs', 'liblinear']),
    'max_iter': hp.choice('max_iter', range(100, 1000)),
    # fixed value, not searched
    'multi_class': 'auto',
    'class_weight': hp.choice('class_weight', [None, 'balanced']),
}

In [None]:
# Run the TPE search for 5 trials. `objective` takes only the sampled hyperparameters and
# reads the train/validation splits from the notebook namespace, so it is handed to fmin
# directly (the previous lambda passed extra arguments and an undefined `X_val`).
best_result = fmin(fn=objective,
                   space=search_space,
                   algo=tpe.suggest,
                   max_evals=5)