# 0. Imports

## 0.1 Bibliotecas

In [1]:
from IPython.display import display, HTML

In [2]:
import sys

sys.path.insert(0, '../src/')

import joblib
from sklearn.pipeline              import Pipeline
from sklearn.preprocessing         import StandardScaler
from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine.imputation     import MeanMedianImputer
from feature_engine.wrappers       import SklearnTransformerWrapper
from sklearn.linear_model          import LogisticRegression

from utils.utils               import load_config_file
from data.dataload             import DataLoad
from data.datavalidation       import DataValidation
from data.datatranformation    import DataTransformation
from data.datapreprocessing    import DataPreprocess
from train.train               import TrainModel
from evaluation.classfier_eval import ModelEvaluation

## 0.2 Funções de Ajuda

## 0.3 Configurações do Jupyter

In [3]:
# Deixar o jupyper em widescreen
display(HTML("<style>.container { width:90% !important; }</style>"))

# Seta o máximo de colunas e linhas que o pandas vai exibir
# pd.set_option('display.max_columns', 20)
# pd.set_option('display.max_rows', 60)

# 1. DataLoad

In [4]:
dl = DataLoad()
df = dl.load_data('train_dataset_name')
df.head(1)

Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0


# 2.Data Validation 

In [5]:
dv = DataValidation()
dv.run(dataframe=df)

2024-01-05 15:35:22 [info     ] Validação Concluída           


True

# 3. DataTransformation

In [17]:
dt = DataTransformation(df)
X_train, X_val, y_train, y_val = dt.train_test_spliting()

# 4. Experimentation

In [3]:
import mlflow
from mlflow.tracking import MlflowClient

In [4]:
mlflow.set_tracking_uri('http://127.0.0.1:5000/')
mlflow.set_experiment('prob_learng')

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1704470502137, experiment_id='1', last_update_time=1704470502137, lifecycle_stage='active', name='prob_learng', tags={}>

In [9]:
# with mlflow.start_run(run_name='baseline'):
#     mlflow.set_tag('model_name', 'lr_baseline')

#     # 1. Processar os dados
#     pipe = Pipeline([('imputer',  MeanMedianImputer(variables=load_config_file().get('vars_imputer'))),
#                      ('scaler', SklearnTransformerWrapper(StandardScaler())) ])
#     preprocessador = DataPreprocess(pipe=pipe)
#     preprocessador.train(X_train)

#     X_train = preprocessador.tranform(X_train)
#     X_val = preprocessador.tranform(X_val)
#     joblib.dump(preprocessador, '../models/preprocessador.joblib')

#     # 1.1 Logar um artefato

#     mlflow.log_artifacts('../models/preprocessador.joblib')

#     # 1.2 logar os parametros do processador

#     mlflow.log_params(params={'imputer': pipe['imputer'],
#                               'scaler' : pipe['scaler']})
    
#     # 2.0 inicia o experimento com cross validation

#     model = LogisticRegression()
#     model_eval = ModelEvaluation(model, X_train, y_train)
#     roc_auc_curve = model_eval.cross_val_eval()

#     # 2.1 Logar o Resultado da Performance

#     mlflow.log_metric('train_roc_auc', roc_auc_curve.mean())
    
#     # 3.0 Treinamento do Modelo

#     model.fit(X_train, y_train)

#     # 4.0 salvar as métricas de validação do modelo

#     y_pred = model_eval.model.predict_proba(X_val)[:, 1]
#     val_roc_curve = model_eval.evaluate_prediction(y_val, y_pred)
#     mlflow.log_metric('valid_roc_curve', val_roc_curve)

#      # 5.0 Logar o modelo criado
#     mlflow.sklearn.log_model(model,
#                              'lr_model',
#                              pyfunc_predict_fn='predict_proba')

#     mlflow.end_run()


## 4.1 Hyperparameters

In [5]:
from hyperopt import fmin, tpe, hp, STATUS_OK
from mlflow.models import MetricThreshold
from sklearn.dummy import DummyClassifier
from mlflow.models import infer_signature

In [6]:
pipe = Pipeline([('imputer',  MeanMedianImputer(variables=load_config_file().get('vars_imputer'))),
                 ('discretizer', EqualFrequencyDiscretiser(variables=load_config_file().get('vars_discritiser'))),
                 ('scaler', SklearnTransformerWrapper(StandardScaler())) ])

In [7]:
dl = DataLoad()
df = dl.load_data('train_dataset_name')
dt = DataTransformation(df)
X_train, X_val, y_train, y_val = dt.train_test_spliting()

def objective(params):
    with mlflow.start_run(run_name='with_discretizer_hyperopt'):
        mlflow.set_tag('model_name', 'lr_hyperopt')
        mlflow.log_params(params)
        
    
        preprocessador = DataPreprocess(pipe)
        preprocessador.train(X_train)
        
        X_train = preprocessador.transform(X_train)
        X_val = preprocessador.transform(X_val)
        joblib.dump(preprocessador, '../models/preprocess.joblib')
        
        ## 1.1. logar um artefato (preprocessador)
        mlflow.log_artifact('../models/preprocess.joblib')
        
        ## 1.2. logar os parametros do preprocessador
        mlflow.log_params(params={'imputer': pipe['imputer'],
                                'discretizer': pipe['discretizer'],
                                'scaler': pipe['scaler']})
        
        # 2.0. inicia o experimento com cross validatin
        model = LogisticRegression(**params)
        model_eval = ModelEvaluation(model,
                                    X_train,
                                    y_train,
                                    n_splits=5)
        roc_auc_scores = model_eval.cross_val_evaluate()
        ## 2.1. logar o resultado da performance
        mlflow.log_metric('train_roc_auc', roc_auc_scores.mean())
        
        # 3.0. treinar o modelo
        model.fit(X_train, y_train)
        
        # 4.0. salvar as metricas de validacao do modelo
        y_val_preds = model_eval.model.predict_proba(X_val)[:, 1]
        val_roc_auc = model_eval.evaluate_predictions(y_valid, y_val_preds)
        
        mlflow.log_metric('valid_roc_auc', val_roc_auc)
        
        # 5.0. logar o modelo
        candidate_model_uri = mlflow.sklearn.log_model(model,
                                'lr_model').model_uri
        
        ########
        signature = infer_signature(X_val, y_valid)
        
        eval_data = X_val
        eval_data['label'] = y_valid
        
        thresholds = {
            "accuracy_score": MetricThreshold(
                threshold=0.1, # o score da acuracia precisa ser >0.7
                min_absolute_change=0.05,
                min_relative_change=0.05, 
                greater_is_better=True
            )
        }
        
        baseline_model = DummyClassifier(strategy='uniform').fit(X_train, y_train)
        baseline_model_uri = mlflow.sklearn.log_model(baseline_model,
                                                    "baseline_model",
                                                    signature=signature).model_uri
        
        # processo responsavel por avaliar o modelo do mlflow
        mlflow.evaluate(candidate_model_uri,
                        eval_data,
                        targets="label",
                        model_type="classifier",
                        validation_thresholds=thresholds,
                        baseline_model=baseline_model_uri)
        
        
        mlflow.end_run()
        
        return {'loss': -roc_auc_scores.mean(),
                'status': STATUS_OK}

In [8]:
search_space = {'warm_start' : hp.choice('warm_start', [True, False]),
                'fit_intercept' : hp.choice('fit_intercept', [True, False]),
                'tol' : hp.uniform('tol', 0.00001, 0.0001),
                'C' : hp.uniform('C', 0.05, 3),
                'solver' : hp.choice('solver', ['newton-cg', 'lbfgs', 'liblinear']),
                'max_iter' : hp.choice('max_iter', range(100,1000)),
                'multi_class' : 'auto',
                'class_weight' : hp.choice('class_weight', [None, 'balanced'])}

In [22]:
best_result = fmin(fn=objective,
                   space=search_space,
                   algo=tpe.suggest,
                   max_evals=5)

  0%|                                                                            | 0/5 [00:00<?, ?trial/s, best loss=?]

job exception: cannot access local variable 'X_train' where it is not associated with a value



  0%|                                                                            | 0/5 [00:00<?, ?trial/s, best loss=?]


UnboundLocalError: cannot access local variable 'X_train' where it is not associated with a value