In [None]:
import optuna
import mlflow
import pandas as pd
import numpy as np
import metrics

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score

In [None]:
# Location of the ';'-separated input file. The path is blank here and has to
# be filled in before this cell can run.
csv_path = r""
base = pd.read_csv(csv_path, sep=";")

In [None]:
# `dt` is a second name for the same DataFrame object as `base` (no copy is
# made), so in-place changes to `base` are visible through `dt` as well.
dt = base
dt

In [None]:
# Columns to discard from the raw frame before modelling (none listed yet).
base.drop(columns=[], inplace=True)

In [None]:
# Names of the columns kept as model features; X_train and X_test are later
# subset to exactly this list. Left empty here - populate it before fitting.
vars_selected = []

In [None]:
# Partition the rows by the `destino` flag and pull out the `label` target
# for each partition.
is_train = base["destino"] == "Train"
is_test = base["destino"] == "Test"

X_train, y_train = base[is_train], base.loc[is_train, "label"]
X_test, y_test = base[is_test], base.loc[is_test, "label"]

In [None]:
# Remove the partition flag and the target from the feature frames. Rebinding
# the result (rather than dropping in place) avoids pandas'
# SettingWithCopyWarning, since both frames are filtered selections of `base`.
X_train = X_train.drop(columns=["destino", "label"])
X_test = X_test.drop(columns=["destino", "label"])

In [None]:
# Keep only the chosen feature columns, in the order given by `vars_selected`.
X_train, X_test = X_train.loc[:, vars_selected], X_test.loc[:, vars_selected]

# AdaBoost

In [None]:
from sklearn import metrics

In [None]:
def objective(trial):
    """Optuna objective: fit an AdaBoost classifier and return a loss to minimize.

    Samples ``n_estimators`` and ``learning_rate`` from ``trial``, fits the
    classifier on the notebook-level ``X_train``/``y_train``, scores it on the
    train and test partitions, and logs the parameters and AUC metrics to a
    nested MLflow run.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        ``(1 - roc_auc_train) + (roc_auc_train - roc_auc_test)``.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1, 300),
        # AdaBoost requires learning_rate > 0, so the lower bound of the
        # search range must be strictly positive.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1),
    }

    # The base learner is left at the library default; the explicit
    # `base_estimator=None` argument was redundant and is rejected by
    # scikit-learn releases that removed that parameter.
    clf = AdaBoostClassifier(random_state=42, **params)
    clf.fit(X_train, y_train)

    # Positive-class probabilities, computed once per partition and reused
    # by every metric below.
    probs_train = clf.predict_proba(X_train)[:, 1]
    probs_test = clf.predict_proba(X_test)[:, 1]

    roc_auc_train = roc_auc_score(y_train, probs_train)
    roc_auc_test = roc_auc_score(y_test, probs_test)

    # Same AUCs obtained from the explicit ROC curves; kept because they are
    # logged under their own metric names.
    fpr_train, tpr_train, _ = metrics.roc_curve(y_train, probs_train)
    fpr_test, tpr_test, _ = metrics.roc_curve(y_test, probs_test)
    train_auc = metrics.auc(fpr_train, tpr_train)
    test_auc = metrics.auc(fpr_test, tpr_test)

    # NOTE(review): the train AUC cancels algebraically, so this equals
    # 1 - roc_auc_test (up to float rounding). The train/test gap is therefore
    # not penalised, and the search is tuned directly on the test partition.
    loss = (1 - roc_auc_train) + (roc_auc_train - roc_auc_test)

    # One nested run per trial, grouped under the run opened by the caller.
    with mlflow.start_run(nested=True):
        mlflow.log_params(params)
        mlflow.log_metric('roc_auc_train', roc_auc_train)
        mlflow.log_metric('roc_auc_test', roc_auc_test)
        mlflow.log_metric('score_train_auc', train_auc)
        mlflow.log_metric('score_test_auc', test_auc)

    return loss

In [None]:
# 460 minutes of run time for the routine

In [None]:
# Parent MLflow run: the runs opened inside `objective` are nested under it.
with mlflow.start_run() as run:
    # Print a filter string that selects this run's children by parent run id.
    print(f"tags.mlflow.parentRunId = '{run.info.run_id}'")

    # Seeded random search that minimizes the value returned by `objective`.
    sampler = optuna.samplers.RandomSampler(seed=15)
    study = optuna.create_study(direction='minimize', sampler=sampler)
    study.optimize(objective, show_progress_bar=True, n_trials=300)

In [None]:
# Hyperparameters of the best trial in the study.
study.best_params

In [None]:
# Final model with fixed hyperparameters (presumably copied from
# `study.best_params` - confirm). random_state matches the value used inside
# `objective`, so the refit is reproducible and comparable with the tuned trial.
clf = AdaBoostClassifier(
    n_estimators=277,
    learning_rate=0.8756659420813518,
    random_state=42,
)

In [None]:
# Fit the classifier on the training partition.
clf.fit(X_train, y_train)

In [None]:
# Column 0 of predict_proba on the training features, as a Series with a
# fresh 0..n-1 index.
# NOTE(review): every other cell scores with column 1 (`[:, 1]`); confirm that
# column 0 is really the one intended here.
y_pred = pd.DataFrame(clf.predict_proba(X_train))[0]

In [None]:
# Score each partition with the column-1 probabilities, then build the ROC
# curves (false-positive rate, true-positive rate, thresholds).
train_scores = clf.predict_proba(X_train)[:, 1]
fpr1, tpr1, threshold1 = metrics.roc_curve(y_train, train_scores)

test_scores = clf.predict_proba(X_test)[:, 1]
fpr2, tpr2, threshold2 = metrics.roc_curve(y_test, test_scores)

In [None]:
# Area under each ROC curve, train first and then test.
for caption, (fpr, tpr) in (
    ("train_auc =", (fpr1, tpr1)),
    ("test_auc =", (fpr2, tpr2)),
):
    print(caption, metrics.auc(fpr, tpr))

In [None]:
# Objective value per trial over the course of the search.
history_fig = optuna.visualization.plot_optimization_history(study)
history_fig.show()

In [None]:
# Persist the Optuna study to "studyada.pkl", relative to the current working
# directory.
import joblib
joblib.dump(study,"studyada.pkl")

In [None]:
# Reload the study from the same relative path the dump cell writes to, instead
# of an absolute path that only exists on one user's machine.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# load files produced by this notebook or another trusted source.
modada = joblib.load("studyada.pkl")

In [None]:
# Parallel-coordinate plot of the trials in the reloaded study.
import optuna
optuna.visualization.plot_parallel_coordinate(modada)

In [None]:
# Hyperparameter-importance plot for the reloaded study.
optuna.visualization.plot_param_importances(modada)