In [35]:
# Libreria Core del lab.
import numpy as np
import pandas as pd
import datetime
from IPython.display import HTML

# Libreria para plotear (En colab esta desactualizado plotly)
!pip install --upgrade plotly
import plotly.express as px
import plotly.graph_objects as go

# Librerias utiles
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, OneHotEncoder, FunctionTransformer

In [36]:
X_t0 = pd.read_csv('X_t0')
y_t0 = pd.read_csv('y_t0')
X_t1 = pd.read_csv('X_t1')
y_t1 = pd.read_csv('y_t1')
X_t2 = pd.read_csv('X_t2')
X_t3 = pd.read_csv('X_t3')




In [37]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
df_num = X_t0.select_dtypes(include=numerics) # df auxiliar para imprimir columnas numéricas

numerical_columns = list(df_num.columns)
numerical_columns

['DaysSinceJob',
 'CreditCap',
 'Speed24h',
 'AliveSession',
 'BankSpots8w',
 'HustleMinutes',
 'RiskScore',
 'AliasMatch',
 'DeviceEmails8w',
 'HustleMonth',
 'ZipHustle',
 'Speed4w',
 'income',
 'FreeMail',
 'HomePhoneCheck',
 'BankMonths',
 'DOBEmails4w',
 'ForeignHustle',
 'DeviceScams',
 'OldHoodMonths',
 'intended_balcon_amount',
 'NewCribMonths',
 'Speed6h',
 'CellPhoneCheck',
 'customer_age',
 'ExtraPlastic']

In [38]:
df_obj = X_t0.select_dtypes(include='object') # df auxiliar para imprimir columnas de objetos

categorical_columns = list(df_obj.columns)
categorical_columns

['JobStatus', 'CribStatus', 'LootMethod', 'InfoSource', 'DeviceOS']

In [39]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

# Pipelines para columnas numéricas y categóricas con VarianceThreshold
numeric_transformations = Pipeline([
    ('scaler', MinMaxScaler()),
    ('variance_threshold', VarianceThreshold(threshold=0)) # Para eliminar DeviceScams
])

categoric_transformations = Pipeline([
    ('category_one_hot', OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ('variance_threshold', VarianceThreshold(threshold=0))
])

# Preprocessor
preprocessor = ColumnTransformer(transformers=[
    ('numerical', numeric_transformations, numerical_columns),
    ('categorical', categoric_transformations, categorical_columns)
])

# Aplicar el preprocesamiento
df_pro = preprocessor.fit_transform(X_t0)

# Recuperar los nombres de las nuevas columnas
ohe_feature_names = preprocessor.named_transformers_['categorical'].named_steps['category_one_hot'].get_feature_names_out(categorical_columns)
final_columns = numerical_columns + list(ohe_feature_names)

# Dado que VarianceThreshold puede eliminar algunas características, necesitamos actualizar los nombres de las columnas
# Primero, debemos verificar cuántas características fueron eliminadas por VarianceThreshold
num_features_after_numeric_threshold = preprocessor.named_transformers_['numerical'].named_steps['variance_threshold'].get_support(indices=True)
num_features_after_categorical_threshold = preprocessor.named_transformers_['categorical'].named_steps['variance_threshold'].get_support(indices=True)

# Filtrar las columnas finales
final_columns_numeric = [numerical_columns[i] for i in num_features_after_numeric_threshold]
final_columns_categorical = [ohe_feature_names[i] for i in num_features_after_categorical_threshold]
final_columns = final_columns_numeric + final_columns_categorical

# Convertir el resultado a un DataFrame
df_pro = pd.DataFrame(df_pro, columns=final_columns)



In [40]:
best_params_hip={
'learning_rate': 0.06802202044451305,
'n_estimators': 667,
'max_depth': 5,
'max_leaves': 3,
'min_child_weight': 2,
'reg_alpha': 0.9031319828024397,
'reg_lambda': 0.4494141630778533,
'subsample': 0.5045063160719112,
'colsample_bytree': 0.83259315165401,
'gamma': 4.55839488003589,
'scale_pos_weight': 1.532161973191951
}

In [41]:
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, precision_recall_curve, auc

XGB_pipe_final_hip = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', **best_params_hip))
])

# Datos iniciales
X_train, X_test, y_train, y_test = train_test_split(X_t0, y_t0.loc[:, 'is_mob'], test_size=0.3, random_state=42)

# Entrenar el modelo inicial
XGB_pipe_final_hip.fit(X_train, y_train)

# Guardar el modelo inicial
XGB_pipe_final_hip.named_steps['classifier'].save_model('xgb_initial.model')

# Evaluar el modelo inicial
y_prob = XGB_pipe_final_hip.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, y_prob)
auc_pr = average_precision_score(y_test, y_prob)
print(f'AUC-PR del modelo inicial: {auc_pr:.2f}')



AUC-PR del modelo inicial: 0.16


In [42]:
# Supongamos que ya tienes X_t1 e y_t1 como tus nuevos datos
X_new_train, X_new_test, y_new_train, y_new_test = train_test_split(X_t1, y_t1.loc[:, 'is_mob'], test_size=0.3, random_state=42)

# Cargar el modelo inicial
model = xgb.Booster()
model.load_model('xgb_initial.model')

# Convertir los nuevos datos usando el preprocesador del pipeline
X_new_train_preprocessed = XGB_pipe_final_hip.named_steps['preprocessor'].transform(X_new_train)
X_new_test_preprocessed = XGB_pipe_final_hip.named_steps['preprocessor'].transform(X_new_test)

# Crear DMatrix para los nuevos datos
dtrain_new = xgb.DMatrix(X_new_train_preprocessed, label=y_new_train)
dtest_new = xgb.DMatrix(X_new_test_preprocessed, label=y_new_test)

# Reentrenar el modelo con los nuevos datos
params = XGB_pipe_final_hip.named_steps['classifier'].get_params()
params['use_label_encoder'] = False
params['eval_metric'] = 'logloss'
num_boost_round_new = 50
model = xgb.train(params, dtrain_new, num_boost_round=num_boost_round_new, xgb_model=model)

# Guardar el modelo actualizado
model.save_model('xgb_updated.model')

# Evaluar el modelo reentrenado
y_new_prob = model.predict(dtest_new)
precision_new, recall_new, _ = precision_recall_curve(y_new_test, y_new_prob)
auc_pr_new = average_precision_score(y_new_test, y_new_prob)
print(f'AUC-PR del modelo reentrenado: {auc_pr_new:.2f}')

Parameters: { "enable_categorical", "missing", "n_estimators", "use_label_encoder" } are not used.
"


AUC-PR del modelo reentrenado: 0.18




In [43]:
import joblib

# Guardar el pipeline completo después del entrenamiento inicial
XGB_pipe_final_hip.fit(X_train, y_train)
joblib.dump(XGB_pipe_final_hip, 'xgb_pipeline_initial.joblib')



['xgb_pipeline_initial.joblib']

In [44]:
# Cargar el pipeline completo
XGB_pipe_final_hip = joblib.load('xgb_pipeline_initial.joblib')

X_combined = pd.concat([X_t0, X_t1], axis=0)
y_combined = pd.concat([y_t0.loc[:, 'is_mob'], y_t1.loc[:, 'is_mob']], axis=0)

# Preprocesar los datos combinados
X_combined_preprocessed = XGB_pipe_final_hip.named_steps['preprocessor'].transform(X_combined)

# Crear DMatrix para los datos combinados
dtrain_combined = xgb.DMatrix(X_combined_preprocessed, label=y_combined)

# Obtener los parámetros del modelo inicial
params = XGB_pipe_final_hip.named_steps['classifier'].get_params()
params['use_label_encoder'] = False
params['eval_metric'] = 'logloss'

# Reentrenar el modelo con los datos combinados
num_boost_round_new = 50
model = xgb.train(params, dtrain_combined, num_boost_round=num_boost_round_new, xgb_model=model)

# Guardar el modelo actualizado en el pipeline
XGB_pipe_final_hip.named_steps['classifier'].save_model('xgb_updated.model')

# Guardar el pipeline completo nuevamente
joblib.dump(XGB_pipe_final_hip, 'xgb_pipeline_updated.joblib')

Parameters: { "enable_categorical", "missing", "n_estimators", "use_label_encoder" } are not used.
"


['xgb_pipeline_updated.joblib']

In [45]:

import mlflow

mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

In [46]:
import os
import joblib
import mlflow
import mlflow.xgboost
from sklearn.pipeline import Pipeline
import xgboost as xgb
import pandas as pd
import optuna
from optuna import Trial
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from mlflow.data.pandas_dataset import PandasDataset

def optimize_hyperparameters_with_mlflow(X, y, preprocessor):
    def objective2(trial: Trial):
        # Definir los hiperparámetros a ajustar
        xgb_params = {
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'max_leaves': trial.suggest_int('max_leaves', 0, 100),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.5, 10)
        }

        # Crear el pipeline
        XGB_pipe = Pipeline([
            ("preprocessor", preprocessor),
            ("classifier", XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', **xgb_params))
        ])

        # Validación cruzada
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        y_pred = cross_val_predict(XGB_pipe, X, y, cv=cv, method='predict_proba')[:, 1]

        # Calcular el AUC PR
        precision, recall, _ = precision_recall_curve(y, y_pred)
        auc_pr = auc(recall, precision)

        # Rastrear los hiperparámetros y la métrica con MLFlow
        mlflow.log_params(xgb_params)
        mlflow.log_metric("auc_pr", auc_pr)

        return auc_pr

    # Ejecutar la optimización con Optuna y MLFlow
    study = optuna.create_study(direction='maximize')
    with mlflow.start_run(run_name="optimize_hyperparameters", nested=True):
        study.optimize(objective2, n_trials=1)

    # Obtener el AUC PR y los mejores hiperparámetros encontrados
    best_auc = study.best_value
    best_params = study.best_params
    num_trials = len(study.trials)
    best_model = study.best_trial

    print(f'Número de trials: {num_trials}')
    print(f'AUC PR: {best_auc}')
    print('Mejores hiperparámetros encontrados:')
    for key, value in best_params.items():
        print(f'{key}: {value}')

    return best_params,best_auc

dataset_X_old = mlflow.data.from_pandas(X_old, name="data train X nueva")
                mlflow.log_input(dataset_X_old, context="training")
                dataset_y_old = mlflow.data.from_pandas(y_old, name="data train y nueva")
                mlflow.log_input(dataset_y_old, context="training")
                dataset_X_new = mlflow.data.from_pandas(X_new, name="data train X nueva")
                mlflow.log_input(dataset_X_new, context="training")
                dataset_y_new = mlflow.data.from_pandas(y_new, name="data train y nueva")
                mlflow.log_input(dataset_y_new, context="training")

In [47]:
from datetime import datetime

# Obtener la fecha y hora actual


# Crear el nombre del run


def retrain_model(X_old, y_old, X_new, y_new, preprocessor, model_path='xgb_pipeline.joblib'):
    current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    run_name = f"incremental_training_{current_time}"
    # Finalizar cualquier ejecución activa de MLFlow
    if mlflow.active_run():
        mlflow.end_run()

    # Combinar los datos antiguos y nuevos
    X_combined = pd.concat([X_old, X_new], axis=0)
    y_combined = pd.concat([y_old, y_new], axis=0)
    

    if not os.path.exists(model_path):
        with mlflow.start_run(run_name=run_name):
            try:
                # Initial training
                params,auclog = optimize_hyperparameters_with_mlflow(X_old, y_old, preprocessor)
                
                #mlflow.log_params(params)
                #mlflow.log_metric("aucpr", auclog)
                XGB_pipe = Pipeline([
                    ("preprocessor", preprocessor),
                    ("classifier", XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', **params))
                ])
                XGB_pipe.fit(X_combined, y_combined)
                joblib.dump(XGB_pipe, model_path)
                mlflow.autolog()

            finally:
                mlflow.end_run()
    else:
        with mlflow.start_run(run_name=run_name):
            try:
                # Cargar el pipeline completo
                XGB_pipe = joblib.load(model_path)
                classifier = XGB_pipe.named_steps['classifier']
                
                # Reentrenar el modelo con los datos combinados
                X_combined_preprocessed = XGB_pipe.named_steps['preprocessor'].transform(X_combined)
                dtrain_combined = xgb.DMatrix(X_combined_preprocessed, label=y_combined)
                params,auclog = optimize_hyperparameters_with_mlflow(X_combined, y_combined, preprocessor)
                #mlflow.log_params(params)
                #mlflow.log_metric("aucpr", auclog)
                
                classifier = xgb.train(params, dtrain_combined, num_boost_round=50, xgb_model=classifier.get_booster())
                XGB_pipe.named_steps['classifier'] = classifier
                joblib.dump(XGB_pipe, model_path)
                mlflow.autolog()
                mlflow.sklearn.log_model(XGB_pipe, "model_updated")
            finally:
                mlflow.end_run()

    return XGB_pipe

In [48]:
import mlflow
# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Create a new MLflow Experiment
mlflow.set_experiment("MLflow Quickstart2")

<Experiment: artifact_location='mlflow-artifacts:/843434259623179817', creation_time=1721284316795, experiment_id='843434259623179817', last_update_time=1721284316795, lifecycle_stage='active', name='MLflow Quickstart2', tags={}>

In [52]:
# Ejemplo de uso
XGB_pipe_final = retrain_model(X_t0, y_t0.loc[:, 'is_mob'], X_t1, y_t1.loc[:, 'is_mob'], preprocessor, model_path='xgb_pipeline_initial.joblib')

[I 2024-07-18 02:47:17,208] A new study created in memory with name: no-name-0608a415-c257-4358-aa8c-44a99a19566e
[W 2024-07-18 02:48:21,293] Trial 0 failed with parameters: {'learning_rate': 0.0019937331866392493, 'n_estimators': 698, 'max_depth': 5, 'max_leaves': 39, 'min_child_weight': 4, 'reg_alpha': 0.16459254353651964, 'reg_lambda': 0.09287101134852804, 'subsample': 0.9922143928936906, 'colsample_bytree': 0.7750900785648521, 'gamma': 4.736653473479915, 'scale_pos_weight': 9.185292402030328} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\jeus8\Laboratorios-MDS\0labs.venv\Lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\jeus8\AppData\Local\Temp\ipykernel_5472\2430157338.py", line 39, in objective2
    y_pred = cross_val_predict(XGB_pipe, X, y, cv=cv, method='predict_proba')[:, 1]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^

KeyboardInterrupt: 

In [50]:
def predapi(data):
    datadf=pd.read_csv(data)
    ypred=XGB_pipe_final.predict(datadf)
    ypred_df = pd.DataFrame(ypred, columns=["Prediction"])
    dataypred=datadf.copy()
    dataypred['label_predicted']=ypred
    return ypred_df

In [51]:
import gradio as gr

demo = gr.Interface(fn = predapi, # noten como estamos usando la función que generamos anteriormente
                    inputs=gr.File(type="filepath"), 
                    outputs=gr.DataFrame()) # valor de salida

demo.launch(share = True)

Running on local URL:  http://127.0.0.1:7868

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


