In [1]:
import pandas as pd 

In [3]:
X0=pd.read_parquet('X_t0.parquet')
X1=pd.read_parquet('X_t1.parquet')
X2=pd.read_parquet('X_t2.parquet')
y0=pd.read_parquet('y_t0.parquet')
y1=pd.read_parquet('y_t1.parquet')

In [18]:
!pip install optuna




In [19]:
import optuna
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [12]:
ys = ['target']
binary_columns = ['market_ht_trendmode', 'unique_borrow_protocol_count', 'unique_lending_protocol_count']
id_columns = ['borrow_block_number']
wallet = ['wallet_address']
time_columns = [
    'risky_first_tx_timestamp', 'risky_last_tx_timestamp', 'risky_first_last_tx_timestamp_diff',
    'time_since_first_deposit', 'time_since_last_liquidated','borrow_timestamp', 'first_tx_timestamp', 'last_tx_timestamp']

# Columnas a escalar
columns_to_scale = list(X0.columns.difference(id_columns + binary_columns + time_columns + ys + wallet))

# Configurar el preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), columns_to_scale),
        ('passthrough', 'passthrough', id_columns + binary_columns + time_columns)
    ]
)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X0, y0, test_size=0.3, stratify=y0,random_state=17)

In [None]:
pipeline = Pipeline([
    ('Preprocessing', preprocessor),
    ('XGBoost', XGBClassifier())
])

pipeline.fit(X_train, y_train)

In [32]:
X0p = preprocessor.fit_transform(X0)

In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split
import optuna

# Define tu función objetivo para maximizar aucpr
def objective_function(trial):
    # Define los hiperparámetros a optimizar
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False])
    }

    # Divide los datos en entrenamiento y validación
    X_train, X_test, y_train, y_test = train_test_split(X0p, y0, test_size=0.3, stratify=y0, random_state=17)

    # Entrena el modelo
    model = RandomForestClassifier(
        random_state=17,
        **params
    )
    
    model.fit(X_train, y_train)
    
    # Realiza predicciones de probabilidad
    y_prob = model.predict_proba(X_test)[:, 1]

    # Calcula aucpr
    aucpr = average_precision_score(y_test, y_prob)

    return aucpr



In [52]:
study = optuna.create_study(direction="maximize")
study.optimize(objective_function, n_trials=50)
print("Mejores parámetros:")
print(study.best_trial.params)

print(f"Mejor AUC-PR alcanzado: {study.best_trial.value}")

[I 2024-12-05 20:59:51,766] A new study created in memory with name: no-name-475aae2b-a032-499c-bfbc-03c1ac55b2c9
  return fit_method(estimator, *args, **kwargs)
[I 2024-12-05 21:00:04,060] Trial 0 finished with value: 0.8993577737663956 and parameters: {'n_estimators': 118, 'max_depth': 6, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.8993577737663956.
  return fit_method(estimator, *args, **kwargs)
[I 2024-12-05 21:00:09,670] Trial 1 finished with value: 0.8594807600517772 and parameters: {'n_estimators': 129, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': False}. Best is trial 0 with value: 0.8993577737663956.
  return fit_method(estimator, *args, **kwargs)
[I 2024-12-05 21:01:58,193] Trial 2 finished with value: 0.8645629686019911 and parameters: {'n_estimators': 146, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_features': None, 'bo

Mejores parámetros:
{'n_estimators': 293, 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 6, 'max_features': None, 'bootstrap': True}
Mejor AUC-PR alcanzado: 0.9176663335114493


In [57]:
from sklearn.metrics import average_precision_score
from optuna.integration import XGBoostPruningCallback
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import optuna

# Define tu función objetivo para maximizar aucpr
def objective_function(trial):
    # Define los hiperparámetros a optimizar
    params = {
        "objective": "binary:logistic",  # Para clasificación binaria
        "eval_metric": "aucpr",  # Métrica usada durante el entrenamiento
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.1),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 7),
        "gamma": trial.suggest_float("gamma", 0, 1),
        "n_estimators": trial.suggest_int("n_estimators", 10, 300),
    }

    # Divide los datos en entrenamiento y validación
    X_train, X_test, y_train, y_test = train_test_split(X0p, y0, test_size=0.3, stratify=y0, random_state=17)

    # Entrena el modelo
    pruning_callback = XGBoostPruningCallback(trial, observation_key="validation_0-aucpr")  # Cambio de clave

    model = XGBClassifier(seed=17,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        early_stopping_rounds=10,
        callbacks=[pruning_callback], **params)

    # Entrenar el modelo
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=False
    )
    
    # Realiza predicciones de probabilidad
    y_prob = model.predict_proba(X_test)[:, 1]  # Tomamos la probabilidad de la clase positiva
    
    # Calcula AUC-PR
    aucpr = average_precision_score(y_test, y_prob)
    
    return aucpr

# Crear el estudio Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective_function, n_trials=200)

# Mostrar los mejores parámetros y AUC-PR alcanzado
print("Mejores parámetros:")
print(study.best_trial.params)

print(f"Mejor AUC-PR alcanzado: {study.best_trial.value}")

[I 2024-12-05 22:10:29,934] A new study created in memory with name: no-name-f423a09f-82cc-49e1-ba32-76f16808176e
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.1),
Parameters: { "eval_set" } are not used.

[I 2024-12-05 22:10:38,413] Trial 0 finished with value: 0.9654658630472769 and parameters: {'max_depth': 8, 'learning_rate': 0.09337870030465689, 'subsample': 0.7072729983555917, 'colsample_bytree': 0.7264884115076452, 'min_child_weight': 1, 'gamma': 0.9069145536274531, 'n_estimators': 245}. Best is trial 0 with value: 0.9654658630472769.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.1),
Parameters: { "eval_set" } are not used.

[I 2024-12-05 22:10:39,381] Trial 1 finished with value: 0.8876095155821412 and parameters: {'max_depth': 4, 'learning_rate': 0.0011823867201554074, 'subsample': 0.6012746888526785, 'colsample_bytree': 0.5064299543445077, 'min_child_weight': 7, 'gamma': 0.4328180569789657, 'n_estimators': 250}. Best is trial 0 

Mejores parámetros:
{'max_depth': 8, 'learning_rate': 0.09337870030465689, 'subsample': 0.7072729983555917, 'colsample_bytree': 0.7264884115076452, 'min_child_weight': 1, 'gamma': 0.9069145536274531, 'n_estimators': 245}
Mejor AUC-PR alcanzado: 0.9654658630472769


In [58]:
# Definir los hiperparámetros de XGBoost
params = {
    'max_depth': 8,
    'learning_rate': 0.09337870030465689,
    'subsample': 0.7072729983555917,
    'colsample_bytree': 0.7264884115076452,
    'min_child_weight': 1,
    'gamma': 0.9069145536274531,
    'n_estimators': 245
}
# Dividir los datos en entrenamiento y test
X_train, X_test, y_train, y_test = train_test_split(X0, y0, test_size=0.3, stratify=y0, random_state=17)

# Crear el pipeline
pipeline = Pipeline([
    ('Preprocessing', preprocessor),
    ('classifier', XGBClassifier(**params, random_state=17))
])

# Entrenar el modelo
pipeline.fit(X_train, y_train)

# Realizar predicciones de probabilidad
y_prob = pipeline.predict_proba(X_test)[:, 1]

# Calcular la métrica AUC PR
aucpr = average_precision_score(y_test, y_prob)

# Imprimir el resultado
print(f"AUC PR: {aucpr}")

AUC PR: 0.9664470892298799


In [60]:
import joblib
joblib.dump(pipeline, 'model_v1.pkl')

['model_v1.pkl']

In [61]:
# Cargar el modelo guardado
pipeline_new = joblib.load('model_v1.pkl')

# Re-entrenar el modelo con nuevos datos
pipeline_new.fit(X1, y1)

# Realizar nuevas predicciones con el modelo re-entrenado
y_prob_new = pipeline_new.predict_proba(X_test)[:, 1]

# Recalcular la métrica AUC PR después del re-entrenamiento
aucpr_new = average_precision_score(y_test, y_prob_new)
print(f"AUC PR (after re-training): {aucpr_new}")

AUC PR (after re-training): 0.8652992031281962


In [64]:
X1p = preprocessor.fit_transform(X1)

In [None]:


# Función objetivo para optimización de hiperparámetros
def objective(trial):
    # Definir los hiperparámetros a optimizar
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'eval_metric': 'aucpr',  # Definir explícitamente la métrica de evaluación
    }

    # Dividir los datos en entrenamiento y test
    X_train, X_test, y_train, y_test = train_test_split(X1p, y1, test_size=0.3, stratify=y1, random_state=17)
    
    # Crear el pruning callback
    pruning_callback = XGBoostPruningCallback(trial, observation_key="validation_0-aucpr")
    
    # Crear el modelo
    model = XGBClassifier(**params, random_state=17, early_stopping_rounds=10, eval_set=[(X_train, y_train), (X_test, y_test)], callbacks=[pruning_callback])

    # Entrenar el modelo
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)
    
    # Realiza predicciones de probabilidad
    y_prob = model.predict_proba(X_test)[:, 1]  # Tomamos la probabilidad de la clase positiva
    
    # Calcula AUC-PR
    aucpr = average_precision_score(y_test, y_prob)
    
    return aucpr

# Definir el estudio de Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)  # Realiza 200 pruebas de optimización

# Imprimir los mejores parámetros encontrados
print(f"Best hyperparameters: {study.best_params}")




[I 2024-12-05 22:39:44,421] A new study created in memory with name: no-name-c0b9a92b-c073-46ec-a278-1eae272089b3
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
Parameters: { "eval_set" } are not used.

[I 2024-12-05 22:40:01,754] Trial 0 finished with value: 0.8951240375354104 and parameters: {'max_depth': 10, 'learning_rate': 0.022481281929030508, 'subsample': 0.9123814456333867, 'colsample_bytree': 0.8400284872217928, 'min_child_weight': 1, 'gamma': 0.7667119597850548, 'n_estimators': 100}. Best is trial 0 with value: 0.8951240375354104.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
Parameters: { "eval_set" } are not used.

[I 2024-12-05 22:40:04,120] Trial 1 finished with value: 0.8348542840687522 and parameters: {'max_depth': 6, 'learning_rate': 0.0006488717576368633, 'subsample': 0.568407899778659, 'colsample_bytree': 0.7824723944681694, 'min_child_weight': 4, 'gamma': 0.0729607482110769, 'n_estimators': 248}. Best is trial 0

Best hyperparameters: {'max_depth': 12, 'learning_rate': 0.027125110612932147, 'subsample': 0.9707007698097644, 'colsample_bytree': 0.6918333714346909, 'min_child_weight': 1, 'gamma': 0.4746767211591132, 'n_estimators': 270}


ValueError: Specifying the columns using strings is only supported for dataframes.

In [70]:
# Entrenar el modelo final con los mejores parámetros
best_params = study.best_params
pipeline = Pipeline([
    ('Preprocessing', preprocessor),
    ('classifier', XGBClassifier(**best_params, random_state=17))
])

# Entrenamiento final
pipeline.fit(X_train, y_train)

# Guardar el modelo reentrenado
joblib.dump(pipeline, 'model_retrained.pkl')


['model_retrained.pkl']

In [71]:
y_prob_new2 = pipeline.predict_proba(X_test)[:, 1]

# Recalcular la métrica AUC PR después del re-entrenamiento
aucpr_new = average_precision_score(y_test, y_prob_new2)
print(f"AUC PR (after re-training): {aucpr_new}")

AUC PR (after re-training): 0.964391042642574


# Exportar para codalab

In [72]:
from zipfile import ZipFile
import os
def generateFiles(predict_data, clf_pipe):
    """Genera los archivos a subir en CodaLab

    Input
    ---------------
    predict_data: Dataframe con los datos de entrada a predecir
    clf_pipe: pipeline del clf

    Ouput
    ---------------
    archivo de txt
    """
    y_pred_clf = clf_pipe.predict_proba(predict_data)[:, 1]
    with open('./predictions.txt', 'w') as f:
        for item in y_pred_clf:
            f.write("%s\n" % item)
    
    with ZipFile('predictions.zip', 'w') as zipObj:
        zipObj.write('predictions.txt')
    os.remove('predictions.txt')

generateFiles(X2, pipeline)