In [8]:
import json
import os
import time
from pathlib import Path
from urllib.parse import quote

import pandas as pd
from optuna import load_study

import preprocessing.preprocess_pipeline as pp
from preprocessing.pipeline_io import save_pipeline, load_pipeline
from train_catboost_optuna import train_and_save_best, detect_cat_cols

In [None]:
# Output directory for everything this notebook writes; created up front so
# later cells can write into it without checking for existence.
artifacts = Path('artifacts')
artifacts.mkdir(parents=True, exist_ok=True)

# Training data, read as JSON lines (one record per line).
TRAIN_DATA_PATH = 'train_data.jsonlines'

# Location where the fitted preprocessing pipeline is saved and reloaded from.
PIPELINE_PATH = artifacts / 'prep_v1'

# Columns removed from the raw frame right after loading.
COLS_TO_DROP_EARLY = ['subtitle','differential_pricing','international_delivery_mode','listing_source','site_id', 'coverage_areas']
# Columns parsed as datetimes while reading the JSON.
DATE_COLS = ['date_created','last_updated']
# Label column; it is binarised later as (value == 'new').
TARGET = "condition"

# Password for the Optuna storage database. It is read from the environment so
# the secret never has to be typed into (or committed with) the notebook; the
# fallback is the same empty string that used to be hardcoded here.
STORAGE_PASSWORD = os.environ.get('OPTUNA_STORAGE_PASSWORD', '')

In [None]:
# Read the JSON-lines training file (parsing DATE_COLS as datetimes) and
# immediately discard the columns listed in COLS_TO_DROP_EARLY.
df_train = (
    pd.read_json(TRAIN_DATA_PATH, lines=True, convert_dates=DATE_COLS)
    .drop(columns=COLS_TO_DROP_EARLY)
    .copy()
)

In [20]:
# Fit the feature pipeline on the training frame and persist it to PIPELINE_PATH.
# NOTE(review): df_train still contains the TARGET column at this point —
# confirm that FeaturePipeline does not derive features from it.
pipe = pp.FeaturePipeline()
pipe = pipe.fit(df_train)
save_pipeline(pipe, out_dir=PIPELINE_PATH)

In [21]:
# Reload the pipeline from disk (the second returned value is not used) so the
# features are produced by the persisted artifact rather than the in-memory one.
pipe_loaded, _ = load_pipeline(PIPELINE_PATH)

# Binary label: 1 where TARGET equals 'new', 0 otherwise.
y = (df_train[TARGET] == 'new').astype(int)
X = pipe_loaded.transform(df_train)

In [None]:
# Categorical feature names: whatever detect_cat_cols finds, plus a hand-picked
# list. dict.fromkeys removes duplicates while keeping first-seen order.
cat_cols = detect_cat_cols(X, exclude=[])
other_cat_cols = ['listing_type_id', 'category_level1', 'price_bin10', 'category_id', 'seller_id']
cat_cols = list(dict.fromkeys(cat_cols + other_cat_cols))

# Drop columns that contain no non-null value at all.
X = X.loc[:, X.notna().any(axis=0)].copy()

# Keep only the categorical names that still exist in X. Previously the list
# handed to training could name columns that were never produced by the
# pipeline or were just removed above as all-NaN.
cat_cols = [c for c in cat_cols if c in X.columns]
for c in cat_cols:
    X[c] = X[c].astype("object")

In [None]:
# Percent-encode the password before embedding it in the storage URL, so that
# characters such as '@', ':' or '/' cannot be mistaken for URL delimiters.
encoded_password = quote(STORAGE_PASSWORD, safe='')
study = load_study(
    study_name='meli_used_new_continuous_2025-11-08',
    storage=f'postgresql://postgres.jzhaeqvhtmyskmwhhfjz:{encoded_password}@aws-1-us-east-2.pooler.supabase.com:6543/postgres',
)

# Look the best trial up once and reuse it, instead of asking the study for
# best_trial / best_value repeatedly.
best_trial = study.best_trial
best_params = best_trial.params

# Persist the raw best parameters and a short description of the best trial.
(artifacts / "best_params.json").write_text(json.dumps(best_params, indent=2))
(artifacts / "study_best.json").write_text(json.dumps({
    "value": best_trial.value,
    "number": best_trial.number,
    "user_attrs": best_trial.user_attrs,
}, indent=2))

In [None]:
# Translate the study's "weighting_mode" choice into explicit CatBoost
# weighting parameters (with explicit Nones), then remove the helper key.
# An unrecognised mode only has the helper key removed.
if "weighting_mode" in best_params:
    wm = best_params.pop("weighting_mode")
    weighting_overrides = {
        "none": {"auto_class_weights": None, "scale_pos_weight": None},
        "auto_balanced": {"auto_class_weights": "Balanced", "scale_pos_weight": None},
        # Any existing "scale_pos_weight" entry is left untouched in this mode.
        "scale_pos_weight": {"auto_class_weights": None},
    }
    best_params.update(weighting_overrides.get(wm, {}))

In [None]:
# Bare expression: shows the current contents of best_params as the cell output.
best_params

In [None]:
# The seed comes from the wall clock, so it differs on every run. Keep it in a
# named variable and print it so a particular run can be reproduced later.
train_seed = int(time.time()) % 10_000_000
print("Training seed:", train_seed)

# Train with the tuned parameters; the returned summary dict is reported below.
summary = train_and_save_best(
    X=X,
    y=y,
    groups=None,
    cat_cols=cat_cols,
    best_params=best_params,
    n_splits=5,
    seed=train_seed,
    early_stopping_rounds=200,
    thread_count=-1,
    artifacts_dir=artifacts,
    log_every_iter=100,
)

In [None]:
# Print the headline numbers from the training summary.
print("\n=== Done (Training) ===")
print("CV mean accuracy:", summary["cv_metrics_mean"].get("accuracy"))
print(f"Threshold metric: {summary['threshold_metric']}")
print("Best threshold:", summary["best_threshold"], "Value:", summary["best_threshold_value"])

# Calibrated results are reported only when the summary carries them.
calibrated_metrics = summary["oof_metrics_calibrated"]
if calibrated_metrics is not None:
    print("OOF metrics (calibrated):", calibrated_metrics)
    print(
        "Best threshold (calibrated):", summary["best_threshold_calibrated"],
        "Value:", summary["best_threshold_calibrated_value"],
    )

# Optional entry: read with .get so a missing key is simply skipped.
segment_cols = summary.get("segment_threshold_cols")
if segment_cols:
    print("Segment thresholds learned for:", segment_cols)