# 🧪 Optimización de CatBoost - Dos Estrategias (Corregido Final)

Este notebook compara dos estrategias para optimizar CatBoost:

1. 🧱 CatBoost con preprocesamiento (OneHot + escalado) y GridSearchCV  
2. 🔥 CatBoost con manejo nativo de categóricas (sin preprocesar)

Se conserva `Tamano_empresa` para evitar errores con columnas categóricas.


In [2]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor, Pool
import warnings
warnings.filterwarnings("ignore")

# --- CARGA Y LIMPIEZA ---
df = pd.read_csv("./data/survey_results_public.csv")
columnas = {
    "Employment": "Tipo_empleo",
    "RemoteWork": "Trabajo_remoto",
    "DevType": "Rol",
    "EdLevel": "Nivel_educativo",
    "YearsCodePro": "Anios_experiencia",
    "Country": "Pais",
    "OrgSize": "Tamano_empresa",
    "ConvertedCompYearly": "Salario_anual"
}
df = df[list(columnas.keys())].rename(columns=columnas)
df = df[df["Salario_anual"].between(10000, 300000)]
df["Anios_experiencia"] = df["Anios_experiencia"].replace({
    "Less than 1 year": "0", "More than 50 years": "51"
})
df["Anios_experiencia"] = pd.to_numeric(df["Anios_experiencia"], errors="coerce")
top_paises = df["Pais"].value_counts().head(15).index
df["Pais"] = df["Pais"].apply(lambda x: x if x in top_paises else "Otro")
roles = df["Rol"].str.get_dummies(sep=";")
df = pd.concat([df.drop(columns=["Rol"]), roles], axis=1)
df = df.dropna()
df["Salario_anual_log"] = np.log1p(df["Salario_anual"])
df["Anios_experiencia_log"] = np.log1p(df["Anios_experiencia"])
df["Anios_experiencia_cuadrado"] = df["Anios_experiencia"] ** 2
orden_empresa = [
    "Just me - I am a freelancer, sole proprietor, etc.",
    "2 to 9 employees",
    "10 to 19 employees",
    "20 to 99 employees",
    "100 to 499 employees",
    "500 to 999 employees",
    "1,000 to 4,999 employees",
    "5,000 to 9,999 employees",
    "10,000 or more employees"
]
df = df[df["Tamano_empresa"].isin(orden_empresa)]
df["Tamano_empresa_ordinal"] = df["Tamano_empresa"].apply(lambda x: orden_empresa.index(x))

# Guardar copia completa ANTES de eliminar columnas para método 2
df_raw = df.copy()

# --- MÉTODO 1: CATBOOST + PREPROCESADO + GRIDSEARCH ---
X = df.drop(columns=["Salario_anual", "Salario_anual_log", "Tamano_empresa"])
y = df["Salario_anual_log"]
cat_cols = ["Tipo_empleo", "RemoteWork", "Pais", "Nivel_educativo"]
num_cols = [col for col in X.columns if col not in cat_cols]
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
preprocessor = ColumnTransformer([
    ("cat", cat_pipeline, cat_cols),
    ("num", num_pipeline, num_cols)
])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", CatBoostRegressor(verbose=0, random_state=42))
])

param_grid = {
    "regressor__iterations": [500],
    "regressor__learning_rate": [0.03, 0.05],
    "regressor__depth": [6, 8],
    "regressor__l2_leaf_reg": [1, 3],
    "regressor__bagging_temperature": [0.5, 1]
}

grid = GridSearchCV(pipe, param_grid, cv=3, scoring="r2", verbose=2, n_jobs=-1)
grid.fit(X_train, y_train)

print("📊 Mejor configuración CatBoost (preprocesado):")
print(grid.best_params_)
print("R² en test:", r2_score(y_test, grid.predict(X_test)))

# --- MÉTODO 2: CATBOOST NATIVO ---
X_cb = df_raw.drop(columns=["Salario_anual", "Salario_anual_log"])
y_cb = df_raw["Salario_anual_log"]
cat_features = ["Tipo_empleo", "Trabajo_remoto", "Pais", "Nivel_educativo", "Tamano_empresa"]

X_train_cb, X_test_cb, y_train_cb, y_test_cb = train_test_split(
    X_cb, y_cb, test_size=0.2, random_state=42
)

train_pool = Pool(data=X_train_cb, label=y_train_cb, cat_features=cat_features)
test_pool = Pool(data=X_test_cb, label=y_test_cb, cat_features=cat_features)

modelo_cb = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3,
    random_state=42,
    verbose=100
)

modelo_cb.fit(train_pool)
y_pred_cb = modelo_cb.predict(test_pool)
print("R² CatBoost sin preprocesar:", r2_score(y_test_cb, y_pred_cb))


Fitting 3 folds for each of 16 candidates, totalling 48 fits


ValueError: 
All the 48 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
48 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\javid\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\indexes\base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'RemoteWork'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\javid\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_indexing.py", line 364, in _get_column_indices
    col_idx = all_columns.get_loc(col)
              ^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\javid\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\indexes\base.py", line 3812, in get_loc
    raise KeyError(key) from err
KeyError: 'RemoteWork'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\javid\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\javid\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\javid\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 652, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\javid\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 586, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\javid\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\javid\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 1540, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\javid\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\javid\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\javid\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\compose\_column_transformer.py", line 992, in fit_transform
    self._validate_column_callables(X)
  File "c:\Users\javid\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\compose\_column_transformer.py", line 551, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\javid\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_indexing.py", line 372, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe
