# 🚀 Comparativa de Estrategias con CatBoost

Este notebook aplica y compara dos estrategias para entrenar un modelo CatBoost que predice el salario anual (en escala logarítmica, `log1p`):

1. **CatBoost con preprocesamiento (OneHot + escalado) + GridSearchCV**
2. **CatBoost con categóricas nativas (sin OneHot ni escalado)**

📁 El dataset debe estar ubicado en `./data/survey_results_public.csv`


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor, Pool
import warnings
warnings.filterwarnings("ignore")


In [2]:
# Read the public survey CSV into the notebook's working DataFrame.
df = pd.read_csv(filepath_or_buffer="./data/survey_results_public.csv")

In [3]:
# Translate the survey's original column names into the Spanish names that
# every later cell refers to. Columns not listed here keep their names.
nombres_es = dict(
    Employment="Tipo_empleo",
    RemoteWork="Trabajo_remoto",
    DevType="Rol",
    EdLevel="Nivel_educativo",
    YearsCodePro="Anios_experiencia",
    Country="Pais",
    OrgSize="Tamano_empresa",
    ConvertedCompYearly="Salario_anual",
)
df = df.rename(columns=nombres_es)

In [4]:
# Preview the first rows to check that the renamed columns are in place.
df.head()

Unnamed: 0,ResponseId,MainBranch,Age,Tipo_empleo,Trabajo_remoto,Check,CodingActivities,Nivel_educativo,LearnCode,LearnCodeOnline,...,JobSatPoints_6,JobSatPoints_7,JobSatPoints_8,JobSatPoints_9,JobSatPoints_10,JobSatPoints_11,SurveyLength,SurveyEase,Salario_anual,JobSat
0,1,I am a developer by profession,Under 18 years old,"Employed, full-time",Remote,Apples,Hobby,Primary/elementary school,Books / Physical media,,...,,,,,,,,,,
1,2,I am a developer by profession,35-44 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Other...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
2,3,I am a developer by profession,45-54 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Other...,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,...,,,,,,,Appropriate in length,Easy,,
3,4,I am learning to code,18-24 years old,"Student, full-time",,Apples,,Some college/university study without earning ...,"Other online resources (e.g., videos, blogs, f...",Stack Overflow;How-to videos;Interactive tutorial,...,,,,,,,Too long,Easy,,
4,5,I am a developer by profession,18-24 years old,"Student, full-time",,Apples,,"Secondary school (e.g. American high school, G...","Other online resources (e.g., videos, blogs, f...",Technical documentation;Blogs;Written Tutorial...,...,,,,,,,Too short,Easy,,


In [5]:
# NOTE: salary-range filter kept disabled; uncomment to restrict the data to
# annual salaries between 10,000 and 300,000.
# df = df[df["Salario_anual"].between(10000, 300000)]

# Replace the two textual experience answers with numeric strings, then
# convert the column to numbers; anything still non-numeric becomes NaN.
experiencia_texto = {"Less than 1 year": "0", "More than 50 years": "51"}
df["Anios_experiencia"] = pd.to_numeric(
    df["Anios_experiencia"].replace(experiencia_texto),
    errors="coerce",
)

# Keep the 15 most frequent countries and collapse every other value
# (missing values included) into the single label "Otro".
top_paises = df["Pais"].value_counts().head(15).index
df["Pais"] = df["Pais"].where(df["Pais"].isin(top_paises), "Otro")


In [6]:
# (rows, columns) of the frame before any rows are dropped.
df.shape

(65437, 114)

In [None]:
# Restrict the frame to the columns the models actually use BEFORE dropping
# incomplete rows. Calling dropna() on the full survey frame discards a
# respondent for a gap in any of its columns, including ones that are never
# used as features, and it leaves unrelated text columns in `df` that the
# numeric pipeline and CatBoost cannot consume later on.
columnas_modelo = [
    "Tipo_empleo",
    "Trabajo_remoto",
    "Rol",
    "Nivel_educativo",
    "Anios_experiencia",
    "Pais",
    "Tamano_empresa",
    "Salario_anual",
]
df = df[columnas_modelo].dropna().copy()

# "Rol" may hold several ';'-separated values per respondent: expand it into
# one 0/1 indicator column per role and drop the original text column.
roles = df["Rol"].str.get_dummies(sep=";")
df = pd.concat([df.drop(columns="Rol"), roles], axis=1)

In [None]:
# Derived features: log1p of the target, plus log1p and square of the years
# of professional experience.
df = df.assign(
    Salario_anual_log=np.log1p(df["Salario_anual"]),
    Anios_experiencia_log=np.log1p(df["Anios_experiencia"]),
    Anios_experiencia_cuadrado=df["Anios_experiencia"] ** 2,
)

# Company-size answers listed from smallest to largest; the position in this
# list is used as the ordinal encoding.
orden = [
    "Just me - I am a freelancer, sole proprietor, etc.",
    "2 to 9 employees",
    "10 to 19 employees",
    "20 to 99 employees",
    "100 to 499 employees",
    "500 to 999 employees",
    "1,000 to 4,999 employees",
    "5,000 to 9,999 employees",
    "10,000 or more employees",
]

# Keep only rows whose company size is one of the known answers, then encode
# each answer as its index in `orden`.
tamano_conocido = df["Tamano_empresa"].isin(orden)
df = df.loc[tamano_conocido].copy()
df["Tamano_empresa_ordinal"] = df["Tamano_empresa"].map(orden.index)

# Snapshot for the native-categorical model
df_raw = df.copy()


## 🔧 CatBoost con preprocesamiento manual + GridSearchCV

In [None]:
# Target is the log-transformed salary. Features are everything except the
# raw target, its log version and the textual company size (the ordinal
# encoding of company size stays in X).
columnas_excluidas = ["Salario_anual", "Salario_anual_log", "Tamano_empresa"]
y = df["Salario_anual_log"]
X = df.drop(columns=columnas_excluidas)

print("Tamaño de X:", X.shape)
print("Columnas categóricas incluidas: Tipo_empleo, Trabajo_remoto, Pais, Nivel_educativo")


In [None]:
# Columns that are one-hot encoded.
cat_cols = ["Tipo_empleo", "Trabajo_remoto", "Pais", "Nivel_educativo"]

# Only numeric columns may go through median imputation and scaling.
# Selecting by dtype (instead of "everything that is not in cat_cols") keeps
# any stray text column in X from reaching SimpleImputer(strategy="median"),
# which rejects non-numeric data. Columns that are neither in cat_cols nor
# numeric are dropped by the ColumnTransformer (its default remainder).
num_cols = [
    col
    for col in X.select_dtypes(include="number").columns
    if col not in cat_cols
]

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are encoded as all zeros.
cat_pipeline = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

# Numeric branch: fill gaps with the median, then standardise.
num_pipeline = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer([
    ("cat", cat_pipeline, cat_cols),
    ("num", num_pipeline, num_cols)
])

# Preprocessing lives inside the pipeline so that it is re-fitted on each
# cross-validation training fold.
pipe = Pipeline([
    ("pre", preprocessor),
    ("regressor", CatBoostRegressor(verbose=0, random_state=42))
])

# Hyperparameter grid: 1 x 2 x 2 x 2 = 8 candidates, each fitted on 3 folds.
params = {
    "regressor__iterations": [500],
    "regressor__learning_rate": [0.03, 0.05],
    "regressor__depth": [6, 8],
    "regressor__l2_leaf_reg": [1, 3]
}

# Hold-out split with scikit-learn's default test size; the seed is fixed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

grid = GridSearchCV(pipe, params, cv=3, scoring="r2", verbose=2, n_jobs=-1)
grid.fit(X_train, y_train)

# R² is computed on the log-transformed target (y is Salario_anual_log).
print("Mejor configuración:", grid.best_params_)
print("R² en test (preprocesado):", r2_score(y_test, grid.predict(X_test)))


## 🧠 CatBoost con categóricas nativas (sin OneHot ni escalado)

In [None]:
# Features and target for the native-categorical model, taken from the
# snapshot saved before the one-hot strategy. The textual company size is
# kept here and handed to CatBoost as a categorical feature.
columnas_objetivo = ["Salario_anual", "Salario_anual_log"]
y_cb = df_raw["Salario_anual_log"]
X_cb = df_raw.drop(columns=columnas_objetivo)

# Columns CatBoost should treat as categorical (no manual encoding applied).
cat_features = ["Tipo_empleo", "Trabajo_remoto", "Pais", "Nivel_educativo", "Tamano_empresa"]

# Same seed as the split used for the preprocessed strategy.
X_train_cb, X_test_cb, y_train_cb, y_test_cb = train_test_split(
    X_cb, y_cb, random_state=42
)

pool_train = Pool(X_train_cb, y_train_cb, cat_features=cat_features)
pool_test = Pool(X_test_cb, y_test_cb, cat_features=cat_features)

# Fixed hyperparameters (no grid search in this strategy); training progress
# is logged every 100 iterations.
hiperparametros = dict(
    iterations=500,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3,
    random_state=42,
    verbose=100,
)
modelo_cb = CatBoostRegressor(**hiperparametros)

modelo_cb.fit(pool_train)
y_pred_cb = modelo_cb.predict(pool_test)

# R² is computed on the log-transformed target.
print("R² en test (CatBoost sin preprocesar):", r2_score(y_test_cb, y_pred_cb))
