**Step 0: Libraries**

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings("ignore")

import joblib

**Step 1: Data loading, transformation and cleaning**

In [2]:
# Load and transform: read the survey export, keeping only the columns the
# model uses, and rename them to the Spanish names used in the rest of the
# notebook.
renombres = {
    "Employment": "Tipo_empleo",
    "RemoteWork": "Trabajo_remoto",
    "DevType": "Rol",
    "EdLevel": "Nivel_educativo",
    "YearsCodePro": "Anios_experiencia",
    "Country": "Pais",
    "OrgSize": "Tamano_empresa",
    "ConvertedCompYearly": "Salario_anual"
}

# `usecols` stops pandas from parsing and holding in memory any survey column
# other than the eight listed above, which would be discarded anyway.
df = pd.read_csv(
    "./data/survey_results_public.csv",
    usecols=list(renombres),
    low_memory=False,
)
df = df.rename(columns=renombres)

# `usecols` returns columns in file order, so fix the order explicitly: the
# feature matrix built later inherits its column order from this frame.
columnas_a_mantener = ['Tipo_empleo', 'Trabajo_remoto', 'Rol', 'Nivel_educativo', 'Anios_experiencia', 'Pais', 'Tamano_empresa', 'Salario_anual']
df = df[columnas_a_mantener]


In [3]:
# The two open-ended experience answers are text; map them to numeric strings
# so the whole column can be parsed as numbers.
etiquetas_extremas = {"Less than 1 year": "0", "More than 50 years": "51"}
experiencia_texto = df["Anios_experiencia"].replace(etiquetas_extremas)

# Anything that still fails to parse becomes NaN (errors="coerce").
df["Anios_experiencia"] = pd.to_numeric(experiencia_texto, errors="coerce")

# Treat the "I don’t know" company-size answer as a missing value.
df["Tamano_empresa"] = df["Tamano_empresa"].replace("I don’t know", pd.NA)

In [4]:
# Upper bound (inclusive) on Salario_anual; rows above it are discarded.
SALARIO_MAXIMO = 300000

# Drop every row with a missing value in any kept column, then apply the cap.
sin_nulos = df.dropna()
df = sin_nulos[sin_nulos["Salario_anual"] <= SALARIO_MAXIMO]

**Step 2: Encoding the 'Role', 'Country', 'Company Size' and remaining categorical columns**

In [5]:
# Role: a respondent can list several roles separated by ";". Build one
# indicator column per role and keep only the 15 most frequent ones.
roles_dummies = df["Rol"].str.get_dummies(sep=";")
top_roles = roles_dummies.sum().sort_values(ascending=False).head(15).index
df = pd.concat([df.drop(columns="Rol"), roles_dummies[top_roles]], axis=1)

# Country: keep the 15 most frequent countries and fold the rest into "Otro".
# `where` + `isin` is the vectorized equivalent of a row-wise membership test.
top_paises = df["Pais"].value_counts().head(15).index
df["Pais"] = df["Pais"].where(df["Pais"].isin(top_paises), "Otro")
df = pd.get_dummies(df, columns=["Pais"], drop_first=False)
# Without its own indicator, "Otro" is represented by all country columns
# being zero.
if "Pais_Otro" in df.columns:
    df = df.drop(columns=["Pais_Otro"])

# Remaining nominal columns: plain one-hot encoding.
df = pd.get_dummies(df, columns=["Tipo_empleo", "Trabajo_remoto", "Nivel_educativo"], drop_first=False)

# Company size is ordinal: encode each label as its position in this list.
orden_empresa = [
    "Just me - I am a freelancer, sole proprietor, etc.",
    "2 to 9 employees",
    "10 to 19 employees",
    "20 to 99 employees",
    "100 to 499 employees",
    "500 to 999 employees",
    "1,000 to 4,999 employees",
    "5,000 to 9,999 employees",
    "10,000 or more employees"
]
codigo_empresa = {etiqueta: posicion for posicion, etiqueta in enumerate(orden_empresa)}
tamano_codificado = df["Tamano_empresa"].map(codigo_empresa)

# A label missing from `orden_empresa` maps to NaN. Fail loudly and name the
# offending labels instead of letting NaN leak into the features.
sin_codigo = tamano_codificado.isna()
if sin_codigo.any():
    desconocidos = sorted(df.loc[sin_codigo, "Tamano_empresa"].astype(str).unique())
    raise ValueError(f"Unexpected company size label(s): {desconocidos}")
df["Tamano_empresa"] = tamano_codificado.astype("int64")

**Step 3: Casting boolean indicator columns to integers**

In [6]:
# Collect the columns stored as booleans and cast them all to int in one
# call, so every such column becomes 0/1.
columnas_bool = [nombre for nombre in df.columns if df[nombre].dtype == bool]
df = df.astype({nombre: int for nombre in columnas_bool})

**Step 4: Final cleaning and preparation**

In [7]:
# Preparar datos finales
X = df.drop(columns="Salario_anual")
X.columns = [re.sub(r"[^a-zA-Z0-9]", "_", col) for col in X.columns]
y = df["Salario_anual"]

**Step 5: Train/test split**

In [8]:
# Hold out a test partition; the fixed seed makes the split reproducible.
particiones = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = particiones

**Step 6: Hyperparameter grid definition**

In [None]:
# Hyperparameter grid passed to GridSearchCV. Each entry holds a single value,
# so the grid contains exactly one candidate configuration.
param_grid = dict(
    iterations=[800],
    learning_rate=[0.03],
    depth=[6],
    l2_leaf_reg=[3],
    bagging_temperature=[0.2],
)

**Step 7: GridSearchCV and CatBoost training**

In [None]:
# Base estimator: silent training output and a fixed seed.
model = CatBoostRegressor(random_state=42, verbose=0)

# Search settings: R² scoring, 3-fold cross-validation, per-fit progress
# messages, and n_jobs=-1 to run fits in parallel.
opciones_busqueda = dict(scoring="r2", cv=3, verbose=2, n_jobs=-1)
grid = GridSearchCV(model, param_grid, **opciones_busqueda)

grid.fit(X_train, y_train)

# Report the winning configuration (header line, then the parameter dict).
print("Mejor configuración encontrada:", grid.best_params_, sep="\n")


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Mejor configuración encontrada:
{'bagging_temperature': 0.2, 'depth': 6, 'iterations': 800, 'l2_leaf_reg': 3, 'learning_rate': 0.03}


**Step 8: Test evaluation**

In [10]:
# Test evaluation: score the estimator that the search exposes as
# `best_estimator_` on the held-out partition.
mejor_modelo = grid.best_estimator_
y_pred = mejor_modelo.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"R² en test con mejores hiperparámetros: {r2:.4f}")


R² en test con mejores hiperparámetros: 0.6001


**Step 9: Model saving**

In [11]:
# Persist the fitted estimator to disk. The model was fitted on the columns
# of X, so whatever loads it presumably must build the same columns in the
# same order - TODO confirm against the consumer of model.pkl.
joblib.dump(value=mejor_modelo, filename="model.pkl")

['model.pkl']