**Paso 0: Librerías**

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings("ignore")

import joblib

**Paso 1: Limpieza avanzada y estandarización de variables**

In [None]:
# Load the survey sample; the path is relative to the notebook's working directory.
df = pd.read_csv("../data_sample/data_sample.csv")

# Rename the raw survey columns to the Spanish names used by the rest of the notebook.
df = df.rename(columns={
    "Employment": "TipoEmpleo",
    "RemoteWork": "TrabajoRemoto",
    "DevType": "Rol",
    "EdLevel": "NivelEducativo",
    "YearsCodePro": "AniosExperiencia",
    "Country": "Pais",
    "OrgSize": "TamanoEmpresa",
    "ConvertedCompYearly": "SalarioAnual"
})

# Keep only the model inputs plus the target. The explicit copy makes `df` an
# independent frame, so the column assignments below do not write into a slice
# of the loaded data (which pandas can flag as chained assignment).
columnas_utiles = [
    "TipoEmpleo", "TrabajoRemoto", "Rol", "NivelEducativo",
    "AniosExperiencia", "Pais", "TamanoEmpresa", "SalarioAnual"
]
df = df[columnas_utiles].copy()

# Map the two textual experience answers to numbers, then convert the column;
# anything that still cannot be parsed becomes NaN (errors="coerce").
df["AniosExperiencia"] = df["AniosExperiencia"].replace({
    "Less than 1 year": "0",
    "More than 50 years": "51"
})
df["AniosExperiencia"] = pd.to_numeric(df["AniosExperiencia"], errors="coerce")

# Keep the 15 most frequent countries; every other value (missing ones
# included) is bucketed as "Otro".
top_paises = df["Pais"].value_counts().head(15).index
df["Pais"] = df["Pais"].where(df["Pais"].isin(top_paises), "Otro")

def simplificar_tipo_empleo(valor):
    """Collapse an employment answer into one of the notebook's fixed labels.

    The answer may hold several options separated by ";". Matching is
    case-insensitive, and each option is classified on its own so that a
    "student, full-time" option counts as student rather than as full-time
    employment.

    Args:
        valor: Raw employment answer, or a missing value.

    Returns:
        None for missing input; otherwise the first label found, in this order
        of precedence: "JornadaCompleta", "MediaJornada", "Autonomo",
        "Estudiante", "Desempleado", "Jubilado". "Otro" when nothing matches.
    """
    if pd.isna(valor):
        return None

    encontradas = set()
    for opcion in str(valor).lower().split(";"):
        # Student and "not employed" are checked first: they qualify the
        # "full-time"/"part-time" wording instead of describing a job.
        if "student" in opcion:
            encontradas.add("Estudiante")
        elif "not employed" in opcion:
            encontradas.add("Desempleado")
        elif "full-time" in opcion:
            encontradas.add("JornadaCompleta")
        elif "part-time" in opcion:
            encontradas.add("MediaJornada")
        elif "freelancer" in opcion or "contractor" in opcion:
            encontradas.add("Autonomo")
        elif "retired" in opcion:
            encontradas.add("Jubilado")

    # Same precedence as the original chain of checks.
    prioridad = (
        "JornadaCompleta", "MediaJornada", "Autonomo",
        "Estudiante", "Desempleado", "Jubilado",
    )
    for etiqueta in prioridad:
        if etiqueta in encontradas:
            return etiqueta
    return "Otro"

# Translate the three known work-arrangement answers; any other value is
# left exactly as it is.
traduccion_remoto = {
    "Remote": "Remoto",
    "In-person": "Presencial",
    "Hybrid (some remote, some in-person)": "Hibrido"
}
df["TrabajoRemoto"] = df["TrabajoRemoto"].replace(traduccion_remoto)

# Reduce every employment answer to one of the simplified labels.
df["TipoEmpleo"] = df["TipoEmpleo"].map(simplificar_tipo_empleo)

def mapear_rol(rol_raw):
    """Return the normalised label for the first role listed in ``rol_raw``.

    Args:
        rol_raw: Role answer, possibly holding several roles separated by ";",
            or a missing value.

    Returns:
        The label mapped to the first role (surrounding whitespace ignored),
        or "Otro_rol" when the input is missing or the role is not in the table.
    """
    if pd.isna(rol_raw):
        return "Otro_rol"

    etiquetas_por_rol = {
        "Developer, full-stack": "Developer_Full_Stack",
        "Developer, back-end": "Developer_Back_End",
        "Developer, front-end": "Developer_Front_End",
        "Developer, desktop or enterprise applications": "Developer_Desktop_Enterprise",
        "Developer, mobile": "Developer_Mobile",
        "Developer, embedded applications or devices": "Developer_Embedded_Devices",
        "Data engineer": "Data_Engineer",
        "Engineering manager": "Engineering_Manager",
        "DevOps specialist": "DevOps_Specialist",
        "Data scientist or machine learning specialist": "Data_Scientist_ML",
        "Research & Development role": "Investigación_Desarrollo",
        "Academic researcher": "Investigador_Academico",
        "Cloud infrastructure engineer": "Cloud_Infrastructure_Engineer",
        "Senior Executive (C-Suite, VP, etc.)": "Senior_Executive",
    }

    # Only the text before the first ";" is considered.
    rol_principal, _, _ = rol_raw.partition(";")
    try:
        return etiquetas_por_rol[rol_principal.strip()]
    except KeyError:
        return "Otro_rol"

# Replace each role answer with the normalised label of its first listed role.
# (A stray bare `o` statement that followed this line raised NameError and has
# been removed.)
df["Rol"] = df["Rol"].apply(mapear_rol)
def simplificar_nivel(nivel):
    """Map an education-level answer to one of the notebook's short labels.

    The rules are checked in order and the first one whose fragment occurs in
    ``nivel`` (case-sensitive substring match) decides the label.

    Args:
        nivel: Raw education-level text, or a missing value.

    Returns:
        None for missing input, the label of the first matching rule, or
        "Otro_nivel_educativo" when no rule matches.
    """
    if pd.isna(nivel):
        return None

    reglas = (
        (("Bachelor",), "Grado_universitario"),
        (("Master",), "Master"),
        (("Professional", "Ph.D", "Doctoral"), "Doctorado"),
        (("Secondary",), "Secundaria"),
        (("Primary",), "Primaria"),
        (("Associate",), "Grado_medio"),
        (("Some college",), "Universidad_sin_titulo"),
    )
    for fragmentos, etiqueta in reglas:
        if any(fragmento in nivel for fragmento in fragmentos):
            return etiqueta
    return "Otro_nivel_educativo"

# Replace the education answers with their simplified labels.
df["NivelEducativo"] = df["NivelEducativo"].map(simplificar_nivel)

# Keep rows whose yearly salary is at most 300000; rows with a missing salary
# fail the comparison and are removed as well.
salario_dentro_del_limite = df["SalarioAnual"].le(300000)
df = df[salario_dentro_del_limite]


**Paso 1 (continuación): Tratamiento de valores desconocidos y eliminación de nulos**

In [None]:
# Treat "I don’t know" company sizes as missing so that the dropna() in the
# next cell discards those rows.
# AniosExperiencia is already numeric from the Paso 1 cell, so the duplicated
# text-to-number conversion that used to sit here (a no-op) has been removed.
df["TamanoEmpresa"] = df["TamanoEmpresa"].replace("I don’t know", pd.NA)

In [None]:
# Drop every row with a missing value, then keep salaries of at most 300000.
df = df.dropna().loc[lambda tabla: tabla["SalarioAnual"] <= 300000]

**Paso 2: Procesamiento de las columnas 'Rol', 'Pais' y 'TamanoEmpresa'**

In [None]:
# One indicator column per role label; keep the 15 most frequent ones.
roles_dummies = df["Rol"].str.get_dummies(sep=";")
top_roles = roles_dummies.sum().sort_values(ascending=False).head(15).index
df = pd.concat([df.drop(columns="Rol"), roles_dummies[top_roles]], axis=1)

# The cleaning cell buckets infrequent countries as "Otro". Rename that bucket
# to "Otro_pais" for two reasons:
#   * the dummies below are created without a prefix, so a country column
#     named "Otro" would collide with the "Otro" employment label;
#   * the bucket is the baseline category that is dropped after encoding.
df["Pais"] = df["Pais"].replace("Otro", "Otro_pais")

# Rank real countries only: the catch-all bucket must not take one of the 15
# slots and push an actual country into the dropped baseline.
top_paises = (
    df["Pais"]
    .value_counts()
    .drop(labels="Otro_pais", errors="ignore")
    .head(15)
    .index
)
df["Pais"] = df["Pais"].where(df["Pais"].isin(top_paises), "Otro_pais")

# Empty prefix: each dummy column is named after the category value itself.
df = pd.get_dummies(df, columns=["Pais", "TipoEmpleo", "TrabajoRemoto", "NivelEducativo"], prefix='', prefix_sep='')

# "Otro_pais" is the baseline country category and is not kept as a feature.
if "Otro_pais" in df.columns:
    df = df.drop(columns=["Otro_pais"])

def traducir_tamano_empresa(x):
    """Translate a company-size answer into its short Spanish label.

    Args:
        x: Company-size text.

    Returns:
        The translated label, or ``x`` itself when it is not in the table.
    """
    etiquetas = {
        "Just me - I am a freelancer, sole proprietor, etc.": "Freelance",
        "2 to 9 employees": "2-9_empleados",
        "10 to 19 employees": "10-19_empleados",
        "20 to 99 employees": "20-99_empleados",
        "100 to 499 employees": "100-499_empleados",
        "500 to 999 employees": "500-999_empleados",
        "1,000 to 4,999 employees": "1000-4999_empleados",
        "5,000 to 9,999 employees": "5000-9999_empleados",
        "10,000 or more employees": "+10000_empleados"
    }
    try:
        return etiquetas[x]
    except KeyError:
        # Unknown values pass through untouched.
        return x

# Replace every company-size answer with its short label (unknown values are kept).
df["TamanoEmpresa"] = df["TamanoEmpresa"].map(traducir_tamano_empresa)

**Paso 3: Codificación numérica de las columnas booleanas (dummies)**

In [None]:
# Cast every boolean column to int, one column at a time.
columnas_booleanas = df.select_dtypes(include="bool").columns
for nombre_columna in columnas_booleanas:
    df[nombre_columna] = df[nombre_columna].astype(int)

**Paso 4: Limpieza y preparación final**

In [None]:
# Target: the yearly salary column.
y = df["SalarioAnual"]

# Features: every other column, with each character outside [a-zA-Z0-9] in a
# column name replaced by "_".
X = df.drop(columns="SalarioAnual").rename(
    columns=lambda nombre: re.sub(r"[^a-zA-Z0-9]", "_", nombre)
)

**Paso 5: División en train/test**

In [None]:
# Hold out 25% of the rows for testing (scikit-learn's default when neither
# size is given); the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

**Paso 6: Definición del grid de hiperparámetros**

In [None]:
# Hyperparameter grid for the search. Every entry holds a single candidate,
# so the grid describes exactly one configuration.
param_grid = dict(
    iterations=[800],
    learning_rate=[0.03],
    depth=[6],
    l2_leaf_reg=[3],
    bagging_temperature=[0.2],
)

**Paso 7: GridSearchCV y entrenamiento de CatBoost**

In [None]:
# Base regressor: silent training, fixed seed.
model = CatBoostRegressor(random_state=42, verbose=0)

# 3-fold cross-validated search over `param_grid`, scored with R².
grid = GridSearchCV(
    model,
    param_grid,
    scoring="r2",
    n_jobs=-1,
    cv=3,
    verbose=2,
)

# "TamanoEmpresa" is still a text column, so it is declared to CatBoost as a
# categorical feature; the keyword is forwarded to the estimator's fit().
columnas_categoricas = ["TamanoEmpresa"]
grid.fit(X_train, y_train, cat_features=columnas_categoricas)

print("Mejor configuración encontrada:")
print(grid.best_params_)

**Paso 8: Evaluación contra test**

In [None]:
# Score the best estimator found by the search on the held-out test split.
mejor_modelo = grid.best_estimator_
y_pred = mejor_modelo.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"R² en test con mejores hiperparámetros: {r2:.4f}")

**Paso 9: Guardado de modelo**

In [None]:
# Persist the model together with the feature column names, as a
# (model, columns) tuple, so the training column order is stored with it.
columnas_modelo = list(X.columns)
artefacto = (mejor_modelo, columnas_modelo)
joblib.dump(artefacto, "model.pkl")