# ⚖️ Comparativa de Modelos de Regresión para Predicción de Salario IT

Este notebook compara el rendimiento de distintos modelos de regresión usando el mismo preprocesamiento, datos y métricas:

Modelos comparados:
- LightGBM
- XGBoost
- CatBoost
- Ridge
- Random Forest

Métrica: Coeficiente de determinación R²


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings("ignore")


In [2]:
df = pd.read_csv("./data/survey_results_public.csv", low_memory=False)

columnas = {
    "Employment": "Tipo_empleo",
    "RemoteWork": "Trabajo_remoto",
    "DevType": "Rol",
    "EdLevel": "Nivel_educativo",
    "YearsCodePro": "Anios_experiencia",
    "Country": "Pais",
    "OrgSize": "Tamano_empresa",
    "ConvertedCompYearly": "Salario_anual"
}
df = df[list(columnas.keys())].rename(columns=columnas)

df = df[df["Salario_anual"].between(10000, 300000)]
df["Anios_experiencia"] = df["Anios_experiencia"].replace({
    "Less than 1 year": "0", "More than 50 years": "51"
})
df["Anios_experiencia"] = pd.to_numeric(df["Anios_experiencia"], errors="coerce")

top_paises = df["Pais"].value_counts().head(15).index
df["Pais"] = df["Pais"].apply(lambda x: x if x in top_paises else "Otro")

roles = df["Rol"].str.get_dummies(sep=";")
df = pd.concat([df.drop(columns=["Rol"]), roles], axis=1)
df = df.dropna()


In [3]:
df["Salario_anual_log"] = np.log1p(df["Salario_anual"])
df["Anios_experiencia_log"] = np.log1p(df["Anios_experiencia"])
df["Anios_experiencia_cuadrado"] = df["Anios_experiencia"] ** 2

orden_empresa = [
    "Just me - I am a freelancer, sole proprietor, etc.",
    "2 to 9 employees",
    "10 to 19 employees",
    "20 to 99 employees",
    "100 to 499 employees",
    "500 to 999 employees",
    "1,000 to 4,999 employees",
    "5,000 to 9,999 employees",
    "10,000 or more employees"
]
df = df[df["Tamano_empresa"].isin(orden_empresa)]
df["Tamano_empresa_ordinal"] = df["Tamano_empresa"].apply(lambda x: orden_empresa.index(x))


In [4]:
X = df.drop(columns=["Salario_anual", "Salario_anual_log", "Tamano_empresa"])
y = df["Salario_anual_log"]

cat_cols = ["Tipo_empleo", "Trabajo_remoto", "Pais", "Nivel_educativo"]
num_cols = [col for col in X.columns if col not in cat_cols]

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer([
    ("cat", cat_pipeline, cat_cols),
    ("num", num_pipeline, num_cols)
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
modelos = {
    "LightGBM": LGBMRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42, verbosity=0),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
    "Ridge": Ridge(),
    "RandomForest": RandomForestRegressor(random_state=42)
}

resultados = {}

for nombre, modelo in modelos.items():
    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", modelo)
    ])
    print(f"Entrenando {nombre}...")
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    score = r2_score(y_test, y_pred)
    resultados[nombre] = score
    print(f"R² en test ({nombre}): {score:.4f}")


Entrenando LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002198 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 346
[LightGBM] [Info] Number of data points in the train set: 16229, number of used features: 79
[LightGBM] [Info] Start training from score 11.112555
R² en test (LightGBM): 0.6201
Entrenando XGBoost...
R² en test (XGBoost): 0.6045
Entrenando CatBoost...
R² en test (CatBoost): 0.6218
Entrenando Ridge...
R² en test (Ridge): 0.5942
Entrenando RandomForest...
R² en test (RandomForest): 0.5424
