# 🚀 Mejora del modelo de predicción de salario en IT

Aplicamos varias estrategias para mejorar el modelo de predicción de salario en el sector tecnológico:
1. Filtrado de salarios extremos
2. Logaritmo del salario
3. Expansión de columnas multietiqueta
4. Agrupación de países
5. LightGBM con RandomizedSearchCV


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor
import warnings
# Installs a blanket "ignore" filter: from here on every warning issued through
# the `warnings` module is silenced, whatever its category or origin.
# NOTE(review): this also hides pandas / scikit-learn warnings that may point at
# real problems in later cells - consider narrowing it to specific categories.
warnings.filterwarnings("ignore")


In [4]:
# Survey columns kept for modelling, mapped to the Spanish names that the rest
# of the notebook uses.
columnas = dict(
    Employment="Tipo_empleo",
    RemoteWork="Trabajo_remoto",
    DevType="Rol",
    EdLevel="Nivel_educativo",
    YearsCodePro="Anios_experiencia",
    Country="Pais",
    OrgSize="Tamano_empresa",
    ConvertedCompYearly="Salario_anual",
)

# Read the full survey export, keep only the mapped columns (in mapping order)
# and rename them in one pass.
df = (
    pd.read_csv("./data/survey_results_public.csv")
    .loc[:, list(columnas)]
    .rename(columns=columnas)
)


In [5]:
# Salary bounds (same units as "Salario_anual") used to discard extreme records.
SALARIO_MIN = 10000
SALARIO_MAX = 300000

# between() is inclusive on both ends and evaluates to False for missing
# salaries, so rows without a salary are dropped here as well.
# .copy() detaches the filtered rows from the frame they were sliced from, so
# the column assignment below writes to an independent frame instead of a slice
# (assigning into a slice is what raises pandas' SettingWithCopyWarning).
df = df[df["Salario_anual"].between(SALARIO_MIN, SALARIO_MAX)].copy()

# Map the two textual buckets to numeric strings, then coerce the whole column
# to numbers; anything that still cannot be parsed becomes NaN.
df["Anios_experiencia"] = pd.to_numeric(
    df["Anios_experiencia"].replace(
        {"Less than 1 year": "0", "More than 50 years": "51"}
    ),
    errors="coerce",
)


In [6]:
# value_counts() sorts by descending frequency, so the first 15 index entries
# are the 15 most frequent countries.
top_paises = df["Pais"].value_counts().index[:15]

# Keep the label for those countries; every other value (missing ones included,
# since isin() is False for them) collapses into the "Otro" bucket.
df["Pais"] = df["Pais"].where(df["Pais"].isin(top_paises), "Otro")


In [7]:
# One 0/1 indicator column per distinct role found in the ";"-separated "Rol" strings.
roles = df["Rol"].str.get_dummies(sep=";")

# Swap "Rol" for its indicator columns, then discard every row that still has a
# missing value in any remaining column.
# NOTE(review): a row whose "Rol" is missing gets all-zero indicators, so it is
# not removed by dropna() on account of "Rol" - confirm this is intended.
df = pd.concat([df.drop(columns="Rol"), roles], axis="columns").dropna()


In [8]:
# Modelling target: log(1 + salary). Predictions are mapped back to salary
# units with np.expm1.
df = df.assign(Salario_anual_log=np.log1p(df["Salario_anual"]))


In [9]:
# Features: everything except the raw salary and its log transform (the target).
X = df.drop(columns=["Salario_anual", "Salario_anual_log"])
y = df["Salario_anual_log"]

# Numeric columns go to the numeric branch of the preprocessor; every other
# column is treated as categorical. Defining cat_cols as the complement of
# num_cols guarantees that each feature lands in exactly one list - a column
# present in neither list would be silently discarded by ColumnTransformer,
# whose default is remainder="drop".
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = [col for col in X.columns if col not in num_cols]


In [10]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" keeps transform() from failing on categories
# that were not present when the encoder was fitted.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numeric branch: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Route each column list to its branch; the step names "cat" and "num" are the
# prefixes under which the branches are addressed inside the transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", cat_pipeline, cat_cols),
        ("num", num_pipeline, num_cols),
    ]
)


In [11]:
# Hold out 20% of the rows for the final evaluation; the fixed seed makes the
# partition reproducible across runs.
particion = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = particion


In [12]:
# Search space for the LightGBM step; the "regressor__" prefix addresses the
# pipeline step named "regressor".
# NOTE(review): when max_depth is bounded, a tree cannot have more than
# 2**max_depth leaves, so e.g. num_leaves=100 with max_depth=5 is presumably
# equivalent to a smaller num_leaves - confirm against the LightGBM docs and
# consider pruning redundant combinations.
param_dist = {
    "regressor__n_estimators": [100, 200, 300],
    "regressor__max_depth": [5, 10, 15, -1],
    "regressor__learning_rate": [0.01, 0.05, 0.1, 0.2],
    "regressor__num_leaves": [20, 31, 50, 100],
    "regressor__min_child_samples": [10, 20, 30]
}

# Preprocessing and model live in one pipeline so that, inside cross-validation,
# the imputers / encoder / scaler are fitted on the training folds only.
# verbose=-1 silences LightGBM's per-fit [Info] log lines, which otherwise flood
# the cell output without adding information about the search.
pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", LGBMRegressor(random_state=42, verbose=-1))
])

# Randomised search: 20 sampled configurations x 3 folds = 60 fits, scored by R²
# on the log-salary target, run on all available cores.
search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=20,
    scoring="r2",
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

search.fit(X_train, y_train)
print("Mejores parámetros encontrados:", search.best_params_)
print("Mejor R² en validación cruzada:", search.best_score_)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002048 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 254
[LightGBM] [Info] Number of data points in the train set: 16446, number of used features: 86
[LightGBM] [Info] Start training from score 11.115059
Mejores parámetros encontrados: {'regressor__num_leaves': 100, 'regressor__n_estimators': 300, 'regressor__min_child_samples': 20, 'regressor__max_depth': 5, 'regressor__learning_rate': 0.05}
Mejor R² en validación cruzada: 0.5842103647580003


In [13]:
# best_estimator_ is the winning pipeline refitted on the whole training split.
mejor_modelo = search.best_estimator_
y_pred = mejor_modelo.predict(X_test)

# R² on the scale the model was trained on (log1p of the annual salary).
print("R² en test:", r2_score(y_test, y_pred))

# The same predictions mapped back to salary units with expm1 (the inverse of
# log1p). This is the figure that can be compared with a model trained directly
# on the raw salary; the log-scale R² above is not comparable with it.
print(
    "R² en test (escala original del salario):",
    r2_score(np.expm1(y_test), np.expm1(y_pred)),
)


R² en test: 0.6029019249504732
