# 🧠 Optimización Extrema del Modelo de Predicción de Salario en IT

Este notebook incluye todas las mejoras aplicables para maximizar el rendimiento del modelo:

- Refinamiento de hiperparámetros (GridSearchCV)
- Ingeniería de características avanzadas
- Conversión de variables ordinales
- Reducción de dimensionalidad
- Modelado con LightGBM optimizado


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings("ignore")


In [4]:
# Cargar el dataset
df = pd.read_csv("./data/survey_results_public.csv", low_memory=False)

# Renombrar columnas clave
columnas = {
    "Employment": "Tipo_empleo",
    "RemoteWork": "Trabajo_remoto",
    "DevType": "Rol",
    "EdLevel": "Nivel_educativo",
    "YearsCodePro": "Anios_experiencia",
    "Country": "Pais",
    "OrgSize": "Tamano_empresa",
    "ConvertedCompYearly": "Salario_anual"
}
df = df[list(columnas.keys())].rename(columns=columnas)

# Filtrar salarios extremos
df = df[df["Salario_anual"].between(10000, 300000)]

# Convertir experiencia a numérico
df["Anios_experiencia"] = df["Anios_experiencia"].replace({
    "Less than 1 year": "0", "More than 50 years": "51"
})
df["Anios_experiencia"] = pd.to_numeric(df["Anios_experiencia"], errors="coerce")

# Agrupar países poco frecuentes
top_paises = df["Pais"].value_counts().head(15).index
df["Pais"] = df["Pais"].apply(lambda x: x if x in top_paises else "Otro")

# Expandir columna multietiqueta 'Rol'
roles = df["Rol"].str.get_dummies(sep=";")
df = pd.concat([df.drop(columns=["Rol"]), roles], axis=1)
df = df.dropna()


In [13]:
roles

Unnamed: 0,Academic researcher,Blockchain,Cloud infrastructure engineer,Data engineer,Data or business analyst,Data scientist or machine learning specialist,Database administrator,Designer,DevOps specialist,Developer Advocate,...,Marketing or sales professional,Other (please specify):,Product manager,Project manager,Research & Development role,Scientist,Security professional,"Senior Executive (C-Suite, VP, etc.)",Student,System administrator
374,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
379,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
385,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
389,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
392,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41180,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41184,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
41185,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41186,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Transformar salario y generar nuevas variables
df["Salario_anual_log"] = np.log1p(df["Salario_anual"])
df["Anios_experiencia_log"] = np.log1p(df["Anios_experiencia"])
df["Anios_experiencia_cuadrado"] = df["Anios_experiencia"] ** 2


In [6]:
# Ordenar educación y tamaño de empresa
orden_educacion = [
    "Primary/elementary school",
    "Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)",
    "Some college/university study without earning a degree",
    "Associate degree (A.A., A.S., etc.)",
    "Bachelor’s degree (B.A., B.S., B.Eng., etc.)",
    "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",
    "Professional degree (JD, MD, etc.)",
    "Other doctoral degree (Ph.D., Ed.D., etc.)"
]
orden_empresa = [
    "Just me - I am a freelancer, sole proprietor, etc.",
    "2 to 9 employees",
    "10 to 19 employees",
    "20 to 99 employees",
    "100 to 499 employees",
    "500 to 999 employees",
    "1,000 to 4,999 employees",
    "5,000 to 9,999 employees",
    "10,000 or more employees"
]

df = df[df["Nivel_educativo"].isin(orden_educacion)]
df = df[df["Tamano_empresa"].isin(orden_empresa)]


In [7]:
X = df.drop(columns=["Salario_anual", "Salario_anual_log"])
y = df["Salario_anual_log"]

cat_cols = ["Tipo_empleo", "Trabajo_remoto", "Pais"]
ord_cols = ["Nivel_educativo", "Tamano_empresa"]
num_cols = ["Anios_experiencia", "Anios_experiencia_log", "Anios_experiencia_cuadrado"]
onehot_cols = cat_cols
ordinal_cols = ord_cols
numericas = num_cols + [col for col in X.columns if col not in cat_cols + ord_cols and X[col].dtype != "object"]


In [8]:
ordinal_pipeline = Pipeline([
    ("ordinal", OrdinalEncoder(categories=[orden_educacion, orden_empresa]))
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer([
    ("ordinal", ordinal_pipeline, ordinal_cols),
    ("categorical", cat_pipeline, onehot_cols),
    ("numeric", num_pipeline, numericas)
])


In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", LGBMRegressor(random_state=42))
])

param_grid = {
    "regressor__n_estimators": [100, 200],
    "regressor__max_depth": [10, 15],
    "regressor__learning_rate": [0.05, 0.1],
    "regressor__num_leaves": [31, 50],
    "regressor__min_child_samples": [20, 30]
}

search = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring="r2",
    cv=3,
    verbose=2,
    n_jobs=-1
)

search.fit(X_train, y_train)
print("Mejores parámetros:", search.best_params_)
print("Mejor R² en validación cruzada:", search.best_score_)


Fitting 3 folds for each of 32 candidates, totalling 96 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002020 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 488
[LightGBM] [Info] Number of data points in the train set: 15258, number of used features: 75
[LightGBM] [Info] Start training from score 11.113551
Mejores parámetros: {'regressor__learning_rate': 0.05, 'regressor__max_depth': 10, 'regressor__min_child_samples': 20, 'regressor__n_estimators': 200, 'regressor__num_leaves': 31}
Mejor R² en validación cruzada: 0.5934296716747887


In [10]:
# Evaluar en test
mejor_modelo = search.best_estimator_
y_pred = mejor_modelo.predict(X_test)
print("R² en test:", r2_score(y_test, y_pred))


R² en test: 0.6079110294378955
