# 💼 Proyecto de Machine Learning - Predicción de salario en IT (Modelos avanzados)

Este proyecto tiene como objetivo aplicar técnicas de Machine Learning para predecir el salario anual (`Salario_anual`) de profesionales tecnológicos a partir de sus características personales y laborales.

Se utilizan modelos avanzados como **Random Forest**, **CatBoost** y **LightGBM**, integrados en pipelines con preprocesamiento completo.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor


In [2]:
# Survey columns used by this project, mapped to the Spanish names the rest
# of the notebook works with.
columnas_seleccionadas = {
    "Employment": "Tipo_empleo",
    "RemoteWork": "Trabajo_remoto",
    "DevType": "Rol",
    "EdLevel": "Nivel_educativo",
    "YearsCodePro": "Anios_experiencia",
    "Country": "Pais",
    "OrgSize": "Tamano_empresa",
    "ConvertedCompYearly": "Salario_anual"
}

# Parse only the needed columns instead of loading the whole survey file and
# throwing most of it away afterwards.
columnas_origen = list(columnas_seleccionadas)
df = pd.read_csv("./data/survey_results_public.csv", usecols=columnas_origen)

# `usecols` keeps the file's column order, so re-select explicitly to get the
# order declared in the mapping before renaming.
df = df[columnas_origen].rename(columns=columnas_seleccionadas)


In [3]:
# Keep only rows with a reported salary; copy so the column assignment below
# operates on an independent frame.
df = df.dropna(subset=["Salario_anual"]).copy()

# Turn the two textual experience buckets into numeric strings, then convert
# the whole column to numbers (values that cannot be parsed become NaN).
experiencia_texto = {"Less than 1 year": "0", "More than 50 years": "51"}
df["Anios_experiencia"] = pd.to_numeric(
    df["Anios_experiencia"].replace(experiencia_texto),
    errors="coerce",
)

# Remove the role column, then discard every row that still has a missing
# value in any remaining column.
df = df.drop(columns=["Rol"]).dropna()
df.shape


(23319, 7)

In [4]:
# Target vector and feature matrix (every column except the salary).
y = df["Salario_anual"]
X = df.drop(columns="Salario_anual")

# Feature names grouped by dtype; each group gets its own preprocessing.
cat_cols = list(X.select_dtypes(include="object").columns)
num_cols = list(X.select_dtypes(include="number").columns)


In [5]:
# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
pasos_categoricos = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
cat_pipeline = Pipeline(steps=pasos_categoricos)

# Numeric features: fill gaps with the median, then standardize.
pasos_numericos = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=pasos_numericos)

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", cat_pipeline, cat_cols),
        ("num", num_pipeline, num_cols),
    ]
)


In [6]:
# Candidate regressors, all seeded for reproducibility.
modelos = {
    # n_jobs=-1 builds the trees on all available cores; scikit-learn's
    # default is a single job, which made this first model slow enough that
    # the recorded run was interrupted before any score was printed.
    "RandomForest": RandomForestRegressor(
        n_estimators=100, random_state=42, n_jobs=-1
    ),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
    "LightGBM": LGBMRegressor(random_state=42)
}

# Per-model array of fold scores, keyed by model name.
resultados = {}
for nombre, modelo in modelos.items():
    # Preprocessing lives inside the pipeline so it is re-fitted on the
    # training part of every fold.
    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", modelo)
    ])
    # 5-fold cross-validation scored with R^2.
    scores = cross_val_score(pipe, X, y, cv=5, scoring="r2")
    resultados[nombre] = scores
    print(f"{nombre} - R2 promedio: {scores.mean():.4f}")


KeyboardInterrupt: 