In [1]:
# IMPORTACIONES
import warnings  
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# DATA EXTRACTION
# Labelled training data and the unlabelled test set, read from the local data/ folder.
train = pd.read_csv('data/salaries_data.csv')
test = pd.read_csv('data/testeo.csv')

# BASIC CLEANING
# Expose the USD salary as the target column `sal` (appended as the last column)
# and drop the three raw salary columns in the same step.
train = (
    train
    .assign(sal=train['salary_in_usd'])
    .drop(columns=['salary', 'salary_currency', 'salary_in_usd'])
)

In [4]:
# Preview the first rows to check that `sal` replaced the raw salary columns.
train.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,employee_residence,remote_ratio,company_location,company_size,sal
0,2022,SE,FT,Data Engineer,US,100,US,M,140250
1,2022,SE,FT,Data Engineer,US,100,US,M,135000
2,2021,MI,FT,BI Data Analyst,US,100,US,M,100000
3,2021,MI,CT,ML Engineer,US,100,US,L,270000
4,2021,MI,FT,Data Engineer,RO,0,US,L,26005


In [None]:
# Feature columns used for modelling, declared once so the train and test
# selections cannot drift apart.
feature_cols = ['work_year', 'company_size', 'job_title', 'experience_level',
                'company_location', 'employee_residence']

# Take explicit copies: later cells assign encoded columns into train2/test2,
# and assigning into a slice of another frame raises SettingWithCopyWarning
# (silenced by the global warnings filter at the top of the notebook).
train2 = train[feature_cols + ['sal']].copy()  # features + target
test2 = test[feature_cols].copy()              # features only (no target in the test file)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# One ordinal encoder per ordered feature. `work_year` learns its categories
# from the training data; the other two use an explicit low-to-high order.
encoder_year = OrdinalEncoder()
encoder_size = OrdinalEncoder(categories=[['S', 'M', 'L']])
encoder_exp = OrdinalEncoder(categories=[['EN', 'MI', 'SE', 'EX']])

ordinal_encoders = {
    'work_year': encoder_year,
    'company_size': encoder_size,
    'experience_level': encoder_exp,
}

# Train: fit each encoder and replace the column with its ordinal codes.
for column, encoder in ordinal_encoders.items():
    train2[column] = encoder.fit_transform(train2[[column]])

# Test: reuse the encoders fitted on train (transform only, never refit).
for column, encoder in ordinal_encoders.items():
    test2[column] = encoder.transform(test2[[column]])

In [None]:
# One-hot encode the remaining categorical columns of TRAIN.
# Start from train2 (the selected, ordinal-encoded frame) rather than the raw
# `train` frame; encoding `train` here would overwrite train2 and discard the
# column selection and ordinal encoding done in the previous cells.
train2 = pd.get_dummies(train2, drop_first=False)

# Same transformation for TEST, starting from test2 for the same reason.
test2 = pd.get_dummies(test2, drop_first=False)

# get_dummies only creates columns for categories present in each frame, so
# train and test can end up with different columns. Align test to the training
# feature columns (everything except the target `sal`): categories missing from
# test become all-zero columns, categories unseen in train are dropped.
test2 = test2.reindex(columns=train2.columns.drop('sal'), fill_value=0)

# MACHINE LEARNING (REGRESORES)

In [None]:
import time
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import mean_squared_error

In [None]:
# Separate the target column (`sal`) from the predictors.
y = train2['sal']
X = train2.drop(columns='sal')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = tts(X, y, random_state=42, test_size=0.2)

In [None]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import ElasticNet, BayesianRidge
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

# Hyper-parameter grids, one per candidate regressor.

# Tree ensembles: number of trees x maximum tree depth.
param_RFR = dict(n_estimators=[10, 50, 100, 150, 200, 500], max_depth=[1, 5, 10, 15, 20])
param_ETR = dict(n_estimators=[10, 50, 100, 150, 200, 500], max_depth=[1, 5, 10, 15, 20])

# Linear models: regularisation settings.
param_EN = dict(alpha=[0.1, 1.0, 5.0, 10.0], l1_ratio=[0.25, 0.5, 0.75])
param_BR = dict(
    alpha_1=[1e-6, 1e-5, 1e-4],
    alpha_2=[1e-6, 1e-5, 1e-4],
    lambda_1=[1e-6, 1e-5, 1e-4],
    lambda_2=[1e-6, 1e-5, 1e-4],
)
param_Ridge = dict(alpha=[0.1, 1.0, 5.0, 10.0])
param_Lasso = dict(alpha=[0.1, 1.0, 5.0, 10.0])

# Kernel, neighbour and neural-network models.
param_SVR = dict(C=[0.1, 1.0, 5.0, 10.0], gamma=['scale', 'auto'])
param_KNN = dict(n_neighbors=[3, 5, 7, 9, 11], weights=['uniform', 'distance'])
param_MLP = dict(
    hidden_layer_sizes=[(100,), (50, 100, 50), (100, 100, 100)],
    alpha=[0.0001, 0.001, 0.01],
)

# Single tree and boosting models.
param_DT = dict(max_depth=[1, 5, 10, 15, 20])
param_AB = dict(n_estimators=[10, 50, 100, 150, 200, 500], learning_rate=[0.01, 0.1, 1.0])
param_GB = dict(n_estimators=[10, 50, 100, 150, 200, 500], learning_rate=[0.01, 0.1, 1.0])
param_XGB = dict(n_estimators=[10, 50, 100, 150, 200, 500], learning_rate=[0.01, 0.1, 1.0])

# Candidate regressors, in the same order as the grids in `params`.
# Estimators with a stochastic component (bootstrapping, random feature or
# split selection, weight initialisation, boosting resampling) get a fixed
# random_state, the same seed as the train/test split, so that CV scores, the
# resulting ranking and the later refits are reproducible from run to run.
models = [RandomForestRegressor(random_state=42),
          ExtraTreesRegressor(random_state=42),
          ElasticNet(),
          BayesianRidge(),
          Ridge(),
          Lasso(),
          SVR(),
          KNeighborsRegressor(),
          MLPRegressor(random_state=42),
          DecisionTreeRegressor(random_state=42),
          AdaBoostRegressor(random_state=42),
          GradientBoostingRegressor(random_state=42),
          XGBRegressor(random_state=42)]

# Hyper-parameter grids in the same order as `models`; the two lists are
# paired by position, so keep them in sync when adding or removing entries.
params = [
    param_RFR, param_ETR,                            # tree ensembles
    param_EN, param_BR, param_Ridge, param_Lasso,    # linear models
    param_SVR, param_KNN, param_MLP,                 # kernel / neighbours / neural net
    param_DT, param_AB, param_GB, param_XGB,         # single tree and boosting
]

# Run a 5-fold grid search for every (estimator, grid) pair on the training
# split and record (class name, best mean CV score, best parameters).
top_models = []
for model, param in zip(models, params):
    grid_search = GridSearchCV(model, param, cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    top_models.append(
        (model.__class__.__name__, grid_search.best_score_, grid_search.best_params_)
    )

# Keep only the five entries with the highest CV score, best first.
top_models.sort(key=lambda entry: entry[1], reverse=True)
top_models = top_models[:5]

# Class names of the candidate estimators, computed once; they are used to map
# each grid-search result back to its estimator instance in `models`.
model_names = [model.__class__.__name__ for model in models]

for model_name, best_score, best_params in top_models:
    # Fetch the estimator for this result and apply its best parameters
    model_index = model_names.index(model_name)
    model = models[model_index]
    model.set_params(**best_params)
    # Refit on the training split, then score on both splits
    # (score() is R^2 for scikit-learn regressors)
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    # Print the per-model summary
    report = [
        f"Modelo: {model_name}",
        f"Mejores parámetros: {best_params}",
        f"Acierto CV: {best_score:.2f}",
        f"Acierto en entrenamiento: {train_score:.2f}",
        f"Acierto en prueba: {test_score:.2f}",
        "="*50,
    ]
    print("\n".join(report))

# EVALUACION RMSE

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# For each top-ranked model: refit with its best parameters, then report the
# CV score, the train/test scores and the RMSE on the held-out split.
for model_name, best_score, best_params in top_models:
    # Find the index of the estimator whose class name matches this result
    model_index = [model.__class__.__name__ for model in models].index(model_name)
    # Fetch the estimator and configure it with the best parameters
    model = models[model_index]
    model.set_params(**best_params)
    # Fit the model and predict on the held-out split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Score THIS model on both splits. Previously these two values were not
    # computed here, so the report printed whatever the last iteration of the
    # earlier loop had left behind: the same stale numbers for every model.
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    # Compute the RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # Print the model statistics
    print(f"Modelo: {model_name}")
    print(f"Mejores parámetros: {best_params}")
    print(f"Acierto CV: {best_score:.2f}")
    print(f"Acierto en entrenamiento: {train_score:.2f}")
    print(f"Acierto en prueba: {test_score:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print("="*50)