In [1]:
# IMPORTACIONES
import warnings  
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# EXTRACCIÓN DE DATOS 
train = pd.read_csv('data/salaries_data.csv')
test = pd.read_csv('data/testeo.csv')

# LIMPIEZA BÁSICA
train['sal'] = train.salary_in_usd
train = train.drop(['salary', 'salary_currency', 'salary_in_usd'], axis=1)

# TRANSFORMACION DE LOS DATOS

In [2]:
train.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,employee_residence,remote_ratio,company_location,company_size,sal
0,2022,SE,FT,Data Engineer,US,100,US,M,140250
1,2022,SE,FT,Data Engineer,US,100,US,M,135000
2,2021,MI,FT,BI Data Analyst,US,100,US,M,100000
3,2021,MI,CT,ML Engineer,US,100,US,L,270000
4,2021,MI,FT,Data Engineer,RO,0,US,L,26005


In [3]:
train2 = train.copy()
test2 = test.copy()

### TARGET ENCODING

In [4]:
from category_encoders import TargetEncoder
test2['sal'] = 0

encoder_loc = TargetEncoder(cols=['company_location'], handle_unknown='ignore')
train2_encoded = encoder_loc.fit_transform(train2[['sal', 'company_size', 'company_location']], train2['sal'])
train2['company_location'] = train2_encoded.company_location
#Test
test2_encoded = encoder_loc.transform(test2[['sal', 'company_size', 'company_location']])
test2['company_location'] = test2_encoded['company_location']

encoder_job = TargetEncoder(cols=['job_title'], handle_unknown='ignore')
train_encoded = encoder_job.fit_transform(train2, train2.sal)
train2['job_title'] = train_encoded.job_title
#Test
test_encoded = encoder_job.transform(test2)
test2['job_title'] = test_encoded.job_title

encoder_res = TargetEncoder(cols=['employee_residence'], handle_unknown='ignore')
train3 = encoder_res.fit_transform(train2[['sal','employee_residence','company_size', 'company_location']], train2['sal'])
train2['employee_residence'] = train3.employee_residence
#Test
test3 = encoder_res.transform(test2[['sal','employee_residence','company_size', 'company_location']])
test2['employee_residence'] = test3.employee_residence

In [5]:
train2 = train.drop(['remote_ratio', 'employment_type', 'work_year','company_size'], axis=1)
test2 = test.drop(['remote_ratio', 'employment_type', 'work_year','company_size'], axis=1)

### ONE HOT ENCODING

In [6]:
# Transf TRAIN
train2 = pd.get_dummies(train2, drop_first=False)
# Transf TEST
test2 = pd.get_dummies(test2, drop_first=False)

In [7]:
'''# Transf TRAIN
train2 = pd.get_dummies(train2, columns=['employment_type', 'job_title', 'employee_residence', 'remote_ratio', 'company_location'], drop_first=False)
# Transf TEST
test2 = pd.get_dummies(test2, columns=['employment_type', 'job_title', 'employee_residence', 'remote_ratio', 'company_location'], drop_first=False)
'''

"# Transf TRAIN\ntrain2 = pd.get_dummies(train2, columns=['employment_type', 'job_title', 'employee_residence', 'remote_ratio', 'company_location'], drop_first=False)\n# Transf TEST\ntest2 = pd.get_dummies(test2, columns=['employment_type', 'job_title', 'employee_residence', 'remote_ratio', 'company_location'], drop_first=False)\n"

In [8]:
train2.head()

Unnamed: 0,sal,experience_level_EN,experience_level_EX,experience_level_MI,experience_level_SE,job_title_3D Computer Vision Researcher,job_title_AI Scientist,job_title_Analytics Engineer,job_title_Applied Data Scientist,job_title_Applied Machine Learning Scientist,...,company_location_PK,company_location_PL,company_location_PT,company_location_RO,company_location_SG,company_location_SI,company_location_TR,company_location_UA,company_location_US,company_location_VN
0,140250,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,135000,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,100000,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,270000,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,26005,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [9]:
test2.head()

Unnamed: 0,experience_level_EN,experience_level_EX,experience_level_MI,experience_level_SE,job_title_AI Scientist,job_title_Analytics Engineer,job_title_Applied Machine Learning Scientist,job_title_BI Data Analyst,job_title_Big Data Engineer,job_title_Business Data Analyst,...,company_location_IL,company_location_IN,company_location_IQ,company_location_IT,company_location_JP,company_location_MX,company_location_MY,company_location_PL,company_location_RU,company_location_US
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [10]:
len(train2.columns), len(test2.columns)

(145, 83)

# MACHINE LEARNING (REGRESORES)

In [11]:
import time
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import mean_squared_error

In [12]:
X = train2.drop('sal', axis=1)
y = train2.sal

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42)

In [13]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import ElasticNet, BayesianRidge
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

param_RFR = {'n_estimators': [10, 50, 100, 150, 200, 500], 'max_depth': [1, 5, 10, 15, 20]}
param_ETR = {'n_estimators': [10, 50, 100, 150, 200, 500], 'max_depth': [1, 5, 10, 15, 20]}
param_EN = {'alpha': [0.1, 1.0, 5.0, 10.0], 'l1_ratio': [0.25, 0.5, 0.75]}
param_BR = {'alpha_1': [1e-6, 1e-5, 1e-4], 'alpha_2': [1e-6, 1e-5, 1e-4], 'lambda_1': [1e-6, 1e-5, 1e-4], 'lambda_2': [1e-6, 1e-5, 1e-4]}
param_Ridge = {'alpha': [0.1, 1.0, 5.0, 10.0]}
param_Lasso = {'alpha': [0.1, 1.0, 5.0, 10.0]}
param_SVR = {'C': [0.1, 1.0, 5.0, 10.0], 'gamma': ['scale', 'auto']}
param_KNN = {'n_neighbors': [3, 5, 7, 9, 11], 'weights': ['uniform', 'distance']}
param_MLP = {'hidden_layer_sizes': [(100,), (50, 100, 50), (100, 100, 100)], 'alpha': [0.0001, 0.001, 0.01]}
param_DT = {'max_depth': [1, 5, 10, 15, 20]}
param_AB = {'n_estimators': [10, 50, 100, 150, 200, 500], 'learning_rate': [0.01, 0.1, 1.0]}
param_GB = {'n_estimators': [10, 50, 100, 150, 200, 500], 'learning_rate': [0.01, 0.1, 1.0]}
param_XGB = {'n_estimators': [10, 50, 100, 150, 200, 500], 'learning_rate': [0.01, 0.1, 1.0]}

models = [RandomForestRegressor(),
          ExtraTreesRegressor(),
          ElasticNet(),
          BayesianRidge(),
          Ridge(),
          Lasso(),
          SVR(),
          KNeighborsRegressor(),
          MLPRegressor(),
          DecisionTreeRegressor(),
          AdaBoostRegressor(),
          GradientBoostingRegressor(),
          XGBRegressor()]

params = [param_RFR,
          param_ETR,
          param_EN,
          param_BR,
          param_Ridge,
          param_Lasso,
          param_SVR,
          param_KNN,
          param_MLP,
          param_DT,
          param_AB,
          param_GB,
          param_XGB]

top_models = []
for model, param in zip(models, params):
    grid_search = GridSearchCV(estimator=model, param_grid=param, cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)  
    top_models.append((model.__class__.__name__, grid_search.best_score_, grid_search.best_params_))


top_models = sorted(top_models, key=lambda x: x[1], reverse=True)[:5]

for model_name, best_score, best_params in top_models:
    # Buscamos el índice del modelo en la lista de modelos
    model_index = [model.__class__.__name__ for model in models].index(model_name)
    # Obtenemos el modelo y lo configuramos con los mejores parámetros
    model = models[model_index]
    model.set_params(**best_params)
    # Ajustamos el modelo y obtenemos las estadísticas
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    train_score = model.score(X_train, y_train)
    # Imprimimos las estadísticas del modelo
    print(f"Modelo: {model_name}")
    print(f"Mejores parámetros: {best_params}")
    print(f"Acierto CV: {best_score:.2f}")
    print(f"Acierto en entrenamiento: {train_score:.2f}")
    print(f"Acierto en prueba: {test_score:.2f}")
    print("="*50)

Modelo: Ridge
Mejores parámetros: {'alpha': 5.0}
Acierto CV: 0.52
Acierto en entrenamiento: 0.64
Acierto en prueba: 0.40
Modelo: ElasticNet
Mejores parámetros: {'alpha': 0.1, 'l1_ratio': 0.75}
Acierto CV: 0.51
Acierto en entrenamiento: 0.60
Acierto en prueba: 0.42
Modelo: GradientBoostingRegressor
Mejores parámetros: {'learning_rate': 0.1, 'n_estimators': 150}
Acierto CV: 0.51
Acierto en entrenamiento: 0.83
Acierto en prueba: 0.34
Modelo: BayesianRidge
Mejores parámetros: {'alpha_1': 1e-06, 'alpha_2': 1e-06, 'lambda_1': 1e-05, 'lambda_2': 0.0001}
Acierto CV: 0.50
Acierto en entrenamiento: 0.69
Acierto en prueba: 0.37
Modelo: XGBRegressor
Mejores parámetros: {'learning_rate': 0.1, 'n_estimators': 50}
Acierto CV: 0.50
Acierto en entrenamiento: 0.81
Acierto en prueba: 0.39


# EVALUACION RMSE

In [14]:
from sklearn.metrics import mean_squared_error
import numpy as np

for model_name, best_score, best_params in top_models:
    # Buscamos el índice del modelo en la lista de modelos
    model_index = [model.__class__.__name__ for model in models].index(model_name)
    # Obtenemos el modelo y lo configuramos con los mejores parámetros
    model = models[model_index]
    model.set_params(**best_params)
    # Ajustamos el modelo y obtenemos las estadísticas
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Calculamos el RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # Imprimimos las estadísticas del modelo
    print(f"Modelo: {model_name}")
    print(f"Mejores parámetros: {best_params}")
    print(f"Acierto CV: {best_score:.2f}")
    print(f"Acierto en entrenamiento: {train_score:.2f}")
    print(f"Acierto en prueba: {test_score:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print("="*50)

Modelo: Ridge
Mejores parámetros: {'alpha': 5.0}
Acierto CV: 0.52
Acierto en entrenamiento: 0.81
Acierto en prueba: 0.39
RMSE: 48328.63
Modelo: ElasticNet
Mejores parámetros: {'alpha': 0.1, 'l1_ratio': 0.75}
Acierto CV: 0.51
Acierto en entrenamiento: 0.81
Acierto en prueba: 0.39
RMSE: 47649.53
Modelo: GradientBoostingRegressor
Mejores parámetros: {'learning_rate': 0.1, 'n_estimators': 150}
Acierto CV: 0.51
Acierto en entrenamiento: 0.81
Acierto en prueba: 0.39
RMSE: 50797.93
Modelo: BayesianRidge
Mejores parámetros: {'alpha_1': 1e-06, 'alpha_2': 1e-06, 'lambda_1': 1e-05, 'lambda_2': 0.0001}
Acierto CV: 0.50
Acierto en entrenamiento: 0.81
Acierto en prueba: 0.39
RMSE: 49489.07
Modelo: XGBRegressor
Mejores parámetros: {'learning_rate': 0.1, 'n_estimators': 50}
Acierto CV: 0.50
Acierto en entrenamiento: 0.81
Acierto en prueba: 0.39
RMSE: 49032.23
