# IMPORTACIONES

In [1]:
import warnings  
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# EXTRACCIÓN DE DATOS 

In [2]:
train = pd.read_csv('data/salaries_data.csv')
test = pd.read_csv('data/testeo.csv')

# LIMPIEZA BÁSICA

In [3]:
train['sal'] = train.salary_in_usd
train = train.drop(['salary', 'salary_currency', 'salary_in_usd'], axis=1)

# TRANSFORMACION DE LOS DATOS

### TRAIN VS TEST

In [4]:
tr = train.drop('sal', axis=1)

lst_year = [e for e in test.work_year if e not in tr.work_year.unique()]
lst_experience_level = [e for e in test.experience_level if e not in tr.experience_level.unique()]
lst_employment_type = [e for e in test.employment_type if e not in tr.employment_type.unique()]
lst_job = [e for e in test.job_title if e not in tr.job_title.unique()]
lst_emp_res = [e for e in test.employee_residence if e not in tr.employee_residence.unique()]
lst_remo = [e for e in test.remote_ratio if e not in tr.remote_ratio.unique()]
lst_loc = [e for e in test.company_location if e not in tr.company_location.unique()]
lst_siz = [e for e in test.company_size if e not in tr.company_size.unique()]

lst_job, lst_emp_res, lst_loc

(['Principal Data Engineer',
  'Principal Data Engineer',
  'Principal Data Engineer',
  'NLP Engineer'],
 ['CN', 'PH', 'DK', 'HR', 'IQ', 'CO', 'DK', 'AR', 'TN'],
 ['IT', 'HR', 'IQ', 'IL', 'RU', 'RU', 'CO', 'IT', 'MY'])

In [5]:
# Transformación employee_residence
paises_res = {'CN':'IN','PH':'MY', 'DK':'CH', 'HR':'RS', 'IQ':'IN', 'CO':'CL','AR':'BR','TN':'IN'}
test['employee_residence'] = test['employee_residence'].replace(paises_res)
# Transformación company_location
paises_loc = {'IT': 'ES', 'HR': 'SI', 'IQ': 'AE', 'IL': 'ES', 'RU': 'UA', 'CO': 'MX', 'MY': 'IN'}
test['company_location'] = test['company_location'].replace(paises_loc)
#Tranformación job_title
test.loc[[23, 39, 45, 79], 'job_title'] = ['Principal Data Analyst', 'Director of Data Engineering', 
                                           'Head of Data', 'Data Scientist']

In [6]:
train2 = train.copy()
test2 = test.copy()

### TARGET ENCODING

### ORDINAL ENCODING

### DROP

In [7]:
train2 = train2[['experience_level', 'job_title', 'employee_residence', 'sal']]
test2 = test2[['experience_level', 'job_title', 'employee_residence']]

### ONE HOT

In [8]:
# Transf TRAIN
train2 = pd.get_dummies(train2, drop_first=False)
# Transf TEST
test2 = pd.get_dummies(test2, drop_first=False)

In [9]:
# Aquí pongo 'sal' al final del DF.
col = train2.pop('sal')
train2.insert(len(train2.columns), 'sal', col)

# MACHINE LEARNING (REGRESORES)

In [10]:
import time
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import mean_squared_error

In [11]:
X = train2.drop('sal', axis=1)
y = train2.sal

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42)

In [12]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import ElasticNet, BayesianRidge
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

param_RFR = {'n_estimators': [10, 50, 100, 150, 200, 500], 'max_depth': [1, 5, 10, 15, 20]}
param_ETR = {'n_estimators': [10, 50, 100, 150, 200, 500], 'max_depth': [1, 5, 10, 15, 20]}
param_EN = {'alpha': [0.1, 1.0, 5.0, 10.0], 'l1_ratio': [0.25, 0.5, 0.75]}
param_BR = {'alpha_1': [1e-6, 1e-5, 1e-4], 'alpha_2': [1e-6, 1e-5, 1e-4], 'lambda_1': [1e-6, 1e-5, 1e-4], 'lambda_2': [1e-6, 1e-5, 1e-4]}
param_Ridge = {'alpha': [0.1, 1.0, 5.0, 10.0]}
param_Lasso = {'alpha': [0.1, 1.0, 5.0, 10.0]}
param_SVR = {'C': [0.1, 1.0, 5.0, 10.0], 'gamma': ['scale', 'auto']}
param_KNN = {'n_neighbors': [3, 5, 7, 9, 11], 'weights': ['uniform', 'distance']}
param_MLP = {'hidden_layer_sizes': [(100,), (50, 100, 50), (100, 100, 100)], 'alpha': [0.0001, 0.001, 0.01]}
param_DT = {'max_depth': [1, 5, 10, 15, 20]}
param_AB = {'n_estimators': [10, 50, 100, 150, 200, 500], 'learning_rate': [0.01, 0.1, 1.0]}
param_GB = {'n_estimators': [10, 50, 100, 150, 200, 500], 'learning_rate': [0.01, 0.1, 1.0]}
param_XGB = {'n_estimators': [10, 50, 100, 150, 200, 500], 'learning_rate': [0.01, 0.1, 1.0]}

models = [RandomForestRegressor(),
          ExtraTreesRegressor(),
          ElasticNet(),
          BayesianRidge(),
          Ridge(),
          Lasso(),
          SVR(),
          KNeighborsRegressor(),
          MLPRegressor(),
          DecisionTreeRegressor(),
          AdaBoostRegressor(),
          GradientBoostingRegressor(),
          XGBRegressor()]

params = [param_RFR,
          param_ETR,
          param_EN,
          param_BR,
          param_Ridge,
          param_Lasso,
          param_SVR,
          param_KNN,
          param_MLP,
          param_DT,
          param_AB,
          param_GB,
          param_XGB]

top_models = []
for model, param in zip(models, params):
    grid_search = GridSearchCV(estimator=model, param_grid=param, cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)  
    top_models.append((model.__class__.__name__, grid_search.best_score_, grid_search.best_params_))


top_models = sorted(top_models, key=lambda x: x[1], reverse=True)[:5]

for model_name, best_score, best_params in top_models:
    # Buscamos el índice del modelo en la lista de modelos
    model_index = [model.__class__.__name__ for model in models].index(model_name)
    # Obtenemos el modelo y lo configuramos con los mejores parámetros
    model = models[model_index]
    model.set_params(**best_params)
    # Ajustamos el modelo y obtenemos las estadísticas
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    train_score = model.score(X_train, y_train)
    # Imprimimos las estadísticas del modelo
    print(f"Modelo: {model_name}")
    print(f"Mejores parámetros: {best_params}")
    print(f"Acierto CV: {best_score:.2f}")
    print(f"Acierto en entrenamiento: {train_score:.2f}")
    print(f"Acierto en prueba: {test_score:.2f}")
    print("="*50)

Modelo: Ridge
Mejores parámetros: {'alpha': 5.0}
Acierto CV: 0.52
Acierto en entrenamiento: 0.63
Acierto en prueba: 0.40
Modelo: ElasticNet
Mejores parámetros: {'alpha': 0.1, 'l1_ratio': 0.75}
Acierto CV: 0.52
Acierto en entrenamiento: 0.60
Acierto en prueba: 0.42
Modelo: GradientBoostingRegressor
Mejores parámetros: {'learning_rate': 0.1, 'n_estimators': 100}
Acierto CV: 0.51
Acierto en entrenamiento: 0.80
Acierto en prueba: 0.33
Modelo: XGBRegressor
Mejores parámetros: {'learning_rate': 0.1, 'n_estimators': 100}
Acierto CV: 0.51
Acierto en entrenamiento: 0.84
Acierto en prueba: 0.35
Modelo: BayesianRidge
Mejores parámetros: {'alpha_1': 1e-06, 'alpha_2': 1e-06, 'lambda_1': 1e-05, 'lambda_2': 0.0001}
Acierto CV: 0.50
Acierto en entrenamiento: 0.69
Acierto en prueba: 0.36


# EVALUACION RMSE

In [13]:
from sklearn.metrics import mean_squared_error
import numpy as np

for model_name, best_score, best_params in top_models:
    # Buscamos el índice del modelo en la lista de modelos
    model_index = [model.__class__.__name__ for model in models].index(model_name)
    # Obtenemos el modelo y lo configuramos con los mejores parámetros
    model = models[model_index]
    model.set_params(**best_params)
    # Ajustamos el modelo y obtenemos las estadísticas
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Calculamos el RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # Imprimimos las estadísticas del modelo
    print(f"Modelo: {model_name}")
    print(f"Mejores parámetros: {best_params}")
    print(f"Acierto CV: {best_score:.2f}")
    print(f"Acierto en entrenamiento: {train_score:.2f}")
    print(f"Acierto en prueba: {test_score:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print("="*50)

Modelo: Ridge
Mejores parámetros: {'alpha': 5.0}
Acierto CV: 0.52
Acierto en entrenamiento: 0.69
Acierto en prueba: 0.36
RMSE: 48467.38
Modelo: ElasticNet
Mejores parámetros: {'alpha': 0.1, 'l1_ratio': 0.75}
Acierto CV: 0.52
Acierto en entrenamiento: 0.69
Acierto en prueba: 0.36
RMSE: 47736.97
Modelo: GradientBoostingRegressor
Mejores parámetros: {'learning_rate': 0.1, 'n_estimators': 100}
Acierto CV: 0.51
Acierto en entrenamiento: 0.69
Acierto en prueba: 0.36
RMSE: 50588.25
Modelo: XGBRegressor
Mejores parámetros: {'learning_rate': 0.1, 'n_estimators': 100}
Acierto CV: 0.51
Acierto en entrenamiento: 0.69
Acierto en prueba: 0.36
RMSE: 50416.91
Modelo: BayesianRidge
Mejores parámetros: {'alpha_1': 1e-06, 'alpha_2': 1e-06, 'lambda_1': 1e-05, 'lambda_2': 0.0001}
Acierto CV: 0.50
Acierto en entrenamiento: 0.69
Acierto en prueba: 0.36
RMSE: 50036.42
