In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
pd.options.plotting.backend = "plotly"
sns.set(rc={'figure.figsize':(8,8)});

import umap
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.tree import ExtraTreeRegressor, plot_tree
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from pycaret.regression import *
import h2o
from h2o.automl import H2OAutoML

In [2]:
train = pd.read_csv('data/salaries_data.csv')
test = pd.read_csv('data/test.csv')

In [3]:
def transform_data(train: pd.DataFrame , test: pd.DataFrame, target: str):
    
    exp_dict = {'EN': 1, 'MI':2, 'SE': 3, 'EX': 4}
    
    company_size_dict = {'S':0, 'M':1,'L':2}
    
    jobs_dict = dict(zip(set(train.job_title.unique().tolist()+test.job_title.unique().tolist()), 
                         [i for i in range(len(set(train.job_title.unique().tolist()+test.job_title.unique().tolist())))]))
    
    salary_currency_dict = dict(zip(set(train.salary_currency.unique().tolist() + test.salary_currency.unique().tolist()), 
                                    [i for i in range(len(set(train.salary_currency.unique().tolist() + test.salary_currency.unique().tolist())))]))
    
    company_location_dict = dict(zip(set(train.company_location.unique().tolist()+test.company_location.unique().tolist()), 
                                     [i for i in range(len(set(train.company_location.unique().tolist()+test.company_location.unique().tolist())))]))
    
    employee_residence_dict = dict(zip(set(train.employee_residence.unique().tolist()+test.employee_residence.unique().tolist()), 
                                     [i for i in range(len(set(train.employee_residence.unique().tolist()+test.employee_residence.unique().tolist())))]))
    
    employment_type_dict = dict(zip(set(train.employment_type.unique().tolist()+test.employment_type.unique().tolist()), 
                                     [i for i in range(len(set(train.employment_type.unique().tolist()+test.employment_type.unique().tolist())))]))
    
    train.experience_level = train.experience_level.apply(lambda x: exp_dict[x])
    
    test.experience_level = test.experience_level.apply(lambda x: exp_dict[x])
    
    train.job_title = train.job_title.apply(lambda x: jobs_dict[x])
    
    test.job_title = test.job_title.apply(lambda x: jobs_dict[x])
    
    train.salary_currency = train.salary_currency.apply(lambda x: salary_currency_dict[x])
    
    test.salary_currency = test.salary_currency.apply(lambda x: salary_currency_dict[x])
    
    train.company_location = train.company_location.apply(lambda x: company_location_dict[x])
    
    test.company_location = test.company_location.apply(lambda x: company_location_dict[x])
    
    train.company_size = train.company_size.apply(lambda x: company_size_dict[x])
    
    test.company_size = test.company_size.apply(lambda x: company_size_dict[x])
    
    train.employee_residence = train.employee_residence.apply(lambda x: employee_residence_dict[x])
    
    test.employee_residence = test.employee_residence.apply(lambda x: employee_residence_dict[x])
    
    train.employment_type = train.employment_type.apply(lambda x: employment_type_dict[x])
    
    test.employment_type = test.employment_type.apply(lambda x: employment_type_dict[x])
    
    X = train.drop(target, axis=1)
    y = train[target]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, test_size=.2)
    
    scaler = MinMaxScaler().fit(X_train)
    
    X_s = scaler.transform(X)
    
    X_train_s = scaler.transform(X_train)
    
    X_test_s = scaler.transform(X_test)
    
    test_s = scaler.transform(test)
    
    return X_s, y, X_train, X_test, y_train, y_test, test_s
    

In [4]:
X, y, X_train, X_test, y_train, y_test, new_test = transform_data(train,test,'salary_in_usd')

In [5]:
from sklearn.metrics import mean_squared_error

In [6]:
rf = RandomForestRegressor()
xgb = XGBRegressor()
xgbr = XGBRFRegressor()
linreg = LinearRegression()
trees = ExtraTreeRegressor()
knn = KNeighborsRegressor()
gb = GradientBoostingRegressor()
cat = CatBoostRegressor(verbose=0)
lgbm = LGBMRegressor()

models = [rf, xgb, xgbr, linreg, trees, knn, gb, cat]

In [52]:
train_preds = pd.DataFrame()

test_preds = pd.DataFrame()

eval_train = pd.DataFrame()

eval_test = pd.DataFrame()

new_test_preds = pd.DataFrame()

total_preds = pd.DataFrame()

for model in models:
    name = str(model)[:14]
    print(f'\nEntrenando ..... {name}\n')
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    rmse_train = mean_squared_error(y_train, train_pred, squared=False)
    test_pred = model.predict(X_test)
    rmse_test = mean_squared_error(y_test, test_pred, squared=False)
    
    model.fit(X,y)
    
    new_test_pred = model.predict(new_test)
    
    total_pred = model.predict(X)
    
    train_preds[f'{name}'] = train_pred
    test_preds[f'{name}'] = test_pred
    new_test_preds[f'{name}'] = new_test_pred
    total_preds[f'{name}'] = total_pred
    
    eval_train[f'{name}'] = rmse_train
    eval_test[f'{name}'] = rmse_test
    
    print(f'RMSE de {name} en train: {rmse_train}')
    print(f'RMSE de {name} en test: {rmse_test}')


Entrenando ..... RandomForestRe

RMSE de RandomForestRe en train: 7158.798262398673
RMSE de RandomForestRe en test: 18689.153821715605

Entrenando ..... XGBRegressor(b

RMSE de XGBRegressor(b en train: 114.57111107261204
RMSE de XGBRegressor(b en test: 11892.517307971115

Entrenando ..... XGBRFRegressor

RMSE de XGBRFRegressor en train: 5836.402507769662
RMSE de XGBRFRegressor en test: 14105.360726360554

Entrenando ..... LinearRegressi

RMSE de LinearRegressi en train: 59123.95515459107
RMSE de LinearRegressi en test: 58471.04485762561

Entrenando ..... ExtraTreeRegre

RMSE de ExtraTreeRegre en train: 0.0
RMSE de ExtraTreeRegre en test: 25966.446095875348

Entrenando ..... KNeighborsRegr

RMSE de KNeighborsRegr en train: 34206.63674383379
RMSE de KNeighborsRegr en test: 32665.76598766972

Entrenando ..... GradientBoosti

RMSE de GradientBoosti en train: 4047.5915087901753
RMSE de GradientBoosti en test: 11400.992301732145

Entrenando ..... <catboost.core

RMSE de <catboost.core en tr

In [8]:
train_preds

Unnamed: 0,RandomForestRe,XGBRegressor(b,XGBRFRegressor,LinearRegressi,ExtraTreeRegre,KNeighborsRegr,GradientBoosti,<catboost.core
0,59659.41,60749.695312,58354.585938,116335.966454,60757.0,61266.2,58999.164228,59966.284565
1,222146.97,224993.000000,217802.015625,158699.195989,225000.0,222044.0,228849.888404,223056.294462
2,109347.20,109360.890625,106462.859375,147674.360546,109280.0,93111.2,105243.099897,109560.595511
3,105120.20,105064.515625,102029.625000,136708.503468,105000.0,104978.0,101941.577837,102905.596113
4,135469.84,135046.609375,139068.375000,123994.462612,135000.0,135000.0,136008.175271,134958.578832
...,...,...,...,...,...,...,...,...
395,119680.20,120439.015625,116873.898438,136570.773983,120600.0,108859.6,116640.984270,121310.177631
396,165331.84,165271.328125,162988.484375,146889.317962,165400.0,165160.0,166214.793964,166506.404795
397,134989.40,135122.015625,138776.234375,128331.192721,135000.0,135000.0,136008.175271,134972.884777
398,21768.28,18500.212891,22268.992188,80092.655354,18442.0,18929.0,27352.491440,20987.360089


In [53]:
lgbm.fit(total_preds, y)

In [10]:
mean_squared_error(y_train, lgbm.predict(train_preds), squared=False)

15517.441922229418

In [11]:
mean_squared_error(y_test, lgbm.predict(test_preds), squared=False)

19782.810598126038

In [54]:
pred_new_test = lgbm.predict(new_test_preds)

In [55]:
len(pred_new_test)

107

In [56]:
pred_new_test

array([277424.8560271 ,  89554.48398146,  88714.33866574,  89817.36971339,
        11020.41199916, 266070.53256425,  62646.76113147,  48767.03480593,
        38751.95074389, 203258.75049509,  84958.18198141, 109012.24637539,
       130769.83534233,  13891.6412404 ,  49476.04968758,  45426.55461597,
        78247.73523982, 110461.33823754,  68335.65939318,  68494.98812077,
        68880.53229385,  53554.13667434, 117153.87893482, 179907.85960535,
        12108.14536306,  65461.91370309,  99434.31024497, 204465.53698498,
        68129.57334273, 212500.50198427,  90237.23621536,  84449.4435868 ,
        64344.45431882,  70227.50313028,  16508.65077299,  61726.68447344,
       178137.08951199,  56351.61415398,  17730.96675666, 189801.7724963 ,
        72200.05099949,  24890.62577775,  86074.06228187, 164031.41597873,
        89779.23770621, 320077.71428993, 115702.42305459,  63979.01131541,
        72847.4705778 ,  66836.64742652,  44375.01406776,  40129.64587282,
       131897.70280111, 1

In [57]:
ids = [i for i in range(107)]

In [58]:
preds = pd.DataFrame()

In [61]:
preds['id'] = ids
preds['salary_in_usd'] = pred_new_test


Unnamed: 0,id,preds,salary_in_usd
0,0,277424.856027,277424.856027
1,1,89554.483981,89554.483981
2,2,88714.338666,88714.338666
3,3,89817.369713,89817.369713
4,4,11020.411999,11020.411999
...,...,...,...
102,102,169861.446279,169861.446279
103,103,184437.978434,184437.978434
104,104,220093.850379,220093.850379
105,105,59746.283838,59746.283838


In [64]:
preds = preds.drop('preds', axis=1)

In [65]:
preds

Unnamed: 0,id,salary_in_usd
0,0,277424.856027
1,1,89554.483981
2,2,88714.338666
3,3,89817.369713
4,4,11020.411999
...,...,...
102,102,169861.446279
103,103,184437.978434
104,104,220093.850379
105,105,59746.283838


In [67]:
preds.to_csv('prueba_preds_stacked_model.csv', index=False)