In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score

import warnings
warnings.simplefilter('ignore')

In [2]:
# X = pd.read_csv('data/processed/processed.csv')
# y = pd.read_csv('data/raw/train_salaries.csv')
train_df = pd.merge(left=pd.read_csv('data/processed/processed.csv'), 
                    right=pd.read_csv('data/raw/train_salaries.csv'), how='inner')
train_df.info()
# del X, y

<class 'pandas.core.frame.DataFrame'>
Int64Index: 999995 entries, 0 to 999994
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   index                999995 non-null  int64  
 1   jobId                999995 non-null  object 
 2   companyId            999995 non-null  int64  
 3   jobType              999995 non-null  int64  
 4   degree               999995 non-null  int64  
 5   major                999995 non-null  int64  
 6   industry             999995 non-null  int64  
 7   yearsExperience      999995 non-null  int64  
 8   milesFromMetropolis  999995 non-null  int64  
 9   salary               999995 non-null  int64  
 10  group_mean           999995 non-null  float64
 11  group_max            999995 non-null  int64  
 12  group_min            999995 non-null  int64  
 13  group_std            999995 non-null  float64
 14  group_median         999995 non-null  float64
dtypes: float64(3), in

In [3]:
# we just take 10% sample to do a fast check
train_df = train_df.sample(frac=0.1)

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 123198 to 880721
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   index                100000 non-null  int64  
 1   jobId                100000 non-null  object 
 2   companyId            100000 non-null  int64  
 3   jobType              100000 non-null  int64  
 4   degree               100000 non-null  int64  
 5   major                100000 non-null  int64  
 6   industry             100000 non-null  int64  
 7   yearsExperience      100000 non-null  int64  
 8   milesFromMetropolis  100000 non-null  int64  
 9   salary               100000 non-null  int64  
 10  group_mean           100000 non-null  float64
 11  group_max            100000 non-null  int64  
 12  group_min            100000 non-null  int64  
 13  group_std            100000 non-null  float64
 14  group_median         100000 non-null  float64
dtypes: float64(3

In [5]:
X = train_df.drop(['index', 'jobId', 'salary'], axis=1)

In [6]:
y = train_df['salary']

In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 123198 to 880721
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   companyId            100000 non-null  int64  
 1   jobType              100000 non-null  int64  
 2   degree               100000 non-null  int64  
 3   major                100000 non-null  int64  
 4   industry             100000 non-null  int64  
 5   yearsExperience      100000 non-null  int64  
 6   milesFromMetropolis  100000 non-null  int64  
 7   group_mean           100000 non-null  float64
 8   group_max            100000 non-null  int64  
 9   group_min            100000 non-null  int64  
 10  group_std            100000 non-null  float64
 11  group_median         100000 non-null  float64
dtypes: float64(3), int64(9)
memory usage: 9.9 MB


In [8]:
rfe = RandomForestRegressor()
np.mean(cross_val_score(rfe, X, y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1))

-330.6383806447608

In [9]:
def rfe_cv(n_estimators, min_samples_split, max_features, max_depth):
    val = np.mean(cross_val_score(RandomForestRegressor(n_estimators=int(n_estimators), 
                                                 min_samples_split=int(min_samples_split),
                                                 max_features=min(max_features, 0.999),
                                                 max_depth=int(max_depth), random_state=42),
                         X, y, scoring='neg_mean_squared_error', cv=5, n_jobs=-1))
    
    return val

In [10]:
# define Bayesian Optimazation
rfe_bo = BayesianOptimization( 
             rfe_cv,
             {'n_estimators': (10, 250),
             'min_samples_split': (2, 25),
             'max_features': (0.1, 0.999),
             'max_depth': (5, 30)})

In [11]:
%%time
# start the optimazation
rfe_bo.maximize()

|   iter    |  target   | max_depth | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m-320.5   [0m | [0m 19.06   [0m | [0m 0.4986  [0m | [0m 14.94   [0m | [0m 161.8   [0m |
| [0m 2       [0m | [0m-334.4   [0m | [0m 29.57   [0m | [0m 0.2382  [0m | [0m 13.73   [0m | [0m 217.7   [0m |
| [0m 3       [0m | [0m-323.8   [0m | [0m 18.24   [0m | [0m 0.3669  [0m | [0m 4.258   [0m | [0m 194.6   [0m |
| [0m 4       [0m | [0m-371.0   [0m | [0m 29.22   [0m | [0m 0.1388  [0m | [0m 4.448   [0m | [0m 93.41   [0m |
| [95m 5       [0m | [95m-317.9   [0m | [95m 13.27   [0m | [95m 0.5016  [0m | [95m 19.71   [0m | [95m 167.6   [0m |
| [0m 6       [0m | [0m-407.0   [0m | [0m 5.0     [0m | [0m 0.999   [0m | [0m 25.0    [0m | [0m 10.0    [0m |
| [0m 7       [0m | [0m-402.9   [0m | [0m 5.0     [0m | [0m 0.999   [0m | [0m 2.0     [0m | [0m 250.0  

In [13]:
rfe_bo.max

{'target': -317.9425770507151,
 'params': {'max_depth': 13.268505574426682,
  'max_features': 0.5016082654057604,
  'min_samples_split': 19.70639834666332,
  'n_estimators': 167.58817625981817}}

In [14]:
rfe_optimized = RandomForestRegressor(n_estimators=166, max_depth=13, max_features=0.5, min_samples_split=20)
np.mean(cross_val_score(rfe_optimized, X, y, scoring='neg_mean_squared_error', cv=5, n_jobs=-1))

-317.69608161310316

### Try Gradient Boosting Machine

In [15]:
%%time
from sklearn.ensemble import GradientBoostingRegressor
GBR = GradientBoostingRegressor()
np.mean(cross_val_score(GBR, X, y, scoring='neg_mean_squared_error', cv=5, n_jobs=-1))

CPU times: user 35.9 ms, sys: 22.8 ms, total: 58.7 ms
Wall time: 12.4 s


-324.75907470080375

In [16]:
def GBR_cv(learning_rate, n_estimators, min_samples_split, max_features, max_depth):
    val = np.mean(cross_val_score(GradientBoostingRegressor(learning_rate=learning_rate, 
                                                            n_estimators=int(n_estimators), 
                                                            min_samples_split=int(min_samples_split),
                                                            max_features=min(max_features, 0.999),
                                                            max_depth=int(max_depth), random_state=42),
                         X, y, scoring='neg_mean_squared_error', cv=5, n_jobs=-1))
    
    return val

In [17]:
# define Bayesian Optimazation
GBR_bo = BayesianOptimization( 
             GBR_cv,
             {'learning_rate': (0.01, 0.1),
              'n_estimators': (10, 250),
              'min_samples_split': (2, 25),
              'max_features': (0.1, 0.999),
              'max_depth': (5, 30)})

In [19]:
%%time

GBR_bo.maximize()

|   iter    |  target   | learni... | max_depth | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-317.6   [0m | [0m 0.08718 [0m | [0m 8.855   [0m | [0m 0.9865  [0m | [0m 17.37   [0m | [0m 109.7   [0m |
| [0m 2       [0m | [0m-319.2   [0m | [0m 0.03279 [0m | [0m 10.98   [0m | [0m 0.9272  [0m | [0m 10.04   [0m | [0m 118.9   [0m |
| [0m 3       [0m | [0m-357.8   [0m | [0m 0.06619 [0m | [0m 24.85   [0m | [0m 0.7723  [0m | [0m 7.659   [0m | [0m 106.5   [0m |
| [0m 4       [0m | [0m-337.6   [0m | [0m 0.06297 [0m | [0m 5.708   [0m | [0m 0.156   [0m | [0m 4.063   [0m | [0m 167.2   [0m |
| [0m 5       [0m | [0m-341.0   [0m | [0m 0.07739 [0m | [0m 14.83   [0m | [0m 0.6366  [0m | [0m 18.6    [0m | [0m 90.23   [0m |
| [0m 6       [0m | [0m-319.2   [0m | [0m 0.08453 [0m | [0m 9.583   [0m | [0m 0.4377  [0m | [0m 17.01   [0

In [21]:
GBM_Optimized = GradientBoostingRegressor(learning_rate=0.052, n_estimators=114, max_depth=8, max_features=0.3261, min_samples_split=9)
np.mean(cross_val_score(GBM_Optimized, X, y, scoring='neg_mean_squared_error', cv=5, n_jobs=-1))

-316.2296206209812

## Use full dataset to validate

In [22]:
train_df = pd.merge(left=pd.read_csv('data/processed/processed.csv'), 
                    right=pd.read_csv('data/raw/train_salaries.csv'), how='inner')
X = train_df.drop(['index', 'jobId', 'salary'], axis=1)
y = train_df['salary']

np.mean(cross_val_score(GBM_Optimized, X, y, scoring='neg_mean_squared_error', cv=5, n_jobs=-1))

-311.8021653654332

In [23]:
%%time
origin_GBM = GradientBoostingRegressor(n_estimators=100, max_depth=7)
np.mean(cross_val_score(origin_GBM, X, y, scoring='neg_mean_squared_error', cv=5, n_jobs=-1))

CPU times: user 167 ms, sys: 198 ms, total: 365 ms
Wall time: 8min 33s


-308.4384637200592

### Run the Bayesian Optimization again for Gradient Boosting Machine on Full Dataset

In [24]:
%%time

GBR_bo.maximize()

|   iter    |  target   | learni... | max_depth | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------
| [0m 31      [0m | [0m-748.8   [0m | [0m 0.01674 [0m | [0m 9.945   [0m | [0m 0.2231  [0m | [0m 15.25   [0m | [0m 35.22   [0m |
| [0m 32      [0m | [0m-353.5   [0m | [0m 0.04138 [0m | [0m 19.53   [0m | [0m 0.1813  [0m | [0m 9.367   [0m | [0m 56.44   [0m |
| [95m 33      [0m | [95m-312.3   [0m | [95m 0.04981 [0m | [95m 12.59   [0m | [95m 0.3711  [0m | [95m 13.84   [0m | [95m 74.76   [0m |
| [95m 34      [0m | [95m-310.5   [0m | [95m 0.06982 [0m | [95m 11.01   [0m | [95m 0.7724  [0m | [95m 14.58   [0m | [95m 104.0   [0m |
| [0m 35      [0m | [0m-365.3   [0m | [0m 0.04132 [0m | [0m 23.65   [0m | [0m 0.9186  [0m | [0m 7.623   [0m | [0m 68.36   [0m |
| [95m 36      [0m | [95m-308.4   [0m | [95m 0.0614  [0m | [95m 10.12   [0m | [95m 0.5852  [0m

KeyboardInterrupt: 

In [26]:
GBR_bo.max

{'target': -308.36028085451795,
 'params': {'learning_rate': 0.061403125042222846,
  'max_depth': 10.11678152909586,
  'max_features': 0.5852357853509725,
  'min_samples_split': 11.957661037092597,
  'n_estimators': 116.77488967659396}}

In [28]:
%%time
GBR_optimized = GradientBoostingRegressor(n_estimators=117, max_depth=10, learning_rate=0.0614, max_features=0.5852, min_samples_split=12)
np.mean(cross_val_score(GBR_optimized, X, y, scoring='neg_mean_squared_error', cv=5, n_jobs=-1))

CPU times: user 205 ms, sys: 208 ms, total: 413 ms
Wall time: 8min 45s


-308.4203781602086

In [29]:
model = GradientBoostingRegressor(n_estimators=117, 
                                  max_depth=10, 
                                  learning_rate=0.0614, 
                                  max_features=0.5852, min_samples_split=12)
model.fit(X, y)

GradientBoostingRegressor(learning_rate=0.0614, max_depth=10,
                          max_features=0.5852, min_samples_split=12,
                          n_estimators=117)

### Save the fine turning Gradient Boosting Regressor Model

In [30]:
import pickle

filename = 'models/GBMmodel.pkl'
with open(filename, 'wb') as file:
    pickle.dump(model, file)