## Santander Value Prediction Challenge
Predict the value of transactions for potential customers.
### Calculation

In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb
import xgboost as xgb

from mlxtend.regressor import StackingRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Integer

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, BaggingRegressor

In [6]:
data = pd.read_csv('dataset//train.csv', sep=',')
Y = data['target']
X_train = pd.read_csv('results//without_outliers_X_train.csv', sep=',')
X_test = pd.read_csv('results//without_outliers_X_test.csv', sep=',')

In [7]:
Y_train = np.log1p(Y)

### Select the most important features

In [8]:
%%time
def rmse(y, pred):
    return np.sqrt(np.mean(np.power(y - pred, 2)))

X1, X2, Y1, Y2 = train_test_split(X_train, Y_train, test_size=0.20, random_state=42)

rfRegressor = RandomForestRegressor(n_estimators=100, verbose=1, random_state=42)
rfRegressor.fit(X1, Y1)
print(rmse(Y2, rfRegressor.predict(X2)))

cols = pd.DataFrame({'importance': rfRegressor.feature_importances_, 'feature': X_train.columns}).sort_values(
    by=['importance'], ascending=[False])[:1000]['feature'].values
X_train = X_train[cols]
X_test = X_test[cols]

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 12.3min finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished


1.423839325790084
Wall time: 12min 23s


### Stacking regressors

In [9]:
%%time
def RMSE(Ypreds, Ytrue):
    result = np.sum(np.mean((Ytrue - Ypreds)**2))
    return result

adaRegressor = AdaBoostRegressor(n_estimators=300, random_state=42)
gbRegressor = GradientBoostingRegressor(n_estimators=300, random_state=42)
rfRegressor = RandomForestRegressor(n_estimators=100, random_state=42)
exTreeRegressor = ExtraTreesRegressor(n_estimators=10, random_state=42)
baggingRegressor = BaggingRegressor(n_estimators=20, random_state=42)
# ======================================================================================
stack = StackingRegressor(regressors=[adaRegressor, gbRegressor, rfRegressor, exTreeRegressor, baggingRegressor], 
                          meta_regressor=gbRegressor, verbose=1)
stack.fit(X_train, Y_train)

print("Log Mean Squared Error: %.12f" % RMSE(stack.predict(X_train), Y_train))

Y_stack_preds = np.expm1(stack.predict(X_test))

Fitting 5 regressors...
Fitting regressor1: adaboostregressor (1/5)
Fitting regressor2: gradientboostingregressor (2/5)
Fitting regressor3: randomforestregressor (3/5)
Fitting regressor4: extratreesregressor (4/5)
Fitting regressor5: baggingregressor (5/5)
Log Mean Squared Error: 0.002599675447
Wall time: 9min 1s


### lightGBM tuning

In [12]:
print(lgb.__version__)

estimator = lgb.LGBMRegressor(boosting_type='gbdt',  
                              n_estimators=300, objective=None, 
                              random_state=42, n_jobs=-1, silent=False)

space = {
    'learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'num_leaves': Integer(3, 100),      
    'max_depth': Integer(0, 50),
    'min_child_samples': Integer(0, 50),
    'max_bin': Integer(100, 1000),
    'subsample': Real(0.01, 1.0, 'uniform'),
    'subsample_freq': Integer(0, 10),
    'colsample_bytree': Real(0.01, 1.0, 'uniform'),
    'min_child_weight': Real(0, 10),
    'subsample_for_bin': Integer(100000, 500000),
    'reg_lambda': Real(1e-9, 1000, 'log-uniform'),
    'reg_alpha': Real(1e-9, 1.0, 'log-uniform'),
    'scale_pos_weight': Real(1e-6, 500, 'log-uniform'),
}
    
cv = KFold(n_splits=3, shuffle=True, random_state=42)

BayesGridCV = BayesSearchCV(estimator, space, scoring='neg_mean_squared_error', cv=cv, n_iter=10, verbose=1, refit=True, 
               random_state=42, return_train_score=True)

def print_status(result):
    print('Best RMSE: {}\nBest params: {}\n'.format(
        np.round(BayesGridCV.best_score_, 7),
        BayesGridCV.best_params_))
    
result = BayesGridCV.fit(X_train, Y_train, callback=print_status)

2.1.2
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   59.6s finished


Best RMSE: -2.7770052
Best params: {'colsample_bytree': 0.4160029192647807, 'learning_rate': 0.28539836866041823, 'max_bin': 940, 'max_depth': 16, 'min_child_samples': 34, 'min_child_weight': 4.141186324855385, 'num_leaves': 37, 'reg_alpha': 0.004524161584138917, 'reg_lambda': 4.5035991909114364e-06, 'scale_pos_weight': 0.4316379249903662, 'subsample': 0.5544643023916863, 'subsample_for_bin': 150734, 'subsample_freq': 2}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.3min finished


Best RMSE: -2.7770052
Best params: {'colsample_bytree': 0.4160029192647807, 'learning_rate': 0.28539836866041823, 'max_bin': 940, 'max_depth': 16, 'min_child_samples': 34, 'min_child_weight': 4.141186324855385, 'num_leaves': 37, 'reg_alpha': 0.004524161584138917, 'reg_lambda': 4.5035991909114364e-06, 'scale_pos_weight': 0.4316379249903662, 'subsample': 0.5544643023916863, 'subsample_for_bin': 150734, 'subsample_freq': 2}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.3min finished


Best RMSE: -2.7770052
Best params: {'colsample_bytree': 0.4160029192647807, 'learning_rate': 0.28539836866041823, 'max_bin': 940, 'max_depth': 16, 'min_child_samples': 34, 'min_child_weight': 4.141186324855385, 'num_leaves': 37, 'reg_alpha': 0.004524161584138917, 'reg_lambda': 4.5035991909114364e-06, 'scale_pos_weight': 0.4316379249903662, 'subsample': 0.5544643023916863, 'subsample_for_bin': 150734, 'subsample_freq': 2}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   47.0s finished


Best RMSE: -2.1481775
Best params: {'colsample_bytree': 0.8142720284737898, 'learning_rate': 0.022066991249460103, 'max_bin': 638, 'max_depth': 40, 'min_child_samples': 26, 'min_child_weight': 0.9545503921499345, 'num_leaves': 76, 'reg_alpha': 0.07139588474544915, 'reg_lambda': 87.7296381526307, 'scale_pos_weight': 0.002315903191638441, 'subsample': 0.34817978468161015, 'subsample_for_bin': 435012, 'subsample_freq': 6}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.2min finished


Best RMSE: -2.1481775
Best params: {'colsample_bytree': 0.8142720284737898, 'learning_rate': 0.022066991249460103, 'max_bin': 638, 'max_depth': 40, 'min_child_samples': 26, 'min_child_weight': 0.9545503921499345, 'num_leaves': 76, 'reg_alpha': 0.07139588474544915, 'reg_lambda': 87.7296381526307, 'scale_pos_weight': 0.002315903191638441, 'subsample': 0.34817978468161015, 'subsample_for_bin': 435012, 'subsample_freq': 6}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.1min finished


Best RMSE: -2.1481775
Best params: {'colsample_bytree': 0.8142720284737898, 'learning_rate': 0.022066991249460103, 'max_bin': 638, 'max_depth': 40, 'min_child_samples': 26, 'min_child_weight': 0.9545503921499345, 'num_leaves': 76, 'reg_alpha': 0.07139588474544915, 'reg_lambda': 87.7296381526307, 'scale_pos_weight': 0.002315903191638441, 'subsample': 0.34817978468161015, 'subsample_for_bin': 435012, 'subsample_freq': 6}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.3min finished


Best RMSE: -2.1481775
Best params: {'colsample_bytree': 0.8142720284737898, 'learning_rate': 0.022066991249460103, 'max_bin': 638, 'max_depth': 40, 'min_child_samples': 26, 'min_child_weight': 0.9545503921499345, 'num_leaves': 76, 'reg_alpha': 0.07139588474544915, 'reg_lambda': 87.7296381526307, 'scale_pos_weight': 0.002315903191638441, 'subsample': 0.34817978468161015, 'subsample_for_bin': 435012, 'subsample_freq': 6}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.9min finished


Best RMSE: -2.1481775
Best params: {'colsample_bytree': 0.8142720284737898, 'learning_rate': 0.022066991249460103, 'max_bin': 638, 'max_depth': 40, 'min_child_samples': 26, 'min_child_weight': 0.9545503921499345, 'num_leaves': 76, 'reg_alpha': 0.07139588474544915, 'reg_lambda': 87.7296381526307, 'scale_pos_weight': 0.002315903191638441, 'subsample': 0.34817978468161015, 'subsample_for_bin': 435012, 'subsample_freq': 6}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   19.7s finished


Best RMSE: -2.1481775
Best params: {'colsample_bytree': 0.8142720284737898, 'learning_rate': 0.022066991249460103, 'max_bin': 638, 'max_depth': 40, 'min_child_samples': 26, 'min_child_weight': 0.9545503921499345, 'num_leaves': 76, 'reg_alpha': 0.07139588474544915, 'reg_lambda': 87.7296381526307, 'scale_pos_weight': 0.002315903191638441, 'subsample': 0.34817978468161015, 'subsample_for_bin': 435012, 'subsample_freq': 6}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   23.6s finished


Best RMSE: -2.1481775
Best params: {'colsample_bytree': 0.8142720284737898, 'learning_rate': 0.022066991249460103, 'max_bin': 638, 'max_depth': 40, 'min_child_samples': 26, 'min_child_weight': 0.9545503921499345, 'num_leaves': 76, 'reg_alpha': 0.07139588474544915, 'reg_lambda': 87.7296381526307, 'scale_pos_weight': 0.002315903191638441, 'subsample': 0.34817978468161015, 'subsample_for_bin': 435012, 'subsample_freq': 6}



In [18]:
%%time

print(lgb.__version__)
dtrain = lgb.Dataset(data=X_train, label=Y_train, free_raw_data=False)
dtrain.construct()            

lgb_params = {
    'objective': 'regression',
    'num_leaves': 76,
    'subsample': 0.34817978468161015,
    'subsample_for_bin': 435012,
    'subsample_freq': 6,
    'colsample_bytree': 0.8142720284737898,
#     'min_split_gain': 0.7453280360438532,
    'reg_alpha': 0.07139588474544915,
    'reg_lambda': 87.7296381526307,
    'min_child_weight': 0.9545503921499345,
    'min_child_samples': 26,    
    'verbose': 1,
    'seed': 42,
    'boosting_type': 'gbdt',
    'max_bin': 638,
    'max_depth': 40,
    'learning_rate': 0.022066991249460103,
    'scale_pos_weight': 0.002315903191638441,
    'metric': 'rmse',
}

model = lgb.train(lgb_params, dtrain, 1000)
print("Mean Squared Error: %.12f" % mean_squared_error(model.predict(X_train, num_iteration=-1), Y_train))

Y_lgb_preds = np.expm1(model.predict(X_test, num_iteration=-1))

2.1.2
Mean Squared Error: 1.119388265702
Wall time: 1min 52s


### XGboost tuning

In [14]:
%%time
print(xgb.__version__)

estimator = xgb.XGBRegressor(n_estimators=300, silent=False, objective='reg:linear', booster='gbtree', n_jobs=1, random_state=42)
    
space = {'max_depth' : Integer(3, 20), 
         'learning_rate' :Real(0.01, 1.0, 'log-uniform'), 
         'gamma' : Real(1e-9, 0.5, 'log-uniform'), 
         'min_child_weight' : Integer(0, 10), 
         'max_delta_step' : Integer(0, 10),
         'subsample' : Real(0.01, 1.0, 'uniform'), 
         'colsample_bytree' : Real(0.01, 1.0, 'uniform'), 
         'colsample_bylevel' : Real(0.01, 1.0, 'uniform'), 
         'reg_alpha' : Real(1e-9, 1.0, 'log-uniform'), 
         'reg_lambda' : Real(1e-9, 1000, 'log-uniform'),
         'scale_pos_weight' : Real(1e-6, 500, 'log-uniform')
        } 

cv = KFold(n_splits=3, shuffle=True, random_state=42)

BayesGridCV = BayesSearchCV(estimator, space, scoring='neg_mean_squared_error', cv=cv, n_iter=10, verbose=1, refit=True, 
               random_state=42, return_train_score=True)

def print_status(result):
    print('Best RMSE: {}\nBest params: {}\n'.format(
        np.round(BayesGridCV.best_score_, 7),
        BayesGridCV.best_params_))
    
result = BayesGridCV.fit(X_train, Y_train, callback=print_status)

0.72
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   14.1s finished


Best RMSE: -198.7919227
Best params: {'colsample_bylevel': 0.4160029192647807, 'colsample_bytree': 0.7304484857455519, 'gamma': 0.13031389926541354, 'learning_rate': 0.042815319280763466, 'max_delta_step': 7, 'max_depth': 10, 'min_child_weight': 4, 'reg_alpha': 0.004524161584138917, 'reg_lambda': 4.5035991909114364e-06, 'scale_pos_weight': 0.4316379249903662, 'subsample': 0.5544643023916863}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.7min finished


Best RMSE: -5.5075192
Best params: {'colsample_bylevel': 0.8390144719977516, 'colsample_bytree': 0.8844821246070537, 'gamma': 4.358684608480795e-07, 'learning_rate': 0.7988179462781242, 'max_delta_step': 9, 'max_depth': 4, 'min_child_weight': 1, 'reg_alpha': 1.521551227197179e-06, 'reg_lambda': 0.042535272805117035, 'scale_pos_weight': 196.9224481160877, 'subsample': 0.6336020558163782}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.3min finished


Best RMSE: -2.1399253
Best params: {'colsample_bylevel': 0.4503841871781403, 'colsample_bytree': 0.9195352964526833, 'gamma': 8.168958221061441e-09, 'learning_rate': 0.07356404539935663, 'max_delta_step': 2, 'max_depth': 11, 'min_child_weight': 2, 'reg_alpha': 0.005807280212192344, 'reg_lambda': 0.004876240041754427, 'scale_pos_weight': 292.346774761682, 'subsample': 0.7064328557952411}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   12.9s finished


Best RMSE: -2.1399253
Best params: {'colsample_bylevel': 0.4503841871781403, 'colsample_bytree': 0.9195352964526833, 'gamma': 8.168958221061441e-09, 'learning_rate': 0.07356404539935663, 'max_delta_step': 2, 'max_depth': 11, 'min_child_weight': 2, 'reg_alpha': 0.005807280212192344, 'reg_lambda': 0.004876240041754427, 'scale_pos_weight': 292.346774761682, 'subsample': 0.7064328557952411}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   15.0s finished


Best RMSE: -2.1399253
Best params: {'colsample_bylevel': 0.4503841871781403, 'colsample_bytree': 0.9195352964526833, 'gamma': 8.168958221061441e-09, 'learning_rate': 0.07356404539935663, 'max_delta_step': 2, 'max_depth': 11, 'min_child_weight': 2, 'reg_alpha': 0.005807280212192344, 'reg_lambda': 0.004876240041754427, 'scale_pos_weight': 292.346774761682, 'subsample': 0.7064328557952411}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   17.4s finished


Best RMSE: -2.1399253
Best params: {'colsample_bylevel': 0.4503841871781403, 'colsample_bytree': 0.9195352964526833, 'gamma': 8.168958221061441e-09, 'learning_rate': 0.07356404539935663, 'max_delta_step': 2, 'max_depth': 11, 'min_child_weight': 2, 'reg_alpha': 0.005807280212192344, 'reg_lambda': 0.004876240041754427, 'scale_pos_weight': 292.346774761682, 'subsample': 0.7064328557952411}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   15.6s finished


Best RMSE: -2.1399253
Best params: {'colsample_bylevel': 0.4503841871781403, 'colsample_bytree': 0.9195352964526833, 'gamma': 8.168958221061441e-09, 'learning_rate': 0.07356404539935663, 'max_delta_step': 2, 'max_depth': 11, 'min_child_weight': 2, 'reg_alpha': 0.005807280212192344, 'reg_lambda': 0.004876240041754427, 'scale_pos_weight': 292.346774761682, 'subsample': 0.7064328557952411}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.0min finished


Best RMSE: -2.1399253
Best params: {'colsample_bylevel': 0.4503841871781403, 'colsample_bytree': 0.9195352964526833, 'gamma': 8.168958221061441e-09, 'learning_rate': 0.07356404539935663, 'max_delta_step': 2, 'max_depth': 11, 'min_child_weight': 2, 'reg_alpha': 0.005807280212192344, 'reg_lambda': 0.004876240041754427, 'scale_pos_weight': 292.346774761682, 'subsample': 0.7064328557952411}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   19.6s finished


Best RMSE: -2.1399253
Best params: {'colsample_bylevel': 0.4503841871781403, 'colsample_bytree': 0.9195352964526833, 'gamma': 8.168958221061441e-09, 'learning_rate': 0.07356404539935663, 'max_delta_step': 2, 'max_depth': 11, 'min_child_weight': 2, 'reg_alpha': 0.005807280212192344, 'reg_lambda': 0.004876240041754427, 'scale_pos_weight': 292.346774761682, 'subsample': 0.7064328557952411}

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   18.9s finished


Best RMSE: -2.1399253
Best params: {'colsample_bylevel': 0.4503841871781403, 'colsample_bytree': 0.9195352964526833, 'gamma': 8.168958221061441e-09, 'learning_rate': 0.07356404539935663, 'max_delta_step': 2, 'max_depth': 11, 'min_child_weight': 2, 'reg_alpha': 0.005807280212192344, 'reg_lambda': 0.004876240041754427, 'scale_pos_weight': 292.346774761682, 'subsample': 0.7064328557952411}

Wall time: 8min 30s


In [15]:
%%time
print(xgb.__version__)

params = {'colsample_bylevel': 0.4503841871781403, 
          'colsample_bytree': 0.9195352964526833, 
          'gamma': 8.168958221061441e-09, 
          'learning_rate': 0.07356404539935663, 
          'max_delta_step': 2, 
          'max_depth': 11, 
          'min_child_weight': 2, 
          'reg_alpha': 0.005807280212192344, 
          'reg_lambda': 0.004876240041754427, 
          'scale_pos_weight': 292.346774761682, 
          'subsample': 0.7064328557952411,
          'silent' : False, 
          'objective' : 'reg:linear',
          'eval_metric': 'rmse',
          'booster' : 'gbtree', 
          'n_jobs' : 1, 
          'random_state' : 42
         }

dtrain = xgb.DMatrix(X_train, label=Y_train)
model_xgb = xgb.train(params, dtrain, 1000)

print("Mean squared error", mean_squared_error(model_xgb.predict(dtrain, ntree_limit=-1), Y_train))
dtest = xgb.DMatrix(X_test)
Y_predict_xgb = np.expm1(model_xgb.predict(dtest, ntree_limit=-1))

0.72
Mean squared error 0.023820462437301456
Wall time: 5min 13s


In [17]:
Y_predict_avr = (Y_stack_preds + Y_lgb_preds + Y_predict_xgb) / 3
submission = pd.read_csv('dataset//sample_submission.csv')
submission["target"] = Y_predict_avr
submission.to_csv('results//submission.csv', index=False)