* 使用 [BayesianOptimization](https://github.com/fmfn/BayesianOptimization) 来寻找最佳权重

In [89]:
import pandas as pd
import glob
import numpy as np
import scipy as sp
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from bayes_opt import BayesianOptimization
from tqdm import tqdm
import matplotlib.pyplot as plt
import xgboost as xgb
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [6]:
train_data = pd.read_csv('../raw/train_data.csv')
test_data = pd.read_csv('../raw/test_data.csv')

In [9]:
# 去重
train_data.drop_duplicates(train_data.columns.drop('ID'), keep='first', inplace=True)
len(train_data)

8918

In [97]:
def generate_train_data(train_data, test_data, poly=False, select=False):

    y = train_data['发电量']
    X = train_data.drop(['发电量','ID'], axis=1)
    sub_data = test_data.drop(['ID'], axis=1)

    if poly:
        from sklearn.preprocessing import PolynomialFeatures
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        X = poly.fit_transform(X)
        sub_data = poly.transform(sub_data)
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    if select:
        from sklearn.feature_selection import SelectFromModel
        sm = SelectFromModel(GradientBoostingRegressor(random_state=2))
        X_train = sm.fit_transform(X_train, y_train)
        X_test = sm.transform(X_test)
        sub_data = sm.transform(sub_data)
        
    return X_train, X_test, y_train, y_test, sub_data

In [158]:
X_train, X_test, y_train, y_test, sub_data = generate_train_data(train_data, test_data, poly=True, select=False)
print(X_train.shape, sub_data.shape)

(7134, 191) (8409, 191)


In [35]:
xgtrain = xgb.DMatrix(X, label=y)

In [36]:
def xgb_evaluate(min_child_weight,
                 colsample_bytree,
                 max_depth,
                 subsample,
                 gamma,
                 alpha):

    params['min_child_weight'] = int(min_child_weight)
    params['cosample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(round(max_depth))
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)


    cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5,
             seed=random_state,
             callbacks=[xgb.callback.early_stop(200)])
    
    rmse = cv_result['test-rmse-mean'].values[-1]
    score = 1 / (1 + rmse)
    return score

In [37]:
if __name__ == '__main__':
    num_rounds = 4000
    random_state = 42
    num_iter = 25
    init_points = 5
    params = {
        'eta': 0.1,
        'silent': 1,
        'objective': 'reg:linear',
        'booster': 'gbtree',
        'eval_metric': 'rmse',
        'verbose_eval': True,
        'seed': random_state,
        'tree_method': 'exact'
    }
    
    xgbBO = BayesianOptimization(xgb_evaluate, {'min_child_weight': (1, 20),
                                                'colsample_bytree': (0.1, 1),
                                                'max_depth': (5, 8.99),
                                                'subsample': (0.5, 1),
                                                'gamma': (0, 10),
                                                'alpha': (0, 10),
                                                })

    xgbBO.maximize(init_points=init_points, n_iter=num_iter)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |     alpha |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 200 rounds.
Stopping. Best iteration:
[197]	train-rmse:0.195878+0.0123012	test-rmse:0.208239+0.0411462

    1 | 00m32s | [35m   0.82765[0m | [32m   5.7627[0m | [32m            0.8816[0m | [32m   8.9251[0m | [32m     8.9170[0m | [32m            4.7818[0m | [32m     0.9867[0m | 
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 200 rounds.
Stopping. Best iteration:
[687]	train-rmse:0.207072+0.0112054	test-rmse:0.217101+0.04095

    2 | 00m50s |    0.82162 |    4.8094 |             0.1524

Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 200 rounds.
Stopping. Best iteration:
[687]	train-rmse:0.200055+0.0109885	test-rmse:0.210511+0.0413763

   22 | 01m21s |    0.82610 |    0.0192 |             0.8362 |    9.9901 |      8.8984 |            19.6094 |      0.5883 | 
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 200 rounds.
Stopping. Best iteration:
[1424]	train-rmse:0.16861+0.00461573	test-rmse:0.207266+0.0414517

   23 | 02m15s |    0.82832 |    0.0844 |             0.1037 |    9.9200 |      8.8899 |             1.3355 |      0.6070 | 
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 200 rounds.
   24 | 02m06s |    0.85228 |    9.5270 |             0.2431 |    0.0244 |      5.7202 |             2.1252 |      0.5160 | 
Multiple

In [None]:
Step |   Time |      Value |     alpha |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
30   | 02m31s |    0.86199 |    3.1606 |             0.4358 |    0.0084 |      5.3738 |            19.6339 |      0.9666 |
12   | 00m50s |    0.86102 |    0.2264 |             0.1635 |    0.0172 |      5.6864 |            18.6665 |      0.5565 |
17   | 01m53s |    0.85872 |    0.3320 |             1.0000 |    0.0000 |      8.1248 |             1.0000 |      0.5000 | 

## 调参 lightgbm

In [159]:
import lightgbm as lgb

In [160]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test)

In [161]:
def lgb_eval(num_leaves,
             min_data,
             sub_feature
            ):

    params['num_leaves'] = int(round(num_leaves))
    params['min_data'] = int(min_data)
    params['sub_feature'] = max(min(sub_feature, 1), 0)
    
    gbm = lgb.train(params, lgb_train, 4000)
    
    pred = gbm.predict(X_test)
    rmsetmp = sp.sqrt(sp.mean((y_test - pred) ** 2))
    score = 1 / (1 + rmsetmp)
    return score

In [162]:
lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (20, 200),
                                        'min_data': (10, 80),
                                        'sub_feature': (0.3, 1)
                                       })

In [163]:
if __name__ == '__main__':
    num_rounds = 4000
    random_state = 42
    num_iter = 25
    init_points = 5
    params = {
        'eta': 0.1,
        'learning_rate': 0.002,
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'min_hessian': 1,
        'verbose': -1
    }

In [164]:
lgbBO.maximize(init_points=init_points, n_iter=num_iter)

[31mInitialization[0m
[94m-------------------------------------------------------------------------[0m
 Step |   Time |      Value |   min_data |   num_leaves |   sub_feature | 
    1 | 02m03s | [35m   0.89883[0m | [32m   62.4191[0m | [32m    158.5493[0m | [32m       0.6738[0m | 
    2 | 02m08s |    0.89798 |    67.4945 |     194.8964 |        0.8054 | 
    3 | 03m01s | [35m   0.89926[0m | [32m   42.4281[0m | [32m    135.4756[0m | [32m       0.6116[0m | 
    4 | 02m09s | [35m   0.89967[0m | [32m   53.3468[0m | [32m    162.9040[0m | [32m       0.3697[0m | 
    5 | 03m16s |    0.89698 |    19.8272 |     118.3053 |        0.7556 | 
[31mBayesian Optimization[0m
[94m-------------------------------------------------------------------------[0m
 Step |   Time |      Value |   min_data |   num_leaves |   sub_feature | 
    6 | 00m41s |    0.89400 |    79.7722 |      20.2748 |        0.4218 | 
    7 | 06m06s |    0.89175 |    10.2518 |     198.8416 |        0.9781

## GBRT

In [140]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.cross_validation import cross_val_score

In [156]:
def gbrt_evaluate(n_estimators,
                 max_depth):
    
    est = GradientBoostingRegressor(n_estimators=int(n_estimators),
                                max_depth=int(max_depth),
                                random_state=42,
                                loss='ls',
                                learning_rate = 0.1
        ).fit(X_train, y_train)
    
    prediction3 = est.predict(X_test)
    rmse = sp.sqrt(sp.mean((y_test - prediction3) ** 2))
    score = 1 / (1 + rmse)
    return score

In [157]:
if __name__ == '__main__':
    random_state = 42
    num_iter = 25
    init_points = 5
    
    gbrtBO = BayesianOptimization(gbrt_evaluate, {'n_estimators': (100, 10000),
                                                  'max_depth': (5, 10)
                                                })

    gbrtBO.maximize(init_points=init_points, n_iter=num_iter)

[31mInitialization[0m
[94m------------------------------------------------------------[0m
 Step |   Time |      Value |   max_depth |   n_estimators | 
    1 | 04m17s | [35m   0.88928[0m | [32m     5.2015[0m | [32m     5611.3857[0m | 
    2 | 08m30s |    0.88767 |      9.8151 |      7358.5853 | 
    3 | 05m36s |    0.88928 |      5.3633 |      7559.6278 | 
    4 | 07m02s | [35m   0.89188[0m | [32m     6.3352[0m | [32m     8551.1300[0m | 
    5 | 05m42s |    0.88928 |      5.2315 |      7945.1096 | 
[31mBayesian Optimization[0m
[94m------------------------------------------------------------[0m
 Step |   Time |      Value |   max_depth |   n_estimators | 
    6 | 00m11s |    0.88967 |      6.5478 |       100.0683 | 
    7 | 02m03s |    0.88927 |      5.0103 |      2344.9942 | 
    8 | 06m42s |    0.88928 |      5.1478 |      9999.7556 | 
    9 | 03m30s |    0.88928 |      5.0034 |      3893.6679 | 
   10 | 01m09s |    0.88914 |      5.0059 |      1033.2178 | 
   11 |

KeyboardInterrupt: 