# Model Selection

***
**Author:** Jiacheng

**Create Time:**  2020-01-15

**Update Time:**  2020-01-15
***

In [1]:
# 导入所需的库
from __future__ import print_function
import numpy as np
import lightgbm as lgb
import sklearn
import hyperopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import colorama
from sklearn.model_selection import KFold
import pandas as pd

## 初始化

In [2]:
N_HYPEROPT_PROBES = 500
HYPEROPT_ALGO = tpe.suggest  #  tpe.suggest OR hyperopt.rand.suggest

# ----------------------------------------------------------

colorama.init()

obj_call_count = 0
cur_best_score = 0 # 0 or np.inf
log_writer = open( './log/lgb-hyperopt-log.txt', 'w' )

## 参数指定

In [3]:
def get_lgb_params(space):
    lgb_params = dict()
    lgb_params['boosting_type'] = space['boosting_type'] if 'boosting_type' in space else 'gbdt'
    lgb_params['objective'] = 'regression'
    lgb_params['metric'] = 'rmse'
    lgb_params['learning_rate'] = space['learning_rate']
    lgb_params['num_leaves'] = int(space['num_leaves'])
    lgb_params['min_data_in_leaf'] = int(space['min_data_in_leaf'])
    lgb_params['min_sum_hessian_in_leaf'] = space['min_sum_hessian_in_leaf']
    lgb_params['max_depth'] = -1
    lgb_params['lambda_l1'] = space['lambda_l1'] if 'lambda_l1' in space else 0.0
    lgb_params['lambda_l2'] = space['lambda_l2'] if 'lambda_l2' in space else 0.0
    lgb_params['max_bin'] = int(space['max_bin']) if 'max_bin' in space else 256
    lgb_params['feature_fraction'] = space['feature_fraction']
    lgb_params['bagging_fraction'] = space['bagging_fraction']
    lgb_params['bagging_freq'] = int(space['bagging_freq']) if 'bagging_freq' in space else 1
    lgb_params['nthread'] = 4
    return lgb_params

## 筛选

In [4]:
# 导入数据
X_train = pd.read_csv('./Data_Set/X_Train_Data.csv')
Y_train = pd.read_csv('./Data_Set/Y_Train_Data.csv')

In [5]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,Bath,Hall,Room,Room_Bath,all_SchoolNum,all_hospitalNum,all_mall,area,buildYear,...,landMeanPrice_buildYear_cluster_mean,totalWorkers_buildYear_cluster_mean,newWorkers_buildYear_cluster_mean,residentPopulation_buildYear_cluster_mean,lookNum_buildYear_cluster_mean,trainsportNum_buildYear_cluster_mean,all_SchoolNum_buildYear_cluster_mean,all_hospitalNum_buildYear_cluster_mean,all_mall_buildYear_cluster_mean,otherNum_buildYear_cluster_mean
0,0,1,1,2,0.666667,0.737932,0.668656,0.743777,68.06,1953,...,0.0,193780.461538,7624.692308,241618.461538,3.538462,6.2379,3.651113,2.966328,1.499067,4.199459
1,1,2,2,3,0.75,0.537518,0.926687,1.250888,125.55,2007,...,790.101461,47519.215247,4568.466368,301078.412556,1.930493,5.227899,3.49143,3.114608,1.821501,4.667435
2,2,2,2,3,0.75,2.557156,1.799543,0.999678,132.0,1994,...,464.32932,59865.646658,2442.982054,344455.095297,1.292079,7.81278,3.665908,3.762189,2.409555,6.326753
3,3,1,1,1,1.0,4.099596,5.12562,1.727274,57.0,1994,...,464.32932,59865.646658,2442.982054,344455.095297,1.292079,7.81278,3.665908,3.762189,2.409555,6.326753
4,4,3,2,3,1.0,0.310106,0.361244,0.727124,129.0,1994,...,464.32932,59865.646658,2442.982054,344455.095297,1.292079,7.81278,3.665908,3.762189,2.409555,6.326753


In [6]:
Y_train.head()

Unnamed: 0.1,Unnamed: 0,tradeMoney
0,0,2000.0
1,1,2000.0
2,2,16000.0
3,3,1600.0
4,4,2900.0


In [7]:
X_train.drop('Unnamed: 0',axis=1, inplace=True)
Y_train.drop('Unnamed: 0',axis=1, inplace=True)

In [8]:
X_train.shape

(40160, 167)

In [9]:
Y_train.shape

(40160, 1)

In [10]:
def objective(space):
    global obj_call_count, cur_best_score

    obj_call_count += 1

    print('\nLightGBM objective call #{} cur_best_score={:7.5f}'.format(obj_call_count,cur_best_score) )

    lgb_params = get_lgb_params(space)

    sorted_params = sorted(space.items(), key=lambda z: z[0])
    params_str = str.join(' ', ['{}={}'.format(k, v) for k, v in sorted_params])
    print('Params: {}'.format(params_str) )
    
    kf = KFold(n_splits=3, shuffle=True, random_state=0)
    out_of_fold = np.zeros(len(X_train))
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
        D_train = lgb.Dataset(X_train.iloc[train_idx], label=Y_train[train_idx])
        D_val = lgb.Dataset(X_train.iloc[val_idx], label=Y_train[val_idx])
        # Train
        num_round = 10000
        clf = lgb.train(lgb_params,
                           D_train,
                           num_boost_round=num_round,
                           # metrics='mlogloss',
                           valid_sets=D_val,
                           # valid_names='val',
                           # fobj=None,
                           # feval=None,
                           # init_model=None,
                           # feature_name='auto',
                           # categorical_feature='auto',
                           early_stopping_rounds=200,
                           # evals_result=None,
                           verbose_eval=False,
                           # learning_rates=None,
                           # keep_training_booster=False,
                           # callbacks=None
                           )
        # predict
        nb_trees = clf.best_iteration
        val_loss = clf.best_score['valid_0']
        print('nb_trees={} val_loss={}'.format(nb_trees, val_loss))
        out_of_fold[val_idx] = clf.predict(X_train.iloc[val_idx], num_iteration=nb_trees)
        score = r2_score(out_of_fold, Y_train)

    print('val_r2_score={}'.format(score))

    log_writer.write('score={} Params:{} nb_trees={}\n'.format(score, params_str, nb_trees ))
    log_writer.flush()

    if score>cur_best_score:
        cur_best_score = score
        print(colorama.Fore.GREEN + 'NEW BEST SCORE={}'.format(cur_best_score) + colorama.Fore.RESET)
    return {'loss': -score, 'status': STATUS_OK}

## 主函数

In [11]:
space ={
        'num_leaves': hp.quniform ('num_leaves', 10, 100, 1),
        'min_data_in_leaf':  hp.quniform ('min_data_in_leaf', 10, 100, 1),
        'feature_fraction': hp.uniform('feature_fraction', 0.75, 1.0),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.75, 1.0),
        'learning_rate': hp.uniform('learning_rate', 0, 0.01),
#         'learning_rate': hp.loguniform('learning_rate', -5.0, -2.3),
        'min_sum_hessian_in_leaf': hp.loguniform('min_sum_hessian_in_leaf', 0, 2.3),
        'max_bin': hp.quniform ('max_bin', 88, 200, 1),
        'bagging_freq': hp.quniform ('bagging_freq', 1, 15, 1),
        'lambda_l1': hp.uniform('lambda_l1', 0, 10 ),
        'lambda_l2': hp.uniform('lambda_l2', 0, 10 ),
       }

In [12]:
trials = Trials()
best = hyperopt.fmin(fn=objective,
                     space=space,
                     algo=HYPEROPT_ALGO,
                     max_evals=N_HYPEROPT_PROBES,
                     trials=trials,
                     verbose=1)

print('-'*50)
print('The best params:')
print( best )
print('\n\n')


LightGBM objective call #1 cur_best_score=0.00000
Params: bagging_fraction=0.7822354392126312 bagging_freq=3.0 feature_fraction=0.9978968397776449 lambda_l1=9.182167566109555 lambda_l2=3.564770643400561 learning_rate=0.0037696831227920337 max_bin=148.0 min_data_in_leaf=64.0 min_sum_hessian_in_leaf=9.293793877184154 num_leaves=84.0


KeyError: "None of [Int64Index([    0,     1,     2,     3,     5,     8,     9,    10,    11,\n               12,\n            ...\n            40142, 40143, 40144, 40146, 40147, 40148, 40152, 40154, 40155,\n            40157],\n           dtype='int64', length=26773)] are in the [columns]"