In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import scipy as sp
from bayes_opt import BayesianOptimization

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import GradientBoostingRegressor

# PolynomialFeatures

In [19]:
# Load the raw training and test tables; paths are relative to the notebook's
# working directory.
train_data=pd.read_csv('train_data_li.csv')
test_data=pd.read_csv('test_data_li.csv')

In [6]:
def generate_train_data(train_data, test_data, poly=False, select=False,
                        test_size=0.0, random_state=42):
    """Turn the raw train/test tables into model-ready feature matrices.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Needs a ``'power_generation'`` target column and an ``'ID'`` column;
        every remaining column is treated as a feature.
    test_data : pandas.DataFrame
        Needs an ``'ID'`` column; every remaining column is treated as a feature.
    poly : bool, default False
        If true, expand both feature sets with degree-3, interaction-only
        ``PolynomialFeatures`` fitted on the training features.
    select : bool, default False
        If true, keep only the columns picked by ``SelectFromModel`` wrapped
        around ``GradientBoostingRegressor(random_state=2)``, fitted on the
        training split.
    test_size : float or int, default 0.0
        Hold-out size. ``0`` keeps every row for training and returns an empty
        hold-out; any other value is forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed for the row shuffle / split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, sub_data)`` where ``sub_data`` is
        the feature matrix derived from ``test_data``.
    """

    def take_rows(data, positions):
        # pandas objects are indexed positionally through .iloc, arrays directly.
        return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]

    y = train_data['power_generation']
    X = train_data.drop(['power_generation', 'ID'], axis=1)
    sub_data = test_data.drop(['ID'], axis=1)

    if poly:
        from sklearn.preprocessing import PolynomialFeatures
        # Separate name so the boolean ``poly`` flag is not overwritten.
        expander = PolynomialFeatures(degree=3, interaction_only=True)
        X = expander.fit_transform(X)
        sub_data = expander.transform(sub_data)

    if test_size == 0:
        # Current scikit-learn rejects test_size=0.0, so shuffle by hand and
        # hand back an empty hold-out.
        # NOTE(review): intended to reproduce the "shuffle everything into the
        # training split" result of older releases - confirm if the exact row
        # order matters downstream.
        order = np.random.RandomState(random_state).permutation(len(y))
        nothing = order[:0]
        X_train, X_test = take_rows(X, order), take_rows(X, nothing)
        y_train, y_test = take_rows(y, order), take_rows(y, nothing)
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)

    if select:
        from sklearn.feature_selection import SelectFromModel
        sm = SelectFromModel(GradientBoostingRegressor(random_state=2))
        X_train = sm.fit_transform(X_train, y_train)
        if len(y_test):
            X_test = sm.transform(X_test)
        else:
            # transform() refuses zero-row input; apply the column mask directly
            # so the empty hold-out keeps the same column count as X_train.
            X_test = np.asarray(X_test)[:, sm.get_support()]
        sub_data = sm.transform(sub_data)

    return X_train, X_test, y_train, y_test, sub_data

In [7]:
# Build the matrices with polynomial expansion on and model-based feature
# selection off, then print the shapes of the training and submission matrices.
X_train, X_test, y_train, y_test, sub_data = generate_train_data(train_data, test_data, poly=True, select=False)
print(X_train.shape, sub_data.shape)

(9000, 1160) (8409, 1160)


In [9]:
# Use Bayesian optimisation to find the best parameters for generating pseudo-labels.
# Wrap the training and hold-out splits as LightGBM datasets.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test)

In [53]:
def lgb_eval(num_leaves,
             min_data,
             sub_feature,
             num_boost_round=4000
            ):
    """Objective for the Bayesian search: train LightGBM, score the hold-out.

    Reads the notebook-level ``params`` (base settings), ``lgb_train``
    (training Dataset) and ``X_test`` / ``y_test`` (hold-out split).

    Parameters
    ----------
    num_leaves : float
        Proposed leaf count; rounded to the nearest integer.
    min_data : float
        Proposed ``min_data`` value; truncated to an integer.
    sub_feature : float
        Proposed ``sub_feature`` value; clamped to [0, 1].
    num_boost_round : int, default 4000
        Number of boosting rounds for each trial.

    Returns
    -------
    float
        ``1 / (1 + RMSE)`` on the hold-out split, so larger is better.

    Raises
    ------
    ValueError
        If the hold-out split is empty, since no score can be computed.
    """
    if len(y_test) == 0:
        raise ValueError("lgb_eval needs a non-empty hold-out split (X_test / y_test)")

    # Copy the base settings so a trial never leaves its values behind in the
    # notebook-level dict.
    trial_params = dict(params)
    trial_params['num_leaves'] = int(round(num_leaves))
    trial_params['min_data'] = int(min_data)
    trial_params['sub_feature'] = max(min(sub_feature, 1), 0)

    # NOTE(review): lgb_train is shared across trials while min_data changes;
    # newer LightGBM releases may reject that unless the Dataset was built with
    # feature_pre_filter=False - confirm against the installed version.
    gbm = lgb.train(trial_params, lgb_train, num_boost_round)

    pred = gbm.predict(X_test)
    # NumPy directly: the scipy.sqrt / scipy.mean aliases are deprecated and
    # absent from recent SciPy releases.
    rmse = np.sqrt(np.mean((y_test - pred) ** 2))
    return 1 / (1 + rmse)

In [54]:
# (lower, upper) search range for each setting the objective accepts.
search_bounds = {
    'num_leaves': (20, 200),
    'min_data': (10, 80),
    'sub_feature': (0.3, 1),
}
lgbBO = BayesianOptimization(lgb_eval, search_bounds)

In [55]:
# Settings for the Bayesian search and the base LightGBM parameters that the
# objective function extends on every trial.
if __name__ == '__main__':
    # NOTE(review): num_rounds and random_state are not read by any cell shown
    # in this notebook - confirm whether they are still needed.
    num_rounds = 4000
    random_state = 42
    num_iter = 25      # passed as n_iter to lgbBO.maximize
    init_points = 5    # passed as init_points to lgbBO.maximize
    # NOTE(review): LightGBM documents 'eta' as an alias of 'learning_rate';
    # with both present only one value can take effect - confirm which one and
    # drop the other.
    params = {
        'eta': 0.1,
        'learning_rate': 0.002,
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'min_hessian': 1,
        'verbose': -1
    }

In [None]:
# Run the search: init_points initialisation probes, then n_iter optimisation steps.
lgbBO.maximize(init_points=init_points, n_iter=num_iter)

In [None]:
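# Best parameters found (row copied from the optimisation log):
# Step | Time   |      Value |   min_data |   num_leaves |   sub_feature |
#   13 | 07m29s |    0.89206 |    78.1296 |     146.5010 |        0.8468 |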

In [10]:
# Set the parameters: base settings first, then the values taken from the
# best step of the search.
params = {
    'eta': 0.1,
    'learning_rate': 0.002,
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'min_hessian': 1,
    'verbose': -1,
}
# NOTE(review): lgb_eval rounds num_leaves, so the logged 146.5010 was
# evaluated as 147 during the search - confirm 146 is intended here.
params.update({
    'num_leaves': 146,
    'min_data': 78,
    'sub_feature': 0.8468,
})
gbm = lgb.train(params, train_set=lgb_train, num_boost_round=4000)

In [11]:
# Predict the target for the submission features; these predictions are used
# as pseudo-labels in the following cells.
pred = gbm.predict(sub_data)

In [48]:
# Attach the predictions to the test table as its 'power_generation' column.
test_data['power_generation']=pred

In [49]:

# Stack the labelled training rows on top of the pseudo-labelled test rows.
train_test=pd.concat([train_data,test_data],axis=0)

In [50]:
# Renumber the stacked rows 0..n-1 and discard the old per-table row labels.
train_test = train_test.reset_index(drop=True)

In [41]:
# Later steps need the test data without the 'power_generation' column, so
# reload it from disk.
test_data=pd.read_csv('test_data_li.csv')

In [42]:
# Rebuild the matrices, this time from the combined (train + pseudo-labelled)
# table, and print the resulting shapes.
X_train, X_test, y_train, y_test, sub_data = generate_train_data(train_test, test_data, poly=True, select=False)
print(X_train.shape, sub_data.shape)

(17409, 1160) (8409, 1160)


In [43]:
# Dataset over the combined matrices built in the previous cell.
# NOTE(review): X_train / y_train and lgb_train are rebound by later cells in
# this file before any visible training call - confirm this cell is still needed.
lgb_train = lgb.Dataset(X_train, y_train)

In [51]:
# Hold out the first rows of the combined table for scoring; everything after
# them (including the pseudo-labelled rows) is used for training.
# NOTE(review): these are the raw columns, without the polynomial expansion
# used for the matrices built earlier - confirm this is intended.
holdout_rows = 500
feature_frame = train_test.drop(['power_generation', 'ID'], axis=1)
target_series = train_test['power_generation']
X_train = feature_frame.iloc[holdout_rows:]
y_train = target_series.iloc[holdout_rows:]
X_test = feature_frame.iloc[:holdout_rows]
y_test = target_series.iloc[:holdout_rows]

In [52]:
# Wrap the new training and hold-out splits as LightGBM datasets.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test)

In [56]:
# Run the Bayesian optimisation again, now on the data that includes pseudo-labels.
lgbBO.maximize(init_points=init_points, n_iter=num_iter)

[31mInitialization[0m
[94m-------------------------------------------------------------------------[0m
 Step |   Time |      Value |   min_data |   num_leaves |   sub_feature | 
    1 | 00m32s | [35m   0.91767[0m | [32m   22.4883[0m | [32m     32.6694[0m | [32m       0.9537[0m | 
    2 | 01m50s | [35m   0.92801[0m | [32m   51.2051[0m | [32m    140.0247[0m | [32m       0.6050[0m | 
    3 | 01m37s |    0.92641 |    29.9939 |     121.1456 |        0.4186 | 
    4 | 00m47s |    0.91925 |    32.1580 |      55.8158 |        0.8057 | 
    5 | 02m27s |    0.92722 |    26.6798 |     184.5485 |        0.6315 | 




[31mBayesian Optimization[0m
[94m-------------------------------------------------------------------------[0m
 Step |   Time |      Value |   min_data |   num_leaves |   sub_feature | 
    6 | 02m08s |    0.92176 |    41.5757 |     149.1294 |        0.8012 | 
    7 | 02m06s |    0.91828 |    33.2037 |     123.9250 |        0.9353 | 
    8 | 02m00s |    0.92608 |    53.2652 |     148.0737 |        0.6753 | 
    9 | 02m13s |    0.92061 |    63.3857 |     168.4527 |        0.8093 | 
   10 | 01m36s |    0.92763 |    24.0032 |     108.8791 |        0.5651 | 
   11 | 02m03s |    0.92042 |    78.5185 |     192.5458 |        0.3129 | 
   12 | 02m17s |    0.91937 |    41.4448 |     160.1658 |        0.8553 | 


  " state: %s" % convergence_dict)


   13 | 00m40s |    0.91936 |    64.9610 |      33.7640 |        0.7616 | 
   14 | 01m13s |    0.91694 |    49.5235 |      74.0874 |        0.8464 | 
   15 | 01m24s |    0.92380 |    44.6785 |      87.2022 |        0.3296 | 
   16 | 01m44s |    0.92772 |    74.2755 |     118.7482 |        0.5690 | 


  " state: %s" % convergence_dict)


   17 | 00m31s |    0.92146 |    68.2733 |      23.8131 |        0.4723 | 
   18 | 00m45s |    0.92139 |    77.2690 |      40.7473 |        0.6320 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   19 | 02m24s |    0.92430 |    59.1482 |     184.3563 |        0.3735 | 
   20 | 01m34s |    0.92549 |    51.7945 |     102.6817 |        0.6969 | 


  " state: %s" % convergence_dict)


   21 | 01m17s |    0.91894 |    14.5095 |      80.2115 |        0.8977 | 
   22 | 01m38s |    0.92436 |    77.9846 |     112.8531 |        0.6600 | 
   23 | 01m31s | [35m   0.92849[0m | [32m   43.6916[0m | [32m     99.9462[0m | [32m       0.5663[0m | 
   24 | 01m33s |    0.92741 |    43.5839 |     100.0039 |        0.6776 | 
   25 | 01m31s |    0.92446 |    43.4650 |      99.9674 |        0.7687 | 
   26 | 01m15s |    0.91556 |    63.1432 |      83.4976 |        0.9792 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   27 | 01m21s |    0.91712 |    52.9904 |      90.1306 |        0.8981 | 


  " state: %s" % convergence_dict)


   28 | 01m29s |    0.92723 |    43.7271 |     101.0440 |        0.6753 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   29 | 01m28s |    0.91785 |    44.0734 |     100.1003 |        0.9429 | 
   30 | 01m24s |    0.92092 |    34.4857 |      96.0953 |        0.9412 | 


In [None]:
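# Best parameters found with the pseudo-labels added (row copied from the log above):
# Step | Time   |      Value |   min_data |   num_leaves |   sub_feature |
#   23 | 01m31s |    0.92849 |    43.6916 |      99.9462 |        0.5663 |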

In [59]:
# The final fit uses every row of the combined table.
# Rows from position 9000 onward are the test rows (the training table has
# 9000 rows, per the shape printed earlier).
# NOTE(review): subdata is not read by any later active cell shown here;
# prediction uses sub_data instead.
X_train, y_train=train_test.drop(['power_generation','ID'],axis=1),train_test['power_generation']
subdata=train_test[9000:].drop(['power_generation','ID'],axis=1)# test data

In [63]:
# Feature columns of the rows from position 9000 onward, used for the final prediction.
sub_data=train_test[9000:].drop(['power_generation','ID'],axis=1)

In [60]:
# Dataset for the final fit on the full combined table.
lgb_train = lgb.Dataset(X_train, y_train)
#subdata = lgb.Dataset(subdata)

In [61]:
# Base settings first, then the values taken from the best step of the second
# search (min_data truncated, num_leaves rounded, as the objective does).
params = {
    'eta': 0.1,
    'learning_rate': 0.002,
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'min_hessian': 1,
    'verbose': -1,
}
params.update({
    'num_leaves': 100,
    'min_data': 43,
    'sub_feature': 0.5663,
})
gbm = lgb.train(params, train_set=lgb_train, num_boost_round=4000)

In [64]:
# Predict the test rows with the model trained on the combined table.
pred3 = gbm.predict(sub_data)

In [79]:

# Load an earlier ensemble submission (read without a header row) for comparison.
ensem=pd.read_csv('submit/result_20180810c_ensemble.csv',header=None)

In [81]:
# Name the two columns of the header-less submission file.
ensem.columns=['id','power']

In [83]:
# Display the ensemble predictions as an array.
ensem['power'].values

array([ 0.37999305,  1.30497943,  2.14266863, ...,  9.97958111,
        9.85703451,  9.16006358])

In [84]:
# Check the gap against the best online (leaderboard) submission: RMSE between
# the ensemble predictions and the new predictions, plus the 1 / (1 + RMSE) score.
# NumPy is used directly because the scipy.sqrt / scipy.mean aliases are
# deprecated and absent from recent SciPy releases.
rmsetmp1 = np.sqrt(np.mean((ensem['power'].values - pred3) ** 2))
score1 = 1 / (1 + rmsetmp1)

In [86]:
# Display the RMSE between the two sets of predictions.
rmsetmp1

0.037244933763835655

In [87]:
# Load the header-less submission template and name its two columns.
submit_data=pd.read_csv('submit_example.csv',header=None)
submit_data.columns=['ID','power_generation']

In [88]:
# Fill the template with the new predictions and write it out without index or
# header row.
# NOTE(review): assumes pred3 is in the same row order as the template's IDs - confirm.
submit_data['power_generation']=pred3
submit_data.to_csv('submit/result_0817_lgbm_pesudo_1.csv',index=False, header=None)