In [1]:
import math
import numpy as np
import pandas as pd
import sklearn.linear_model as skl
import lightgbm as lgb
import xgboost as xgb
%matplotlib inline


from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report 
from sklearn.metrics import mean_squared_log_error
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, Imputer
from scipy.stats import norm
from datetime import datetime, date

In C:\Users\WANGJIANMING\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The text.latex.unicode rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2.
In C:\Users\WANGJIANMING\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
In C:\Users\WANGJIANMING\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The pgf.debug rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2.
In C:\Users\WANGJIANMING\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The verbose.level rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
In C:\Users\WANGJIANMING\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The verbose.fileo rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.


## Initiate Training Set and Test Set: Basic Data Preprocessing

In [2]:
#set time stamp as the index and sort it by the order 

#1. training set
df = pd.read_csv("train.csv")
df.isnull().any()
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by = 'date')
df = df.set_index('date')

#2. test set
ts = pd.read_csv("test.csv")
ts.isnull().any()
ts['date'] = pd.to_datetime(ts['date'])
ts = ts.sort_values(by = 'date')
ts = ts.set_index('date')
ts_temp = ts

#



## Feature Engineering 

In [3]:
#Hong Kong holiday information
holiday_2017 = [(1,2),(1,28),(1,30),(1,31),(4,4),(4,14),(4,15),(4,17),(5,1),(5,3),(5,30),(7,1),(10,2),(10,5),(10,28),(12,25),(12,26)]
holiday_2018 = [(1,1),(2,16),(2,17),(2,19),(3,30),(3,31),(4,2),(4,5),(5,1),(5,22),(6,18),(7,2),(9,25),(10,1),(10,17),(12,25),(12,26)]

In [4]:
# holiday variable: judge whether this day is holiday or not by checking the holiday array 
def f_holiday(x):
    if int(x.split("-")[0]) == 2017:
        temp = (int(x.split("-")[1]),int(x.split("-")[2]))
        return 1 if temp in holiday_2017 else 0
    if int(x.split("-")[0]) == 2018:
        temp = (int(x.split("-")[1]),int(x.split("-")[2]))
        return 1 if temp in holiday_2018 else 0
    

# weekday variable: check whether this day is sunday or not
def f_weekday(x):
    return datetime.strptime(x, "%Y-%m-%d %H:%M:%S").weekday()


# workday variable: form the dummy variable for working days and sunday
def f_workday(x):
    return 1 if x < 6 else 0
def f_sunday(x):
    return 1 if x >= 6 else 0


# work-night hour variable
def f_work_hour(x):
    return 1 if x>=7 and x<=19 else 0
def f_night_hour(x):
    return 1 if x<7 or x>19 else 0


# month period variable
def f_month_period0_divide(x):
    return 1 if x<=10 else 0
def f_month_period1_divide(x):
    return 1 if x>10 and x<=20 else 0
def f_month_period2_divide(x):
    return 1 if x>20 else 0


# season divide variable
def f_season0_divide(x):
    return 1 if x>=1 and x<=3 else 0
def f_season1_divide(x):
    return 1 if x>=4 and x<=6 else 0
def f_season2_divide(x):
    return 1 if x>=7 and x<=9 else 0
def f_season3_divide(x):
    return 1 if x>=10 and x<=12 else 0


# ith week variable in its month: 
# find the day correponding to which week (Ath week) in this year 
# then find the 1th day of its month correponding to which week (Bth week) in this year 
# then the A-B+1 th week is the answer 
def f_week_of_month(year, month, day):
    begin = int(date(int(year), int(month), 1).strftime("%W"))
    end = int(date(int(year), int(month), int(day)).strftime("%W"))
    return end - begin + 1


In [5]:
# preprocessing the time stamp variables into corresponding variables 
ts.index = ts.index.map(str)
ts['weekday'] = ts.index.map(lambda x: f_weekday(str(x)))
ts_workday_data = ts
ts_workday_data.index = ts_workday_data.index.map(str)
ts_workday_data['days'] = ts_workday_data.index.map(lambda x:x.split(" ")[0])
ts_workday_data['hours'] = ts_workday_data.index.map(lambda x:x.split(" ")[1])
ts_workday_data['year'] = ts_workday_data['days'].map(lambda x:x.split("-")[0])
ts_workday_data['month'] = ts_workday_data['days'].map(lambda x:x.split("-")[1])
ts_workday_data['day'] = ts_workday_data['days'].map(lambda x:x.split("-")[2])
ts_workday_data['hour'] = ts_workday_data['hours'].map(lambda x:x.split(":")[0])
ts_workday_data = ts_workday_data.astype({"hour": int, "day": int, "month":int, "year":int})


# workday variable
ts_workday_data['workday'] = ts_workday_data['weekday'].map(lambda x: f_workday(x))
ts_workday_data['sunday'] = ts_workday_data['weekday'].map(lambda x: f_sunday(x))
ts_workday_data['holiday'] = ts_workday_data['days'].map(lambda x: f_holiday(x))
    

# work-night hour dummy variable    
ts_workday_data['work_hour'] = ts_workday_data['hour'].map(lambda x: f_work_hour(x))
ts_workday_data['night_hour'] = ts_workday_data['hour'].map(lambda x: f_night_hour(x))


# month period dummy variable: 
ts_workday_data['month_period0_divide'] = ts_workday_data['day'].map(lambda x: f_month_period0_divide(x))
ts_workday_data['month_period1_divide'] = ts_workday_data['day'].map(lambda x: f_month_period1_divide(x))
ts_workday_data['month_period2_divide'] = ts_workday_data['day'].map(lambda x: f_month_period2_divide(x))


# season dummy variable
ts_workday_data['season0_divide'] = ts_workday_data['month'].map(lambda x: f_season0_divide(x))
ts_workday_data['season1_divide'] = ts_workday_data['month'].map(lambda x: f_season1_divide(x))
ts_workday_data['season2_divide'] = ts_workday_data['month'].map(lambda x: f_season2_divide(x))
ts_workday_data['season3_divide'] = ts_workday_data['month'].map(lambda x: f_season3_divide(x))


# ith week variable in its month 
ts_workday_data['week_of_month'] = ts_workday_data['days'].map(lambda x: f_week_of_month(x.split("-")[0], x.split("-")[1], x.split("-")[2]))


# dropping the some un-related variables
ts_workday_data = ts_workday_data.drop('hours', 1)
ts_workday_data = ts_workday_data.drop('days', 1)
ts_workday_data = ts_workday_data.drop('id',1)
test_workday_temp = ts_workday_data
test_workday_temp.head()


Unnamed: 0_level_0,weekday,year,month,day,hour,workday,sunday,holiday,work_hour,night_hour,month_period0_divide,month_period1_divide,month_period2_divide,season0_divide,season1_divide,season2_divide,season3_divide,week_of_month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2018-01-01 02:00:00,0,2018,1,1,2,1,0,1,0,1,1,0,0,1,0,0,0,1
2018-01-01 05:00:00,0,2018,1,1,5,1,0,1,0,1,1,0,0,1,0,0,0,1
2018-01-01 07:00:00,0,2018,1,1,7,1,0,1,1,0,1,0,0,1,0,0,0,1
2018-01-01 08:00:00,0,2018,1,1,8,1,0,1,1,0,1,0,0,1,0,0,0,1
2018-01-01 10:00:00,0,2018,1,1,10,1,0,1,1,0,1,0,0,1,0,0,0,1


In [6]:
# preprocessing the time stamp variables into corresponding variables 
df.index = df.index.map(str)
df['weekday'] = df.index.map(lambda x: f_weekday(str(x)))
train_sunday_data = df
train_sunday_data.index = train_sunday_data.index.map(str)
train_sunday_data['days'] = train_sunday_data.index.map(lambda x:x.split(" ")[0])
train_sunday_data['hours'] = train_sunday_data.index.map(lambda x:x.split(" ")[1])
train_sunday_data['year'] = train_sunday_data['days'].map(lambda x:x.split("-")[0])
train_sunday_data['month'] = train_sunday_data['days'].map(lambda x:x.split("-")[1])
train_sunday_data['day'] = train_sunday_data['days'].map(lambda x:x.split("-")[2])
train_sunday_data['hour'] = train_sunday_data['hours'].map(lambda x:x.split(":")[0])
train_sunday_data = train_sunday_data.astype({"hour": int, "day": int, "month":int, "year":int})


# workday variable
train_sunday_data['workday'] = train_sunday_data['weekday'].map(lambda x: f_workday(x))
train_sunday_data['sunday'] = train_sunday_data['weekday'].map(lambda x: f_sunday(x))
train_sunday_data['holiday'] = train_sunday_data['days'].map(lambda x: f_holiday(x))

# work-night hour dummy variable
train_sunday_data['work_hour'] = train_sunday_data['hour'].map(lambda x: f_work_hour(x))
train_sunday_data['night_hour'] = train_sunday_data['hour'].map(lambda x: f_night_hour(x))


# month period dummy variable: 
train_sunday_data['month_period0_divide'] = train_sunday_data['day'].map(lambda x: f_month_period0_divide(x))
train_sunday_data['month_period1_divide'] = train_sunday_data['day'].map(lambda x: f_month_period1_divide(x))
train_sunday_data['month_period2_divide'] = train_sunday_data['day'].map(lambda x: f_month_period2_divide(x))


# season dummy variable
train_sunday_data['season0_divide'] = train_sunday_data['month'].map(lambda x: f_season0_divide(x))
train_sunday_data['season1_divide'] = train_sunday_data['month'].map(lambda x: f_season1_divide(x))
train_sunday_data['season2_divide'] = train_sunday_data['month'].map(lambda x: f_season2_divide(x))
train_sunday_data['season3_divide'] = train_sunday_data['month'].map(lambda x: f_season3_divide(x))

# ith week variable in its month 
train_sunday_data['week_of_month'] = train_sunday_data['days'].map(lambda x: f_week_of_month(x.split("-")[0], x.split("-")[1], x.split("-")[2]))


# dropping the some un-related variables 
train_sunday_data = train_sunday_data.drop('hours', 1)
train_sunday_data = train_sunday_data.drop('days', 1)
train_sunday_data = train_sunday_data.drop('id',1)


train_sunday_temp = train_sunday_data
train_sunday_temp.head()


Unnamed: 0_level_0,speed,weekday,year,month,day,hour,workday,sunday,holiday,work_hour,night_hour,month_period0_divide,month_period1_divide,month_period2_divide,season0_divide,season1_divide,season2_divide,season3_divide,week_of_month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017-01-01 00:00:00,43.00293,6,2017,1,1,0,0,1,0,0,1,1,0,0,1,0,0,0,1
2017-01-01 01:00:00,46.118696,6,2017,1,1,1,0,1,0,0,1,1,0,0,1,0,0,0,1
2017-01-01 02:00:00,44.294158,6,2017,1,1,2,0,1,0,0,1,1,0,0,1,0,0,0,1
2017-01-01 03:00:00,41.067468,6,2017,1,1,3,0,1,0,0,1,1,0,0,1,0,0,0,1
2017-01-01 04:00:00,46.448653,6,2017,1,1,4,0,1,0,0,1,1,0,0,1,0,0,0,1


## Dividing Training Dataset 

In [7]:
y = train_sunday_temp["speed"]
train_sunday_temp.drop(["speed"],axis=1,inplace=True)
X_train, X_test, y_train, y_test = train_test_split(train_sunday_temp, y, test_size=0.2, random_state=42)

## XGboost--Directly Use RandomizedSearch

In [9]:
# import xgboost as xgb
from xgboost.sklearn import XGBRegressor
xgb1 = XGBRegressor()
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              #'objective':['reg:linear'],
              'learning_rate': [0.001,.03, 0.05,.01,.1], #so called `eta` value
              'max_depth': [5,10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
              'min_child_weight': [1,2,3,4,5,6,7,8,9,10],
              #'silent': [1],
              'subsample': [0.6, 0.7, 0.8, 0.9],
              'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
              'n_estimators': [400,600,800,1000,2000,3000],
              'gamma':[0.1,0.3,0.5,0.6],
              'reg_alpha': [0.05, 0.1, 1, 2], 
              'reg_lambda': [0.05, 0.1, 1, 2]}

clf = RandomizedSearchCV(xgb1,
                        parameters,
                        cv = 10,
                        n_jobs = -1,
                        verbose=True,scoring = "neg_mean_squared_error")
clf.fit(X_train,y_train)
print(clf.best_estimator_)
print(clf.best_score_)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 14.4min finished


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.9, gamma=0.6, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.05, max_delta_step=0, max_depth=100,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=600, n_jobs=4, nthread=4, num_parallel_tree=1,
       objective='reg:squarederror', random_state=0, reg_alpha=1,
       reg_lambda=2, scale_pos_weight=1, subsample=0.6,
       tree_method='exact', validate_parameters=1, verbosity=None)
-14.350788604335628


## XGboost--Directly Use RandomizedSearch--Prediction and Output

In [12]:
# using best parameters got by the above RandomizedSearch to fit the model 
model = xgb.XGBRegressor(learning_rate=0.03, n_estimators=300, max_depth=80, min_child_weight=1, seed=0,
                             subsample=0.7, colsample_bytree=0.8, gamma=0.1, reg_alpha=0.05, reg_lambda=2)
model.fit(X_train, y_train)
y_pred = model.predict(test_workday_temp)


# (id, speed prediction) is the formate we need to save into result.csv file
test_workday_temp['speed'] = y_pred
day_pred = test_workday_temp['speed']
df_test = ts_temp.join(day_pred)
df_test = df_test.set_index('id')


df_test.to_csv("test.csv", columns=["speed"], index=True)

## XGboost--Step by Step to Use GridSearch

In [19]:
## 1. find the best value：n_estimators
#  cv_params = {'n_estimators': [550, 575, 600, 650, 675]}
#     other_params = {'learning_rate': 0.1, 'n_estimators': 600, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
#                     'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}

# initiate parameters 
cv_params = {'n_estimators': [2200,2250,2300,2350,2400,2450]}
other_params = {'learning_rate': 0.1, 'n_estimators': 2300, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
               'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}

# use the GridSearch find the best value
model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='neg_mean_squared_error', cv=10, verbose=1, n_jobs=-1)
optimized_GBM.fit(X_train, y_train)
evalute_result = optimized_GBM.grid_scores_

# print the result 
print('final running result:{0}'.format(evalute_result))
print('best value：{0}'.format(optimized_GBM.best_params_))
print('best model score:{0}'.format(optimized_GBM.best_score_))

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  6.5min finished


每轮迭代运行结果:[mean: -16.22691, std: 1.24356, params: {'n_estimators': 2200}, mean: -16.23628, std: 1.23540, params: {'n_estimators': 2250}, mean: -16.21842, std: 1.21813, params: {'n_estimators': 2300}, mean: -16.22455, std: 1.22429, params: {'n_estimators': 2350}, mean: -16.22764, std: 1.23199, params: {'n_estimators': 2400}, mean: -16.22455, std: 1.23290, params: {'n_estimators': 2450}]
参数的最佳取值：{'n_estimators': 2300}
最佳模型得分:-16.21842076036648




In [9]:
## 2.find the best values: min_child_weight and max_depth
# initiate parameters
cv_params = {'max_depth': [15,16,17,18,19], 'min_child_weight': [1, 2, 3]}
other_params = {'learning_rate': 0.1, 'n_estimators': 2300, 'max_depth': 17, 'min_child_weight': 1, 'seed': 0,
                'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}

# use the GridSearch find the best value
model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='neg_mean_squared_error', cv=10, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, y_train)
evalute_result = optimized_GBM.grid_scores_

# print the result 
print('final running result:{0}'.format(evalute_result))
print('best value：{0}'.format(optimized_GBM.best_params_))
print('best model score:{0}'.format(optimized_GBM.best_score_))

Fitting 10 folds for each of 15 candidates, totalling 150 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  8.1min
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed: 30.3min finished


每轮迭代运行结果:[mean: -15.27619, std: 1.37786, params: {'max_depth': 15, 'min_child_weight': 1}, mean: -15.48134, std: 1.49105, params: {'max_depth': 15, 'min_child_weight': 2}, mean: -15.75628, std: 1.41771, params: {'max_depth': 15, 'min_child_weight': 3}, mean: -15.30299, std: 1.42983, params: {'max_depth': 16, 'min_child_weight': 1}, mean: -15.50522, std: 1.52878, params: {'max_depth': 16, 'min_child_weight': 2}, mean: -15.63908, std: 1.38597, params: {'max_depth': 16, 'min_child_weight': 3}, mean: -15.17177, std: 1.35004, params: {'max_depth': 17, 'min_child_weight': 1}, mean: -15.53803, std: 1.49080, params: {'max_depth': 17, 'min_child_weight': 2}, mean: -15.68627, std: 1.44307, params: {'max_depth': 17, 'min_child_weight': 3}, mean: -15.23814, std: 1.36516, params: {'max_depth': 18, 'min_child_weight': 1}, mean: -15.52335, std: 1.35798, params: {'max_depth': 18, 'min_child_weight': 2}, mean: -15.70196, std: 1.36968, params: {'max_depth': 18, 'min_child_weight': 3}, mean: -15.30348, s



In [12]:
## 3. finding the best value：gamma
cv_params = {'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]}
other_params = {'learning_rate': 0.1, 'n_estimators': 2300, 'max_depth': 17, 'min_child_weight': 1, 'seed': 0,
                'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}

# use the GridSearch find the best value
model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='neg_mean_squared_error', cv=10, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, y_train)
evalute_result = optimized_GBM.grid_scores_

# print the result 
print('final running result:{0}'.format(evalute_result))
print('best value：{0}'.format(optimized_GBM.best_params_))
print('best model score:{0}'.format(optimized_GBM.best_score_))


Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 22.5min
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed: 30.1min finished


每轮迭代运行结果:[mean: -15.18117, std: 1.39399, params: {'gamma': 0.1}, mean: -15.16770, std: 1.31661, params: {'gamma': 0.2}, mean: -15.20673, std: 1.41309, params: {'gamma': 0.3}, mean: -15.22961, std: 1.36560, params: {'gamma': 0.4}, mean: -15.25166, std: 1.40444, params: {'gamma': 0.5}, mean: -15.28326, std: 1.37211, params: {'gamma': 0.6}]
参数的最佳取值：{'gamma': 0.2}
最佳模型得分:-15.167704314669633




In [13]:
## 4、finding the best values: subsample and colsample_bytree
cv_params = {'subsample': [0.6, 0.7, 0.8, 0.9], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9]}
other_params = {'learning_rate': 0.1, 'n_estimators': 2300, 'max_depth': 17, 'min_child_weight': 1, 'seed': 0,
                'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0.2, 'reg_alpha': 0, 'reg_lambda': 1}

# use the GridSearch find the best value
model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='neg_mean_squared_error', cv=10, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, y_train)
evalute_result = optimized_GBM.grid_scores_

# print the result 
print('final running result:{0}'.format(evalute_result))
print('best value：{0}'.format(optimized_GBM.best_params_))
print('best model score:{0}'.format(optimized_GBM.best_score_))


Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 13.5min
[Parallel(n_jobs=4)]: Done 160 out of 160 | elapsed: 66.1min finished


每轮迭代运行结果:[mean: -16.11430, std: 1.38107, params: {'colsample_bytree': 0.6, 'subsample': 0.6}, mean: -15.98434, std: 1.44508, params: {'colsample_bytree': 0.6, 'subsample': 0.7}, mean: -16.10569, std: 1.37369, params: {'colsample_bytree': 0.6, 'subsample': 0.8}, mean: -16.11512, std: 1.28199, params: {'colsample_bytree': 0.6, 'subsample': 0.9}, mean: -15.54815, std: 1.48717, params: {'colsample_bytree': 0.7, 'subsample': 0.6}, mean: -15.63538, std: 1.51108, params: {'colsample_bytree': 0.7, 'subsample': 0.7}, mean: -15.40615, std: 1.26059, params: {'colsample_bytree': 0.7, 'subsample': 0.8}, mean: -15.71203, std: 1.41871, params: {'colsample_bytree': 0.7, 'subsample': 0.9}, mean: -15.34967, std: 1.47089, params: {'colsample_bytree': 0.8, 'subsample': 0.6}, mean: -15.39851, std: 1.56859, params: {'colsample_bytree': 0.8, 'subsample': 0.7}, mean: -15.16770, std: 1.31661, params: {'colsample_bytree': 0.8, 'subsample': 0.8}, mean: -15.50576, std: 1.58135, params: {'colsample_bytree': 0.8, '



In [16]:
## 5、finding the best value：reg_alpha and reg_lambda：
cv_params = {'reg_alpha': [0.03, 0.04, 0.05, 0.06, 0.07], 'reg_lambda': [ 1, 2, 3, 4, 5]}
other_params = {'learning_rate': 0.1, 'n_estimators': 2300, 'max_depth': 17, 'min_child_weight': 1, 'seed': 0,
                'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0.2, 'reg_alpha': 0.05, 'reg_lambda': 3}

# use the GridSearch find the best value
model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='neg_mean_squared_error', cv=10, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, y_train)
evalute_result = optimized_GBM.grid_scores_

# print the result 
print('final running result:{0}'.format(evalute_result))
print('best value：{0}'.format(optimized_GBM.best_params_))
print('best model score:{0}'.format(optimized_GBM.best_score_))


Fitting 10 folds for each of 25 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 18.9min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed: 79.9min
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed: 103.2min finished


每轮迭代运行结果:[mean: -15.29405, std: 1.33961, params: {'reg_alpha': 0.03, 'reg_lambda': 1}, mean: -14.57189, std: 1.50178, params: {'reg_alpha': 0.03, 'reg_lambda': 2}, mean: -14.37725, std: 1.39864, params: {'reg_alpha': 0.03, 'reg_lambda': 3}, mean: -14.45982, std: 1.28773, params: {'reg_alpha': 0.03, 'reg_lambda': 4}, mean: -14.44591, std: 1.38203, params: {'reg_alpha': 0.03, 'reg_lambda': 5}, mean: -15.28520, std: 1.39825, params: {'reg_alpha': 0.04, 'reg_lambda': 1}, mean: -14.63542, std: 1.45204, params: {'reg_alpha': 0.04, 'reg_lambda': 2}, mean: -14.40583, std: 1.42825, params: {'reg_alpha': 0.04, 'reg_lambda': 3}, mean: -14.52798, std: 1.26264, params: {'reg_alpha': 0.04, 'reg_lambda': 4}, mean: -14.43247, std: 1.37781, params: {'reg_alpha': 0.04, 'reg_lambda': 5}, mean: -15.24558, std: 1.33713, params: {'reg_alpha': 0.05, 'reg_lambda': 1}, mean: -14.64144, std: 1.40620, params: {'reg_alpha': 0.05, 'reg_lambda': 2}, mean: -14.29069, std: 1.34953, params: {'reg_alpha': 0.05, 'reg_la



In [17]:
## 6、finding the best value: learning_rate (choose some small values)
cv_params = {'learning_rate': [0.01,0.02, 0.03,0.04]}
other_params = {'learning_rate': 0.1, 'n_estimators': 2300, 'max_depth': 17, 'min_child_weight': 1, 'seed': 0,
                'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0.2, 'reg_alpha': 0.05, 'reg_lambda':3}

# use the GridSearch find the best value
model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='neg_mean_squared_error', cv=10, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, y_train)
evalute_result = optimized_GBM.grid_scores_

# print the result 
print('final running result:{0}'.format(evalute_result))
print('best value：{0}'.format(optimized_GBM.best_params_))
print('best model score:{0}'.format(optimized_GBM.best_score_))


Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 18.9min
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed: 28.9min finished


每轮迭代运行结果:[mean: -14.11122, std: 1.32832, params: {'learning_rate': 0.01}, mean: -14.31344, std: 1.37052, params: {'learning_rate': 0.03}, mean: -14.23997, std: 1.29115, params: {'learning_rate': 0.05}, mean: -14.34551, std: 1.35486, params: {'learning_rate': 0.07}, mean: -14.29069, std: 1.34953, params: {'learning_rate': 0.1}, mean: -14.77803, std: 1.42986, params: {'learning_rate': 0.2}]
参数的最佳取值：{'learning_rate': 0.01}
最佳模型得分:-14.111222797089477




## XGboost--Step by Step to Use GridSearch--Prediction and Output

In [8]:
# using best parameters got by the above Step by Step GridSearch to fit the model 
model = xgb.XGBRegressor(learning_rate=0.06, n_estimators=280, max_depth=20, min_child_weight=1, seed=0,
                             subsample=0.8, colsample_bytree=0.9, gamma=0.3, reg_alpha=5, reg_lambda=2)
model.fit(X_train, y_train)
y_pred = model.predict(test_workday_temp)


# (id, speed prediction) is the formate we need to save into result.csv file
test_workday_temp['speed'] = y_pred
day_pred = test_workday_temp['speed']
df_test = ts_temp.join(day_pred)
df_test = df_test.set_index('id')


df_test.to_csv("test.csv", columns=["speed"], index=True)

## LightGBM--Step by Step to Use GridSearch

In [8]:
# 1. finding the best value: n_estimator 
params = {
    'boosting_type': 'gbdt', 
    'objective': 'regression', 
    'learning_rate': 0.1, 
    'num_leaves': 50, 
    'max_depth': 6,
    'subsample': 0.8, 
    'colsample_bytree': 0.8, 
}

data_train = lgb.Dataset(X_train, y_train, silent=True)
cv_results = lgb.cv(
    params, data_train, num_boost_round=1000, nfold=5, stratified=False, shuffle=True, metrics='rmse',
    early_stopping_rounds=50, verbose_eval=50, show_stdv=True, seed=0)

print('best n_estimators:', len(cv_results['rmse-mean']))
print('best cv score:', cv_results['rmse-mean'][-1])

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 8960, number of used features: 18
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 8960, number of used features: 18
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 8960, number of used features: 18
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 8960, number of used features: 18
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

In [9]:
params_test1={
    'max_depth': range(14,30,2),
    'num_leaves':range(50, 230, 30)
}

model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=50,
                              learning_rate=0.1, n_estimators=991, max_depth=17,
                              metric='neg_mean_squared_error', bagging_fraction = 0.8,feature_fraction = 0.8)


gsearch1 = GridSearchCV(estimator=model_lgb, param_grid=params_test1, scoring='neg_mean_squared_error', cv=10, verbose=1, n_jobs=4)
gsearch1.fit(X_train, y_train)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

#{'max_depth': 11, 'num_leaves': 50},-14.61332354663911)

Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  2.4min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed: 13.0min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed: 30.8min
[Parallel(n_jobs=4)]: Done 480 out of 480 | elapsed: 34.0min finished


([mean: -14.53073, std: 1.19704, params: {'max_depth': 14, 'num_leaves': 50},
  mean: -14.74203, std: 1.15113, params: {'max_depth': 14, 'num_leaves': 80},
  mean: -15.06348, std: 1.24828, params: {'max_depth': 14, 'num_leaves': 110},
  mean: -15.27329, std: 1.18577, params: {'max_depth': 14, 'num_leaves': 140},
  mean: -15.66745, std: 1.27811, params: {'max_depth': 14, 'num_leaves': 170},
  mean: -15.89216, std: 1.29410, params: {'max_depth': 14, 'num_leaves': 200},
  mean: -14.53524, std: 1.22472, params: {'max_depth': 16, 'num_leaves': 50},
  mean: -14.65723, std: 1.20361, params: {'max_depth': 16, 'num_leaves': 80},
  mean: -14.96363, std: 1.24170, params: {'max_depth': 16, 'num_leaves': 110},
  mean: -15.24224, std: 1.19603, params: {'max_depth': 16, 'num_leaves': 140},
  mean: -15.63958, std: 1.21606, params: {'max_depth': 16, 'num_leaves': 170},
  mean: -15.87957, std: 1.26570, params: {'max_depth': 16, 'num_leaves': 200},
  mean: -14.59488, std: 1.27731, params: {'max_depth': 1

In [10]:
params_test2={
    'max_depth': [9,11,13,15,17,19,21,23],
    'num_leaves':[30,50,70,110,150]
}

model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=50,
                              learning_rate=0.1, n_estimators=991, max_depth=14, 
                              metric='neg_mean_squared_error', bagging_fraction = 0.8, feature_fraction = 0.8)


gsearch2 = GridSearchCV(estimator=model_lgb, param_grid=params_test2, scoring='neg_mean_squared_error', cv=10, verbose=1, n_jobs=4)
gsearch2.fit(X_train, y_train)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_


Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  8.4min
[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed: 18.4min finished


([mean: -15.15084, std: 1.26589, params: {'max_depth': 9, 'num_leaves': 30},
  mean: -14.81834, std: 1.19774, params: {'max_depth': 9, 'num_leaves': 50},
  mean: -14.87614, std: 1.18962, params: {'max_depth': 9, 'num_leaves': 70},
  mean: -15.18700, std: 1.22639, params: {'max_depth': 9, 'num_leaves': 110},
  mean: -15.32749, std: 1.20843, params: {'max_depth': 9, 'num_leaves': 150},
  mean: -14.92469, std: 1.39371, params: {'max_depth': 11, 'num_leaves': 30},
  mean: -14.63806, std: 1.17701, params: {'max_depth': 11, 'num_leaves': 50},
  mean: -14.72325, std: 1.23076, params: {'max_depth': 11, 'num_leaves': 70},
  mean: -15.11006, std: 1.32815, params: {'max_depth': 11, 'num_leaves': 110},
  mean: -15.47834, std: 1.26172, params: {'max_depth': 11, 'num_leaves': 150},
  mean: -14.94421, std: 1.29532, params: {'max_depth': 13, 'num_leaves': 30},
  mean: -14.64094, std: 1.18382, params: {'max_depth': 13, 'num_leaves': 50},
  mean: -14.66776, std: 1.26604, params: {'max_depth': 13, 'num_l

In [13]:
params_test3={
    'min_child_samples': [18, 19, 20, 21, 22],
    'min_child_weight':[0.001, 0.002]
}


model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=50,
                              learning_rate=0.1, n_estimators=991, max_depth=17, 
                              metric='neg_mean_squared_error', bagging_fraction = 0.8, feature_fraction = 0.8)


gsearch3 = GridSearchCV(estimator=model_lgb, param_grid=params_test3, scoring='neg_mean_squared_error', cv=10, verbose=1, n_jobs=4)
gsearch3.fit(X_train, y_train)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_
#{'min_child_samples': 20, 'min_child_weight': 0.001}, -15.94139413891246)
# {'min_child_samples': 18, 'min_child_weight': 0.001},-14.996485526598791)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  3.2min finished


([mean: -14.40757, std: 1.21543, params: {'min_child_samples': 18, 'min_child_weight': 0.001},
  mean: -14.40757, std: 1.21543, params: {'min_child_samples': 18, 'min_child_weight': 0.002},
  mean: -14.51272, std: 1.23171, params: {'min_child_samples': 19, 'min_child_weight': 0.001},
  mean: -14.51272, std: 1.23171, params: {'min_child_samples': 19, 'min_child_weight': 0.002},
  mean: -14.50948, std: 1.24941, params: {'min_child_samples': 20, 'min_child_weight': 0.001},
  mean: -14.50948, std: 1.24941, params: {'min_child_samples': 20, 'min_child_weight': 0.002},
  mean: -14.62122, std: 1.33035, params: {'min_child_samples': 21, 'min_child_weight': 0.001},
  mean: -14.62122, std: 1.33035, params: {'min_child_samples': 21, 'min_child_weight': 0.002},
  mean: -14.62197, std: 1.24161, params: {'min_child_samples': 22, 'min_child_weight': 0.001},
  mean: -14.62197, std: 1.24161, params: {'min_child_samples': 22, 'min_child_weight': 0.002}],
 {'min_child_samples': 18, 'min_child_weight': 0.

In [12]:
params_test4={
    'feature_fraction': [0.5, 0.6, 0.7, 0.8, 0.9],
    'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0]
}


model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=50,
                              learning_rate=0.1, n_estimators=991, max_depth=17, 
                              metric='neg_mean_squared_error', bagging_freq = 5,  min_child_samples=18)


gsearch4 = GridSearchCV(estimator=model_lgb, param_grid=params_test4, scoring='neg_mean_squared_error', cv=10, verbose=1, n_jobs=4)
gsearch4.fit(X_train, y_train)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_
#{'bagging_fraction': 1.0, 'feature_fraction': 0.8}, -15.94139413891246)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  7.1min
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:  9.0min finished


([mean: -15.95353, std: 1.33075, params: {'bagging_fraction': 0.6, 'feature_fraction': 0.5},
  mean: -15.41354, std: 1.33284, params: {'bagging_fraction': 0.6, 'feature_fraction': 0.6},
  mean: -15.31771, std: 1.29155, params: {'bagging_fraction': 0.6, 'feature_fraction': 0.7},
  mean: -15.46244, std: 1.32509, params: {'bagging_fraction': 0.6, 'feature_fraction': 0.8},
  mean: -15.61412, std: 1.48854, params: {'bagging_fraction': 0.6, 'feature_fraction': 0.9},
  mean: -15.53017, std: 1.18082, params: {'bagging_fraction': 0.7, 'feature_fraction': 0.5},
  mean: -15.10075, std: 1.21634, params: {'bagging_fraction': 0.7, 'feature_fraction': 0.6},
  mean: -15.08403, std: 1.25208, params: {'bagging_fraction': 0.7, 'feature_fraction': 0.7},
  mean: -15.17861, std: 1.24685, params: {'bagging_fraction': 0.7, 'feature_fraction': 0.8},
  mean: -15.23642, std: 1.40932, params: {'bagging_fraction': 0.7, 'feature_fraction': 0.9},
  mean: -15.17903, std: 1.17913, params: {'bagging_fraction': 0.8, 'fe

In [14]:
params_test5={
    'feature_fraction': [ 0.78, 0.79, 0.80,0.81,0.82 ]
}


model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=50,
                              learning_rate=0.1, n_estimators=991, max_depth=17, bagging_fraction = 1.0, 
                              metric='neg_mean_squared_error', bagging_freq = 5,  min_child_samples=18)


gsearch5 = GridSearchCV(estimator=model_lgb, param_grid=params_test5, scoring='neg_mean_squared_error', cv=10, verbose=1, n_jobs=4)
gsearch5.fit(X_train, y_train)
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_
#{'feature_fraction': 0.78},-15.94139413891246)
# {'feature_fraction': 0.77},-15.94139413891246)

Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  1.5min finished


([mean: -14.40757, std: 1.21543, params: {'feature_fraction': 0.78},
  mean: -14.40757, std: 1.21543, params: {'feature_fraction': 0.79},
  mean: -14.40757, std: 1.21543, params: {'feature_fraction': 0.8},
  mean: -14.62531, std: 1.30434, params: {'feature_fraction': 0.81},
  mean: -14.62531, std: 1.30434, params: {'feature_fraction': 0.82}],
 {'feature_fraction': 0.78},
 -14.407574206002023)

In [15]:
#Step5: regularization
params_test6={
    'reg_alpha': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5],
    'reg_lambda': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5]
}


model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=50,
                              learning_rate=0.1, n_estimators=991, max_depth=17, bagging_fraction = 1.0, 
                              metric='neg_mean_squared_error', bagging_freq = 5,  min_child_samples=18, feature_fraction=0.78)


gsearch6 = GridSearchCV(estimator=model_lgb, param_grid=params_test6, scoring='neg_mean_squared_error', cv=10, verbose=1, n_jobs=4)
gsearch6.fit(X_train, y_train)
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_

Fitting 10 folds for each of 49 candidates, totalling 490 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  6.2min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed: 14.4min
[Parallel(n_jobs=4)]: Done 490 out of 490 | elapsed: 16.0min finished


([mean: -14.40757, std: 1.21543, params: {'reg_alpha': 0, 'reg_lambda': 0},
  mean: -14.47889, std: 1.26984, params: {'reg_alpha': 0, 'reg_lambda': 0.001},
  mean: -14.52121, std: 1.28101, params: {'reg_alpha': 0, 'reg_lambda': 0.01},
  mean: -14.52836, std: 1.28210, params: {'reg_alpha': 0, 'reg_lambda': 0.03},
  mean: -14.50836, std: 1.17809, params: {'reg_alpha': 0, 'reg_lambda': 0.08},
  mean: -14.55525, std: 1.25853, params: {'reg_alpha': 0, 'reg_lambda': 0.3},
  mean: -14.57372, std: 1.36302, params: {'reg_alpha': 0, 'reg_lambda': 0.5},
  mean: -14.49904, std: 1.29032, params: {'reg_alpha': 0.001, 'reg_lambda': 0},
  mean: -14.48399, std: 1.29077, params: {'reg_alpha': 0.001, 'reg_lambda': 0.001},
  mean: -14.55460, std: 1.22174, params: {'reg_alpha': 0.001, 'reg_lambda': 0.01},
  mean: -14.58423, std: 1.24691, params: {'reg_alpha': 0.001, 'reg_lambda': 0.03},
  mean: -14.49225, std: 1.23125, params: {'reg_alpha': 0.001, 'reg_lambda': 0.08},
  mean: -14.49639, std: 1.18859, param

In [16]:
params = {
    'boosting_type': 'gbdt', 
    'objective': 'regression', 

    'learning_rate': 0.003, 
    'num_leaves': 50, 
    'max_depth': 17,
    'n_estimators':991,
    'bagging_fraction':1.0,
    'min_child_sample':18,
    'feature_fraction':0.78 
    }

data_train = lgb.Dataset(X_train, y_train, silent=True)
cv_results = lgb.cv(
    params, data_train, num_boost_round=10000, nfold=5, stratified=False, shuffle=True, metrics='rmse',
    early_stopping_rounds=50, verbose_eval=100, show_stdv=True)

print('best n_estimators:', len(cv_results['rmse-mean']))
print('best cv score:', cv_results['rmse-mean'][-1])



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 8960, number of used features: 18
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 8960, number of used features: 18
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 8960, number of used features: 18
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 8960, number of used features: 18
You can set 

## LightGBM--Step by Step to Use GridSearch--Prediction and Output

In [8]:
hyper_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.005,
    'feature_fraction': 0.78,
    'bagging_fraction': 1.0,
    'bagging_freq': 5,
    "max_depth": 17,
    "num_leaves": 50,  
    "n_estimators": 991
}

gbm = lgb.LGBMRegressor(**hyper_params)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='neg_mean_squared_error', early_stopping_rounds=1000)
y_pred = gbm.predict(test_workday_temp, num_iteration=gbm.best_iteration_)


test_workday_temp['speed'] = y_pred
day_pred = test_workday_temp['speed']
df_test = ts_temp.join(day_pred)
df_test = df_test.set_index('id')

df_test.to_csv("test.csv", columns=["speed"], index=True)



[1]	valid_0's rmse: 13.5273
Training until validation scores don't improve for 1000 rounds
[2]	valid_0's rmse: 13.4764
[3]	valid_0's rmse: 13.4208
[4]	valid_0's rmse: 13.3659
[5]	valid_0's rmse: 13.311
[6]	valid_0's rmse: 13.2565
[7]	valid_0's rmse: 13.2019
[8]	valid_0's rmse: 13.1476
[9]	valid_0's rmse: 13.0941
[10]	valid_0's rmse: 13.045
[11]	valid_0's rmse: 12.9915
[12]	valid_0's rmse: 12.9383
[13]	valid_0's rmse: 12.8855
[14]	valid_0's rmse: 12.8335
[15]	valid_0's rmse: 12.7818
[16]	valid_0's rmse: 12.7299
[17]	valid_0's rmse: 12.6787
[18]	valid_0's rmse: 12.6315
[19]	valid_0's rmse: 12.5853
[20]	valid_0's rmse: 12.5351
[21]	valid_0's rmse: 12.4849
[22]	valid_0's rmse: 12.4354
[23]	valid_0's rmse: 12.3893
[24]	valid_0's rmse: 12.3401
[25]	valid_0's rmse: 12.2952
[26]	valid_0's rmse: 12.2467
[27]	valid_0's rmse: 12.1978
[28]	valid_0's rmse: 12.1495
[29]	valid_0's rmse: 12.1018
[30]	valid_0's rmse: 12.0541
[31]	valid_0's rmse: 12.0104
[32]	valid_0's rmse: 11.9632
[33]	valid_0's rmse:

[266]	valid_0's rmse: 6.27081
[267]	valid_0's rmse: 6.26104
[268]	valid_0's rmse: 6.2512
[269]	valid_0's rmse: 6.24164
[270]	valid_0's rmse: 6.23191
[271]	valid_0's rmse: 6.22136
[272]	valid_0's rmse: 6.21179
[273]	valid_0's rmse: 6.20202
[274]	valid_0's rmse: 6.19129
[275]	valid_0's rmse: 6.18183
[276]	valid_0's rmse: 6.17273
[277]	valid_0's rmse: 6.16229
[278]	valid_0's rmse: 6.15305
[279]	valid_0's rmse: 6.14374
[280]	valid_0's rmse: 6.13507
[281]	valid_0's rmse: 6.12529
[282]	valid_0's rmse: 6.11649
[283]	valid_0's rmse: 6.10784
[284]	valid_0's rmse: 6.0991
[285]	valid_0's rmse: 6.09047
[286]	valid_0's rmse: 6.08111
[287]	valid_0's rmse: 6.07251
[288]	valid_0's rmse: 6.06399
[289]	valid_0's rmse: 6.05573
[290]	valid_0's rmse: 6.04714
[291]	valid_0's rmse: 6.03894
[292]	valid_0's rmse: 6.03075
[293]	valid_0's rmse: 6.02257
[294]	valid_0's rmse: 6.01446
[295]	valid_0's rmse: 6.00648
[296]	valid_0's rmse: 5.99847
[297]	valid_0's rmse: 5.98965
[298]	valid_0's rmse: 5.98105
[299]	valid_

[557]	valid_0's rmse: 4.93808
[558]	valid_0's rmse: 4.93555
[559]	valid_0's rmse: 4.93425
[560]	valid_0's rmse: 4.93142
[561]	valid_0's rmse: 4.93005
[562]	valid_0's rmse: 4.9281
[563]	valid_0's rmse: 4.92687
[564]	valid_0's rmse: 4.92407
[565]	valid_0's rmse: 4.92284
[566]	valid_0's rmse: 4.92088
[567]	valid_0's rmse: 4.91976
[568]	valid_0's rmse: 4.91787
[569]	valid_0's rmse: 4.9147
[570]	valid_0's rmse: 4.91158
[571]	valid_0's rmse: 4.91031
[572]	valid_0's rmse: 4.909
[573]	valid_0's rmse: 4.90785
[574]	valid_0's rmse: 4.90614
[575]	valid_0's rmse: 4.90305
[576]	valid_0's rmse: 4.90182
[577]	valid_0's rmse: 4.89876
[578]	valid_0's rmse: 4.89639
[579]	valid_0's rmse: 4.8945
[580]	valid_0's rmse: 4.89289
[581]	valid_0's rmse: 4.89166
[582]	valid_0's rmse: 4.89046
[583]	valid_0's rmse: 4.8875
[584]	valid_0's rmse: 4.88623
[585]	valid_0's rmse: 4.88516
[586]	valid_0's rmse: 4.88283
[587]	valid_0's rmse: 4.88045
[588]	valid_0's rmse: 4.87901
[589]	valid_0's rmse: 4.87795
[590]	valid_0's 

[871]	valid_0's rmse: 4.56784
[872]	valid_0's rmse: 4.56648
[873]	valid_0's rmse: 4.56498
[874]	valid_0's rmse: 4.5643
[875]	valid_0's rmse: 4.56315
[876]	valid_0's rmse: 4.56238
[877]	valid_0's rmse: 4.56124
[878]	valid_0's rmse: 4.55962
[879]	valid_0's rmse: 4.5589
[880]	valid_0's rmse: 4.55776
[881]	valid_0's rmse: 4.55663
[882]	valid_0's rmse: 4.55554
[883]	valid_0's rmse: 4.55438
[884]	valid_0's rmse: 4.55267
[885]	valid_0's rmse: 4.55123
[886]	valid_0's rmse: 4.55063
[887]	valid_0's rmse: 4.54947
[888]	valid_0's rmse: 4.54868
[889]	valid_0's rmse: 4.5478
[890]	valid_0's rmse: 4.54682
[891]	valid_0's rmse: 4.54625
[892]	valid_0's rmse: 4.54533
[893]	valid_0's rmse: 4.54392
[894]	valid_0's rmse: 4.54265
[895]	valid_0's rmse: 4.54183
[896]	valid_0's rmse: 4.54075
[897]	valid_0's rmse: 4.54027
[898]	valid_0's rmse: 4.53888
[899]	valid_0's rmse: 4.53841
[900]	valid_0's rmse: 4.53704
[901]	valid_0's rmse: 4.53669
[902]	valid_0's rmse: 4.53639
[903]	valid_0's rmse: 4.53608
[904]	valid_0