In [105]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [106]:
data = pd.read_csv("feature_eng_cat.csv")

In [107]:
X = data.drop(['LEN',"Unnamed: 0"],axis=1)
y = data['LEN']

In [126]:
from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder()
one_hot.fit(X[[ 'MON','DAY', 'HR', 'WK','TAXI_ID_CAT','ORIGIN_STAND_CAT','ORIGIN_CALL_CAT']].values)

one_hot_output = one_hot.transform(X[['MON','DAY', 'HR', 'WK','TAXI_ID_CAT','ORIGIN_STAND_CAT','ORIGIN_CALL_CAT']].values)


columns=[f"MON_{i}" for i in range(1,13)] + [f"DAT_{i}" for i in range(1,32)] + [f"HR_{i}" for i in range(24)] + [f"WK_{i}" for i in range(7)] + [f"TAXI_{i}" for i in range(442)] + [f"STAND_{i}" for i in range(64)] + [f"CALL_{i}" for i in range(254)] 

In [127]:
one_hot_frame = pd.DataFrame(one_hot_output.toarray(),columns=columns)
df_processed = X.join(one_hot_frame)

In [128]:
df_processed.drop(['MON','DAY', 'HR', 'WK','TAXI_ID_CAT','ORIGIN_STAND_CAT','ORIGIN_CALL_CAT'],axis=1,inplace=True)

In [129]:
x_train,x_test,y_train,y_test = train_test_split(df_processed,y,test_size=0.2)

In [130]:
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

## Start training

In [131]:
params = {
    'booster': 'gbtree',
    'objective': 'reg:squarederror',  # 多分类的问题
    'gamma': 0.1,                  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
    'max_depth': 12,               # 构建树的深度，越大越容易过拟合
    'lambda': 2,                   # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
    'subsample': 0.7,              # 随机采样训练样本
    'colsample_bytree': 0.7,       # 生成树时进行的列采样
    'min_child_weight': 3,
    'silent': 1,                   # 设置成1则没有运行信息输出，最好是设置为0.
    'eta': 0.1,                  # 如同学习率
    'seed': 1000,
    'nthread': 8,                  # cpu 线程数
}

In [132]:
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=500,
    early_stopping_rounds=10,
    evals=[(dtest,'eval')]
)

Parameters: { "silent" } are not used.

[0]	eval-rmse:947.05415
[1]	eval-rmse:899.62740
[2]	eval-rmse:858.92524
[3]	eval-rmse:824.75130
[4]	eval-rmse:795.50083
[5]	eval-rmse:771.23117
[6]	eval-rmse:750.76635
[7]	eval-rmse:733.43299
[8]	eval-rmse:719.71734
[9]	eval-rmse:708.05304
[10]	eval-rmse:698.51835
[11]	eval-rmse:690.03607
[12]	eval-rmse:683.50554
[13]	eval-rmse:677.54501
[14]	eval-rmse:673.08173
[15]	eval-rmse:668.90543
[16]	eval-rmse:665.90788
[17]	eval-rmse:663.03284
[18]	eval-rmse:660.69239
[19]	eval-rmse:658.74315
[20]	eval-rmse:657.15114
[21]	eval-rmse:655.96747
[22]	eval-rmse:654.59440
[23]	eval-rmse:653.59669
[24]	eval-rmse:652.98516
[25]	eval-rmse:652.36481
[26]	eval-rmse:651.79847
[27]	eval-rmse:651.31515
[28]	eval-rmse:650.83162
[29]	eval-rmse:650.40849
[30]	eval-rmse:650.01207
[31]	eval-rmse:649.74520
[32]	eval-rmse:649.59912
[33]	eval-rmse:649.40930
[34]	eval-rmse:649.24375
[35]	eval-rmse:649.05389
[36]	eval-rmse:648.84439
[37]	eval-rmse:648.73033
[38]	eval-rmse:648.6

In [133]:
from sklearn.metrics import mean_squared_error
ypred = bst.predict(dtest)
mse_value = mean_squared_error(y_test, ypred)
print(f"RMSE = {np.sqrt(mse_value)}")

RMSE = 641.9264884546042


* Load test data

In [134]:
test_data = pd.read_csv("test_public_features_cat.csv")
# one_hot_output = one_hot.transform(test_data[['TAXI_ID_CAT','ORIGIN_STAND_CAT']].values)
# columns=[f"TAXI_{i}" for i in range(448)] + [f"STAND_{i}" for i in range(64)]

one_hot_output = one_hot.transform(X[['MON','DAY', 'HR', 'WK','TAXI_ID_CAT','ORIGIN_STAND_CAT','ORIGIN_CALL_CAT']].values)
one_hot_frame = pd.DataFrame(one_hot_output.toarray(),columns=columns)

test_data = test_data.join(one_hot_frame)

# test_data.drop(['TAXI_ID_CAT','ORIGIN_STAND_CAT','ORIGIN_CALL_CAT'],axis=1,inplace=True)
test_data.drop(['MON','DAY', 'HR', 'WK','TAXI_ID_CAT','ORIGIN_STAND_CAT','ORIGIN_CALL_CAT'],axis=1,inplace=True)

In [135]:
test_data = test_data.drop(['Unnamed: 0'],axis=1)
dtest_public = xgb.DMatrix(test_data)

# predict the results
ypred_result = bst.predict(dtest_public)

In [138]:
sample = pd.read_csv("sample_xgboost.csv",index_col="TRIP_ID")

sample['TRAVEL_TIME'] = ypred_result

In [139]:
sample.to_csv("sample_xgboost.csv")