In [1]:
# Mount Google Drive at /content/drive so that later cells can read the data
# files they reference under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
pip install optuna



In [3]:
import os
import pandas as pd
import numpy as np
import xlrd

from sklearn.metrics import mean_squared_error
from sklearn import ensemble

import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

#### import files

In [4]:
# Load the base ETA table from the Excel file on the mounted Drive.
tada_eta_path = '/content/drive/MyDrive/Colab Notebooks/data/tada_eta.xlsx'
tada_eta = pd.read_excel(tada_eta_path)

In [5]:
# Additionally uses the "Seoul vehicle counts by district (gu)" dataset.
wb = xlrd.open_workbook('/content/drive/MyDrive/Colab Notebooks/data/서울시 자동차등록 (월별:구별) 통계_1721.xls')
ws = wb.sheet_by_index(0)

# Rows 3..27 of the first sheet: column 1 becomes the key, column 2 the value.
# Presumably district name -> registered vehicle count; verify against the workbook.
dic_vehs = {ws.cell_value(row, 1): ws.cell_value(row, 2) for row in range(3, 28)}

In [6]:
# Additionally uses the "Seoul vehicle travel speed by district (gu)" dataset.
wb = xlrd.open_workbook('/content/drive/MyDrive/Colab Notebooks/data/서울시 차량통행속도(구:월별)_1721.xls')
ws = wb.sheet_by_index(0)

# Rows 4..28 of the first sheet: column 2 becomes the key, column 3 the value.
# Presumably district name -> travel speed; verify against the workbook.
dic_g_spd = {ws.cell_value(row, 2): ws.cell_value(row, 3) for row in range(4, 29)}

### preprocessing

In [7]:
# Attach the per-district travel speed and vehicle count to every row of the
# base table, looked up by the row's pickup district. A district missing from
# either dictionary raises KeyError.
pickup_gu = tada_eta['pickup_gu']
tada_eta['spd_gu'] = pickup_gu.apply(lambda gu: dic_g_spd[gu])
tada_eta['veh_gu'] = pickup_gu.apply(lambda gu: dic_vehs[gu])

In [8]:
# Standardise the per-district vehicle count, then stretch it by a factor of 10.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Double-bracket selection yields the (n_rows, 1) array that the scaler expects.
veh_gu_column = tada_eta[['veh_gu']].to_numpy()
tada_eta['veh_gu'] = scaler.fit_transform(veh_gu_column) * 10

In [9]:
# Squared coordinate offset between pickup point and driver, scaled by 1e5.
# No square root is taken, so this is a squared distance in coordinate units.
lat_gap = tada_eta['pickup_lat'] - tada_eta['driver_lat']
lng_gap = tada_eta['pickup_lng'] - tada_eta['driver_lng']
tada_eta['distance'] = (lat_gap**2 + lng_gap**2)*1e5

# Drop identifiers, the timestamp, raw coordinates and the district name; what
# remains is the target (ATA) plus the model features.
unused_columns = ['id','driver_id','created_at_kst','pickup_lng','pickup_lat','driver_lng','driver_lat','pickup_gu']
tada_eta2 = tada_eta.drop(columns=unused_columns)

# Shuffle the rows with a fixed seed and renumber the index from zero.
tada_eta2 = tada_eta2.sample(frac=1, random_state=0).reset_index(drop=True)
tada_eta2.head()

Unnamed: 0,ATA,api_eta,month,hour,spd_gu,veh_gu,distance
0,6.13,8.32,12,22,23.7,-5.280082,44.521977
1,5.37,3.05,12,11,22.9,13.007287,3.989066
2,6.27,5.0,7,19,22.6,-15.38016,4.814699
3,10.52,8.47,12,21,26.1,-11.279773,77.341173
4,12.45,6.65,7,12,23.7,11.821118,14.888851


In [11]:
# The first `lim` rows of the shuffled table form the training split; every
# remaining row is held out for evaluation.
lim = 12000
train = tada_eta2[:lim]
test = tada_eta2[lim:]

# `columns=` replaces the positional axis argument (`drop('ATA', 1)`), which
# pandas deprecated in 1.3 and no longer accepts in 2.x.
x_train = np.asarray(train.drop(columns='ATA'))
y_train = np.asarray(train['ATA'])
x_test = np.asarray(test.drop(columns='ATA'))
y_test = np.asarray(test['ATA'])

  """
  import sys


## OPTUNA


In [12]:
# Optuna objective: search space and scoring for HistGradientBoostingRegressor.
def objectiveHGB(trial: Trial, x_train, y_train, x_test, y_test):
    """Fit one HistGradientBoostingRegressor for a trial and return its MSE.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample `max_iter`, `max_leaf_nodes`, `min_samples_leaf`
        and `learning_rate`.
    x_train, y_train
        Features and targets the model is fitted on.
    x_test, y_test
        Features and targets the fitted model is scored on.

    Returns
    -------
    float
        Mean squared error of the predictions for `x_test` against `y_test`
        (lower is better).
    """
    # Sampled hyperparameters; random_state is pinned so that a given set of
    # sampled values always produces the same model.
    param = {
        'max_iter' : trial.suggest_int('max_iter', 100, 10000),
        'max_leaf_nodes' : trial.suggest_int('max_leaf_nodes', 10, 10000),
        'min_samples_leaf' : trial.suggest_int('min_samples_leaf', 10, 5000),
        'learning_rate' : trial.suggest_float('learning_rate', 0.01, 1),
        'random_state' : 0
    }

    # Build and train the model.
    model = ensemble.HistGradientBoostingRegressor(**param)
    hgb_model = model.fit(x_train, y_train)

    # Score the model. Arguments follow the documented (y_true, y_pred) order;
    # MSE is symmetric, so the value is the same as with the order swapped.
    score = mean_squared_error(y_test, hgb_model.predict(x_test))

    return score

In [13]:
# Goal: minimise the MSE returned by objectiveHGB.
# TPESampler: sampler using the TPE (Tree-structured Parzen Estimator) algorithm.
# A fixed seed makes the sequence of suggested hyperparameters repeatable.
study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=0))

# Run 1000 trials.
# NOTE(review): every trial is scored on (x_test, y_test), so the best value
# printed below was selected on the evaluation split itself.
study.optimize(lambda trial : objectiveHGB(trial, x_train, y_train, x_test, y_test), n_trials = 1000)

print('Best trial : score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))

[32m[I 2022-05-22 13:43:08,439][0m A new study created in memory with name: no-name-9566a947-c05d-479f-8a4e-d3bc26905ec1[0m
[32m[I 2022-05-22 13:43:08,702][0m Trial 0 finished with value: 10.248343521578038 and parameters: {'max_iter': 6560, 'max_leaf_nodes': 5421, 'min_samples_leaf': 4082, 'learning_rate': 0.9315544466879577}. Best is trial 0 with value: 10.248343521578038.[0m
[32m[I 2022-05-22 13:43:09,071][0m Trial 1 finished with value: 10.147771858459738 and parameters: {'max_iter': 447, 'max_leaf_nodes': 955, 'min_samples_leaf': 3932, 'learning_rate': 0.8741149892794607}. Best is trial 1 with value: 10.147771858459738.[0m
[32m[I 2022-05-22 13:43:09,154][0m Trial 2 finished with value: 8.456864764153039 and parameters: {'max_iter': 3315, 'max_leaf_nodes': 9111, 'min_samples_leaf': 1153, 'learning_rate': 0.9340780463743682}. Best is trial 2 with value: 8.456864764153039.[0m
[32m[I 2022-05-22 13:43:09,286][0m Trial 3 finished with value: 8.549413046902071 and parameter

Best trial : score 8.180573865465725, 
params {'max_iter': 2147, 'max_leaf_nodes': 5513, 'min_samples_leaf': 342, 'learning_rate': 0.15698245423657223}


---

In [15]:
# Refit the best hyperparameters several times and keep the refit with the
# lowest MSE on (x_test, y_test).
#
# study.best_trial.params holds only the sampled hyperparameters, not the
# random_state used inside the objective, so each refit is unseeded.
# NOTE(review): keeping the best of several refits by test-set MSE is itself
# selection on the evaluation split; treat the printed number as optimistic.
TARGET_MSE = 8.19   # stop early once a refit beats this value
MAX_REFITS = 200    # hard cap, so the search always terminates

best_params = study.best_trial.params
# Early stopping is only attempted when the study itself reached the target;
# otherwise all MAX_REFITS attempts are made.
stop_at_target = study.best_trial.value <= TARGET_MSE

# Start from +inf so that the first refit always populates best_model.
best_mse = float('inf')
best_model = None

for _ in range(MAX_REFITS):
    reg = ensemble.HistGradientBoostingRegressor(**best_params)
    reg.fit(x_train, y_train)
    mse = mean_squared_error(y_test, reg.predict(x_test))

    if mse < best_mse:
        best_mse = mse
        best_model = reg

    if stop_at_target and best_mse < TARGET_MSE:
        break

print("the mean squared error (MSE) on test set: {:.4f}".format(best_mse))
print("best_model: ", best_model)

the mean squared error (MSE) on test set: 8.1865
best_model:  HistGradientBoostingRegressor(learning_rate=0.15698245423657223, max_iter=2147,
                              max_leaf_nodes=5513, min_samples_leaf=342)
