In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("modeling_data.csv")

In [6]:
df

Unnamed: 0.1,Unnamed: 0,fare_amount,passenger_count,log_date,log_time,log_hour,log_min,log_year,log_month,log_day,7to15,16to19,20to6,dayofweek,weekday,is_extracharge,taxi_amount,dist
0,0,4.5,1,2009-06-15,17:26:21,17,26,2009,6,15,0,1,0,1,1,1,1,412.061196
1,1,16.9,1,2010-01-05,16:52:16,16,52,2010,1,5,0,1,0,2,1,1,1,4646.229208
2,2,5.7,2,2011-08-18,00:35:00,0,35,2011,8,18,0,0,1,4,1,1,1,1004.797031
3,3,7.7,1,2012-04-21,04:30:42,4,30,2012,4,21,0,0,1,6,0,1,1,914.153776
4,4,5.3,1,2010-03-09,07:51:00,7,51,2010,3,9,1,0,0,2,1,0,1,1366.163592
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,999995,5.7,5,2011-01-14,13:55:00,13,55,2011,1,14,1,0,0,5,1,0,2,870.509800
999996,999996,10.5,1,2012-07-20,12:59:16,12,59,2012,7,20,1,0,0,5,1,0,1,2277.239678
999997,999997,6.5,1,2013-07-01,09:10:00,9,10,2013,7,1,1,0,0,1,1,0,1,628.284955
999998,999998,17.5,3,2013-09-24,23:28:00,23,28,2013,9,24,0,0,1,2,1,1,1,2803.616297


In [3]:
df.drop(['Unnamed: 0', 'log_date','log_time','log_hour','log_min','log_year','log_month','log_day'], axis=1, inplace=True)

In [4]:
df

Unnamed: 0,fare_amount,passenger_count,7to15,16to19,20to6,dayofweek,weekday,is_extracharge,taxi_amount,dist
0,4.5,1,0,1,0,1,1,1,1,412.061196
1,16.9,1,0,1,0,2,1,1,1,4646.229208
2,5.7,2,0,0,1,4,1,1,1,1004.797031
3,7.7,1,0,0,1,6,0,1,1,914.153776
4,5.3,1,1,0,0,2,1,0,1,1366.163592
...,...,...,...,...,...,...,...,...,...,...
999995,5.7,5,1,0,0,5,1,0,2,870.509800
999996,10.5,1,1,0,0,5,1,0,1,2277.239678
999997,6.5,1,1,0,0,1,1,0,1,628.284955
999998,17.5,3,0,0,1,2,1,1,1,2803.616297


In [5]:
X = df[['passenger_count','7to15','16to19','20to6','dayofweek','weekday','is_extracharge','taxi_amount','dist']]
y = df['fare_amount']

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=34)

## RandomForest

#### 하이퍼파라미터 튜닝 없는 기본 모델

In [11]:
from sklearn.ensemble import RandomForestRegressor

In [26]:
# 모델 생성
model = RandomForestRegressor()
# 모델 학습
model.fit(X_train, y_train)

RandomForestRegressor()

In [27]:
y_pred = model.predict(X_valid)

In [28]:
# RMSE 지표로 성능 평가
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_valid, y_pred)
rmse = np.sqrt(mse)

In [29]:
# 검증 세트 RMSE
rmse

5.9799674428630585

#### RandomizedSearhCV 통해 하이퍼파라미터 튜닝

In [30]:
from sklearn.model_selection import RandomizedSearchCV

rf_param = {"n_estimators": list(range(10, 100, 10)),
            "max_depth": [3,5,10,15],
            "max_features": [3, 5, 7, 9],
            "min_samples_split": [3, 5, 7, 9]}

rf_model = RandomForestRegressor()

rf_random_search = RandomizedSearchCV(estimator = rf_model,
                                     param_distributions = rf_param,
                                     scoring='neg_mean_squared_error',
                                     n_jobs=-1,
                                     verbose=1,
                                     cv=5,
                                     random_state=34)

In [31]:
rf_random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
                   param_distributions={'max_depth': [3, 5, 10, 15],
                                        'max_features': [3, 5, 7, 9],
                                        'min_samples_split': [3, 5, 7, 9],
                                        'n_estimators': [10, 20, 30, 40, 50, 60,
                                                         70, 80, 90]},
                   random_state=34, scoring='neg_mean_squared_error',
                   verbose=1)

In [33]:
# 최적 하이퍼파라미터
rf_random_search.best_params_

{'n_estimators': 30,
 'min_samples_split': 5,
 'max_features': 7,
 'max_depth': 10}

In [34]:
train_pred = rf_random_search.predict(X_train)
valid_pred  = rf_random_search.predict(X_valid)

In [36]:
# RMSE 지표로 성능 평가
print("RandomForest 훈련 세트 RMSE: {:.3f}".format(np.sqrt(mean_squared_error(y_train, train_pred))))
print("RandomForest 검증 세트 RMSE: {:.3f}".format(np.sqrt(mean_squared_error(y_valid, valid_pred))))

RandomForest 훈련 세트 RMSE: 5.289
RandomForest 검증 세트 RMSE: 5.390


## XGBoost

In [38]:
import platform
platform.architecture()

('64bit', 'WindowsPE')

In [39]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-1.5.2-py3-none-win_amd64.whl (106.6 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.5.2


In [15]:
import xgboost as xgb

In [16]:
from xgboost import XGBRegressor

In [44]:
xgb_model = XGBRegressor(objective='reg:squarederror', #오차 제곱
                         n_estimators=500,
                         max_depth=5,
                         learning_rate=0.1,
                         n_jobs = -1,
                         random_state=34)
xgb_model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=-1,
             num_parallel_tree=1, predictor='auto', random_state=34,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [45]:
train_pred = xgb_model.predict(X_train)
valid_pred = xgb_model.predict(X_valid)

In [46]:
print("XGBoost 훈련 세트 정확도: {:.3f}".format(np.sqrt(mean_squared_error(train_pred, y_train))))
print("XGBoost 테스트 세트 정확도: {:.3f}".format(np.sqrt(mean_squared_error(valid_pred, y_valid))))

XGBoost 훈련 세트 정확도: 5.129
XGBoost 테스트 세트 정확도: 5.440


파라미터 튜닝 가이드
https://psystat.tistory.com/131

In [19]:
from sklearn.model_selection import RandomizedSearchCV

xgb_model_ = xgb.XGBRegressor()

xgb_param_grid = {'max_depth': range(3,13), 
                  'subsample': np.linspace(0.4, 1, 7),
                  'min_child_weight':range(1,6,2),
                  'learning_rate': [0.01, 0.1, 0.2, 0.3],
                  'n_estimators' : [100, 200, 300, 400, 500],
                  'gamma': np.linspace(0, 0.5, 6),
                  'colsample_bytree': np.linspace(0.5, 1, 6)
                  }

xgb_random = RandomizedSearchCV(estimator = xgb_model_,
                                param_distributions = xgb_param_grid,
                                scoring='neg_mean_squared_error', 
                                n_jobs=-1, 
                                cv = 5)

In [20]:
xgb_random.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          enable_categorical=False, gamma=None,
                                          gpu_id=None, importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=...
                                          validate_parameters=None,
                                          verbosity=None),
                   n_jobs=-1,
                   param_distribut

In [21]:
train_pred = xgb_random.predict(X_train)
valid_pred = xgb_random.predict(X_valid)

In [22]:
from sklearn.metrics import mean_squared_error

In [23]:
print("XGBoost 훈련 세트 정확도: {:.3f}".format(np.sqrt(mean_squared_error(train_pred, y_train))))
print("XGBoost 테스트 세트 정확도: {:.3f}".format(np.sqrt(mean_squared_error(valid_pred, y_valid))))

XGBoost 훈련 세트 정확도: 5.404
XGBoost 테스트 세트 정확도: 5.381


In [24]:
xgb_random.best_params_

{'subsample': 1.0,
 'n_estimators': 500,
 'min_child_weight': 3,
 'max_depth': 5,
 'learning_rate': 0.01,
 'gamma': 0.0,
 'colsample_bytree': 1.0}

아래는 궁금해서 해 본 것 ..

In [28]:
xgb_best = xgb.XGBRegressor(n_estimators = 500, 
                            max_depth = 5,
                            min_child_weight = 3,
                            subsample = 1,
                            learning_rate = 0.01,
                            gamma = 0,
                            colsample_bytree = 1,
                            n_jobs = -1)

In [29]:
xgb_best.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=5, min_child_weight=3, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=-1,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [30]:
best_train_pred = xgb_best.predict(X_train)
best_valid_pred = xgb_best.predict(X_valid)

In [31]:
print("XGBoost 훈련 세트 정확도: {:.3f}".format(np.sqrt(mean_squared_error(best_train_pred, y_train))))
print("XGBoost 테스트 세트 정확도: {:.3f}".format(np.sqrt(mean_squared_error(best_valid_pred, y_valid))))

XGBoost 훈련 세트 정확도: 5.404
XGBoost 테스트 세트 정확도: 5.381
