In [1]:
import pandas as pd
import numpy as np
import scipy.stats as spst

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

from sklearn.model_selection import GridSearchCV

In [2]:
path = 'final_raw_data.csv'
data = pd.read_csv(path)

In [3]:
data.head()

Unnamed: 0,집계시,출발영업소코드,도착영업소코드,통행시간,요일,arrive_traffic,rain-1
0,15,101,140,24238,4,125,0
1,8,101,140,22762,4,125,0
2,7,101,140,23867,4,125,0
3,18,101,140,23196,4,125,0
4,7,101,140,23654,4,125,0


In [4]:
data.drop(columns=['출발영업소코드','도착영업소코드'],inplace=True)

In [5]:
data.head()

Unnamed: 0,집계시,통행시간,요일,arrive_traffic,rain-1
0,15,24238,4,125,0
1,8,22762,4,125,0
2,7,23867,4,125,0
3,18,23196,4,125,0
4,7,23654,4,125,0


In [6]:
data.columns=['time','timecost','weekday','arrive_traffic','rain-1']

In [7]:
data.head()

Unnamed: 0,time,timecost,weekday,arrive_traffic,rain-1
0,15,24238,4,125,0
1,8,22762,4,125,0
2,7,23867,4,125,0
3,18,23196,4,125,0
4,7,23654,4,125,0


In [8]:
data2 = pd.get_dummies(data,columns=['weekday','time'],drop_first=True)

In [9]:
target='timecost'

In [10]:
data2.head()

Unnamed: 0,timecost,arrive_traffic,rain-1,weekday_1,weekday_2,weekday_3,weekday_4,weekday_6,time_1,time_2,...,time_14,time_15,time_16,time_17,time_18,time_19,time_20,time_21,time_22,time_23
0,24238,125,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,22762,125,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,23867,125,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,23196,125,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,23654,125,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
for col in data.columns:
    print(col,spst.ttest_ind(data[target],data[col]))

time Ttest_indResult(statistic=90.47379616717113, pvalue=0.0)
timecost Ttest_indResult(statistic=0.0, pvalue=1.0)
weekday Ttest_indResult(statistic=90.51134655235914, pvalue=0.0)
arrive_traffic Ttest_indResult(statistic=90.14408684465968, pvalue=0.0)
rain-1 Ttest_indResult(statistic=90.52245988649328, pvalue=0.0)


In [12]:
spst.pearsonr(data[target],data['arrive_traffic'])

(0.10701453950026897, 0.038066369093932816)

In [25]:
y= data2[target]
x= data2.drop(target,axis=1)

In [28]:
x_train, x_val, y_train, y_val = train_test_split(x,y, test_size = 3, shuffle = False)

In [30]:
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_val_s = scaler.fit_transform(x_val)

In [31]:
mR = RandomForestRegressor()
mS = SVR()
mX = XGBRegressor()

In [42]:
Rparams = {'max_depth': range(2,5)}
Sparams = {'kernel': ['rbf'], "C" : [.5,1,2,5],'gamma':[.5,.7,1.,1.5]}
Xparams = {'max_depth': range(2,4), 'learning_rate': [.01, .1, .2, .5]}

In [43]:
Rg = GridSearchCV(mR, Rparams, cv = 3 , verbose = 2)
Sg = GridSearchCV(mS, Sparams, cv = 3 , verbose = 2)
Xg = GridSearchCV(mX, Xparams, cv = 3 , verbose = 2)

In [62]:
Rg.fit(x_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] END ........................................max_depth=2; total time=   0.1s
[CV] END ........................................max_depth=2; total time=   0.0s
[CV] END ........................................max_depth=2; total time=   0.0s
[CV] END ........................................max_depth=3; total time=   0.0s
[CV] END ........................................max_depth=3; total time=   0.0s
[CV] END ........................................max_depth=3; total time=   0.1s
[CV] END ........................................max_depth=4; total time=   0.1s
[CV] END ........................................max_depth=4; total time=   0.1s
[CV] END ........................................max_depth=4; total time=   0.0s


GridSearchCV(cv=3, estimator=RandomForestRegressor(),
             param_grid={'max_depth': range(2, 5)}, verbose=2)

In [45]:
Sg.fit(x_train_s, y_train)


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END .......................C=0.5, gamma=0.5, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.5, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.5, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.7, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.7, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.7, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=1.0, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=1.0, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=1.0, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=1.5, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=1.5, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=1.5

GridSearchCV(cv=3, estimator=SVR(),
             param_grid={'C': [0.5, 1, 2, 5], 'gamma': [0.5, 0.7, 1.0, 1.5],
                         'kernel': ['rbf']},
             verbose=2)

In [63]:
Xg.fit(x_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END ....................learning_rate=0.01, max_depth=2; total time=   0.0s
[CV] END ....................learning_rate=0.01, max_depth=2; total time=   0.0s
[CV] END ....................learning_rate=0.01, max_depth=2; total time=   0.0s
[CV] END ....................learning_rate=0.01, max_depth=3; total time=   0.0s
[CV] END ....................learning_rate=0.01, max_depth=3; total time=   0.0s
[CV] END ....................learning_rate=0.01, max_depth=3; total time=   0.0s
[CV] END .....................learning_rate=0.1, max_depth=2; total time=   0.0s
[CV] END .....................learning_rate=0.1, max_depth=2; total time=   0.0s
[CV] END .....................learning_rate=0.1, max_depth=2; total time=   0.0s
[CV] END .....................learning_rate=0.1, max_depth=3; total time=   0.0s
[CV] END .....................learning_rate=0.1, max_depth=3; total time=   0.0s
[CV] END .....................learning_rate=0.1, 

GridSearchCV(cv=3,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    gamma=None, gpu_id=None, grow_policy=None,
                                    importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_bin=None,
                                    max_cat_to_onehot=None, max_delta_step=None,
                                    max_depth=None, max_leaves=None,
                                    min_child_weight=None, missing=nan,
                                    monotone_constraints=None, n_estimators=100,


In [64]:
Rg.best_params_

{'max_depth': 3}

In [65]:
Sg.best_params_

{'C': 5, 'gamma': 0.5, 'kernel': 'rbf'}

In [66]:
Xg.best_params_

{'learning_rate': 0.1, 'max_depth': 3}

In [70]:
modelR=RandomForestRegressor(max_depth= 3)
modelS=SVR(C=5, gamma= 0.5, kernel= 'rbf')
modelX=XGBRegressor(learning_rate= 0.1, max_depth= 3)

In [71]:
import joblib
joblib.dump(modelR,'chuRF.pkl')
joblib.dump(modelS,'chuSVM.pkl')
joblib.dump(modelX,'chuXGB.pkl')

['chuXGB.pkl']

In [67]:
pRg=Rg.predict(x_val)
pSg=Sg.predict(x_val_s)
pXg=Xg.predict(x_val)

In [68]:
r2_score(pRg,y_val),r2_score(pSg,y_val),r2_score(pXg,y_val)

(-1.1484134535094488, -94620.60747923933, 0.4103139781572551)

In [69]:
mean_absolute_percentage_error(pRg,y_val),mean_absolute_percentage_error(pSg,y_val),mean_absolute_percentage_error(pXg,y_val)

(0.0378984800891481, 0.20863611020784104, 0.059717012031425654)

In [86]:
test = x_train.iloc[-1]

In [89]:
x_train.iloc[-1]

arrive_traffic    20
rain-1             0
weekday_1          0
weekday_2          0
weekday_3          0
weekday_4          0
weekday_6          0
time_1             0
time_2             0
time_3             0
time_4             0
time_5             0
time_6             0
time_7             0
time_8             0
time_9             0
time_10            0
time_11            0
time_12            0
time_13            0
time_14            0
time_15            0
time_16            1
time_17            0
time_18            0
time_19            0
time_20            0
time_21            0
time_22            0
time_23            0
Name: 372, dtype: int64

In [88]:
test

arrive_traffic    20
rain-1             0
weekday_1          0
weekday_2          0
weekday_3          0
weekday_4          0
weekday_6          0
time_1             0
time_2             0
time_3             0
time_4             0
time_5             0
time_6             0
time_7             0
time_8             0
time_9             0
time_10            0
time_11            0
time_12            0
time_13            0
time_14            0
time_15            0
time_16            1
time_17            0
time_18            0
time_19            0
time_20            0
time_21            0
time_22            0
time_23            0
Name: 372, dtype: int64

In [90]:
test['time_16'] =0
test['time_12'] =1
test['weekday_4']=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['time_16'] =0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['time_12'] =1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['weekday_4']=1


In [60]:
scaledf = pd.DataFrame(x_train_s)

In [61]:
scaledf.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
count,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0
mean,0.284444,0.0,0.101877,0.067024,0.329759,0.214477,0.088472,0.037534,0.045576,0.045576,0.042895,0.042895,0.042895,0.045576,0.048257,0.048257,0.045576,0.040214,0.040214,0.040214,0.034853,0.034853,0.037534,0.037534,0.042895,0.045576,0.048257,0.048257,0.034853,0.032172
std,0.246971,0.0,0.302892,0.2504,0.470757,0.411011,0.284361,0.19032,0.208845,0.208845,0.202894,0.202894,0.202894,0.208845,0.214597,0.214597,0.208845,0.196726,0.196726,0.196726,0.183653,0.183653,0.19032,0.19032,0.202894,0.208845,0.214597,0.214597,0.183653,0.176693
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.186441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.211864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.245763,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [91]:
testdf = pd.DataFrame(test)

In [92]:
testdf=testdf.transpose()
testdf

Unnamed: 0,arrive_traffic,rain-1,weekday_1,weekday_2,weekday_3,weekday_4,weekday_6,time_1,time_2,time_3,time_4,time_5,time_6,time_7,time_8,time_9,time_10,time_11,time_12,time_13,time_14,time_15,time_16,time_17,time_18,time_19,time_20,time_21,time_22,time_23
372,20,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [95]:
txg=Xg.predict(testdf)

In [96]:
trg=Rg.predict(testdf)

In [97]:
txg,trg,(txg+trg)/2

(array([19091.094], dtype=float32),
 array([15690.49482534]),
 array([17390.79428767]))