# 장애인 콜택시 대기시간 예측 프로젝트 데이터 처리
## 목표
- 기상 예보를 바탕으로, 장애인 콜택시 평균 대기 시간을 예측하기.
- 장애인 이동권 개선을 위하여서, 콜택시 대기시간을 예측하는 프로젝트이다.  
- 이를 통하여 장애인 콜택시를 이용하는 고객들의 불편사항을 개선하고 서비스의 품질을 높여 교통약자의 이동 편의 증진에 기여한다.
### 목차
1. 데이터 처리  
2. 데이터 분석
3. **머신러닝 모델링**   
    - (나머지 3달 데이터는 테스트, 나머지는 학습데이터로 활용)  
    - MAE, MAPE를 사용해 성능 높은 모델 선정  
- 데이터 출처 : [서울시설공단_장애인콜택시 일별 이용현황](https://www.data.go.kr/data/15057705/openapi.do)  

## 라이브러리 불러오기

In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# 회귀 모델
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# 회귀 성능 튜닝
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# 회귀 평가
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_absolute_percentage_error as mape

# 딥러닝
import tensorflow as tf

import warnings
warnings.filterwarnings(action='ignore') # pd 경고메시지 생략

## 데이터 불러오기

In [18]:
# df = pd.read_csv('./data/df.csv')
df = joblib.load('./data/df.pkl')
df.head()

Unnamed: 0_level_0,car_cnt,request_cnt,ride_cnt,waiting_time,fare,distance,temp_max,temp_min,rain(mm),humidity_max(%),humidity_min(%),sunshine(MJ/m2),year,month,week,weekday,holiday,7days_waiting_time
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2015-01-01,213,1023,924,23.2,2427,10764,-2.0,-8.9,0.0,63.0,28.0,9.07,2015,1,1,3,1.0,
2015-01-02,420,3158,2839,17.2,2216,8611,2.4,-9.2,0.0,73.0,37.0,8.66,2015,1,1,4,0.0,
2015-01-03,209,1648,1514,26.2,2377,10198,8.2,0.2,0.0,89.0,58.0,5.32,2015,1,1,5,1.0,
2015-01-04,196,1646,1526,24.5,2431,10955,7.9,-0.9,0.0,95.0,52.0,6.48,2015,1,1,6,1.0,
2015-01-05,421,4250,3730,26.2,2214,8663,4.1,-7.4,3.4,98.0,29.0,10.47,2015,1,2,0,0.0,


#### NaN 값 처리, 타입 변경하기

In [19]:
drop_col = '7days_waiting_time'
df = df.drop(drop_col, axis=1)

In [20]:
df['weekday'] = df['weekday'].astype('int') # 해당 코드 진행시 변경이 안됌

In [21]:
df = df.astype('float')

In [22]:
df

Unnamed: 0_level_0,car_cnt,request_cnt,ride_cnt,waiting_time,fare,distance,temp_max,temp_min,rain(mm),humidity_max(%),humidity_min(%),sunshine(MJ/m2),year,month,week,weekday,holiday
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2015-01-01,213.0,1023.0,924.0,23.2,2427.0,10764.0,-2.0,-8.9,0.0,63.0,28.0,9.07,2015.0,1.0,1.0,3.0,1.0
2015-01-02,420.0,3158.0,2839.0,17.2,2216.0,8611.0,2.4,-9.2,0.0,73.0,37.0,8.66,2015.0,1.0,1.0,4.0,0.0
2015-01-03,209.0,1648.0,1514.0,26.2,2377.0,10198.0,8.2,0.2,0.0,89.0,58.0,5.32,2015.0,1.0,1.0,5.0,1.0
2015-01-04,196.0,1646.0,1526.0,24.5,2431.0,10955.0,7.9,-0.9,0.0,95.0,52.0,6.48,2015.0,1.0,1.0,6.0,1.0
2015-01-05,421.0,4250.0,3730.0,26.2,2214.0,8663.0,4.1,-7.4,3.4,98.0,29.0,10.47,2015.0,1.0,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-27,669.0,5635.0,4654.0,44.4,2198.0,8178.0,-0.3,-5.4,0.1,92.0,40.0,10.86,2022.0,12.0,52.0,1.0,0.0
2022-12-28,607.0,5654.0,4648.0,44.8,2161.0,7882.0,1.7,-7.8,0.0,71.0,34.0,10.88,2022.0,12.0,52.0,2.0,0.0
2022-12-29,581.0,5250.0,4247.0,52.5,2229.0,8433.0,2.1,-4.0,0.0,87.0,38.0,10.84,2022.0,12.0,52.0,3.0,0.0
2022-12-30,600.0,5293.0,4200.0,38.3,2183.0,8155.0,-4.4,-4.4,0.0,66.0,66.0,0.00,2022.0,12.0,52.0,4.0,0.0


# 데이터 나누기
* 시계열 데이터 특성에 맞게 분할  
* 마지막 91일(3개월) 데이터를 검증셋으로 사용

#### x,y 분리하기

In [23]:
target = 'waiting_time'

x = df.drop(target, axis=1)
y = df.loc[:,target]

In [24]:
x.head(2)

Unnamed: 0_level_0,car_cnt,request_cnt,ride_cnt,fare,distance,temp_max,temp_min,rain(mm),humidity_max(%),humidity_min(%),sunshine(MJ/m2),year,month,week,weekday,holiday
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2015-01-01,213.0,1023.0,924.0,2427.0,10764.0,-2.0,-8.9,0.0,63.0,28.0,9.07,2015.0,1.0,1.0,3.0,1.0
2015-01-02,420.0,3158.0,2839.0,2216.0,8611.0,2.4,-9.2,0.0,73.0,37.0,8.66,2015.0,1.0,1.0,4.0,0.0


In [25]:
y.head(2)

time
2015-01-01    23.2
2015-01-02    17.2
Name: waiting_time, dtype: float64

#### train, validation 나누기

In [26]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size= 92, shuffle=False)

#### 정규화
- knn 사용을 위해서

In [27]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_s = scaler.transform(x_train)
x_test_s = scaler.transform(x_train)

# 데이터 모델링

- MAE, MAPE로 성능 측정
- MAE와 MAPE는 모두 값이 작을수록 모델의 성능이 좋다는 것을 의미한다.

## ML

#### ML모델 선언

In [28]:
LR_model = LinearRegression()
KN_model = KNeighborsRegressor()
Tree_model = DecisionTreeRegressor()
RanF_model = RandomForestRegressor()
xg_model = XGBRegressor()
lg_model = LGBMRegressor()

models = {'LR_model':LR_model,
        'KN_model':KN_model,
        'Tree_model':Tree_model,
        'RanF_model':RanF_model,
        'xg_model':xg_model,
        'lg_model':lg_model}

#### 모델 학습 및 평가

In [29]:
results = []
for name, model in models.items():
    if model != 'KN_model':
        model.fit(x_train, y_train) # 모델 학습
        y_pred = model.predict(x_test) # 모델 예측
        model_mae = mae(y_test, y_pred)
        model_mape = mape(y_test, y_pred)
    else:
        model.fit(x_train_s, y_train)
        y_pred = model.predict(x_test_s)
        model_mae = mae(y_test, y_pred)
        model_mape = mape(y_test, y_pred)
    results.append([name, (model_mae, model_mape)])

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2395
[LightGBM] [Info] Number of data points in the train set: 2831, number of used features: 16
[LightGBM] [Info] Start training from score 40.192794


In [30]:
sorted(results, key=lambda x: x[1][1])

[['lg_model', (3.98069572960794, 0.0922415968802517)],
 ['KN_model', (3.8902173913043474, 0.09224329725907993)],
 ['xg_model', (3.978145848149839, 0.09523714185397465)],
 ['RanF_model', (4.152217391304349, 0.09865747620382924)],
 ['LR_model', (4.610387378195424, 0.1162958967265707)],
 ['Tree_model', (4.941304347826087, 0.11817417069744049)]]

#### 모델 튜닝
- 해당부분, 공유를 통하여 문제 해결하면 좋을것 같다는 생각함

##### KNN 모델

In [31]:
# 파라미터 선언
param = {'n_neighbors': range(1, 500, 10),
        'metric': ['euclidean', 'manhattan']}

# Random Search 선언
model = GridSearchCV(KN_model,
                            param,
                            cv=3)
# 학습하기
model.fit(x_train_s, y_train)

# 중요 정보 확인
print('=' * 80)
print(model.cv_results_['mean_test_score'])
print('-' * 80)
print('최적파라미터:', model.best_params_)
print('-' * 80)
print('최고성능:', model.best_score_)
print('=' * 80)

# 예측하기
y_pred = model.predict(x_test_s)

[-3.14988452 -3.18536883 -3.46703641 -3.61230397 -3.70458865 -3.74456733
 -3.7486516  -3.723864   -3.67738877 -3.62084598 -3.55800847 -3.49590346
 -3.43069401 -3.36634818 -3.30057635 -3.24235023 -3.18123512 -3.12149263
 -3.06064392 -3.00397011 -2.94840821 -2.89441697 -2.84148898 -2.79138789
 -2.74537039 -2.69966628 -2.65764882 -2.61821801 -2.5790347  -2.54110399
 -2.50606638 -2.47292527 -2.44042741 -2.41058186 -2.38279703 -2.35482432
 -2.32820748 -2.30489614 -2.28029142 -2.2557219  -2.23269278 -2.21197734
 -2.19245371 -2.17183756 -2.15343715 -2.13543591 -2.11817711 -2.10290511
 -2.08770339 -2.07342732 -3.55904648 -3.140117   -3.12031796 -3.04132157
 -2.96787061 -2.9018036  -2.81843166 -2.74037966 -2.66247947 -2.59810798
 -2.53725702 -2.47534893 -2.42215312 -2.37245626 -2.32669448 -2.28460295
 -2.24631741 -2.21448749 -2.18136489 -2.15284148 -2.1262737  -2.10388417
 -2.0814092  -2.06271093 -2.04392406 -2.02824833 -2.01437812 -2.00164559
 -1.98920232 -1.97722125 -1.96871448 -1.96062007 -1

##### RandomForest 모델

In [32]:
# 파라미터 선언
# max_depth: 1~50
param = {'max_depth': range(1,51)}

# Random Search 선언
  # cv=5
  # n_iter=20
  # scoring='r2'
model = RandomizedSearchCV(RanF_model, # 기본 모델
                          param,       # 파라미터 범위
                          cv=5,        # K-Folde 개수
                          n_iter=20   # 랜덤하게 선택할 파라미터(조합) 개수
                          )
# 학습하기
model.fit(x_train, y_train)

# 중요 정보 확인
print('=' * 80)
print(model.cv_results_['mean_test_score'])
print('-' * 80)
print('최적파라미터:', model.best_params_)
print('-' * 80)
print('최고성능:', model.best_score_)
print('=' * 80)

# 예측하기
y_pred = model.predict(x_test)

print('MAE:', mae(y_test, y_pred))
print('MAPE:', mape(y_test, y_pred))

[-0.4691863   0.0032705   0.01511978 -0.00157816 -0.16025958 -0.00872906
 -0.03492186 -0.02095703 -0.0292011   0.00710647 -0.22024641 -0.58937563
 -0.07902146  0.01738483 -0.04204579 -0.00645058  0.02424826 -0.00694008
  0.0203747  -0.02632634]
--------------------------------------------------------------------------------
최적파라미터: {'max_depth': 20}
--------------------------------------------------------------------------------
최고성능: 0.024248257848911492
MAE: 4.1909332298136635
MAPE: 0.09878810207736842


##### Decision Tree 모델

In [33]:
# 파라미터 선언
# max_depth: 1~50
param = {'max_depth': range(1,51)}

model = RandomizedSearchCV(Tree_model, # 기본 모델
                          param,       # 파라미터 범위
                          cv=5,        # K-Folde 개수
                          n_iter=20   # 랜덤하게 선택할 파라미터(조합) 개수
                          )
# 학습하기
model.fit(x_train, y_train)

# 중요 정보 확인
print('=' * 80)
print(model.cv_results_['mean_test_score'])
print('-' * 80)
print('최적파라미터:', model.best_params_)
print('-' * 80)
print('최고성능:', model.best_score_)
print('=' * 80)

# 예측하기
y_pred = model.predict(x_test)

print('MAE:', mae(y_test, y_pred))
print('MAPE:', mape(y_test, y_pred))

[-0.27061308 -0.2634182  -0.33515287 -0.22181397 -0.29969889 -0.23467659
 -0.33873064 -0.32000635 -0.26502998 -0.31323674 -0.28757112 -0.2846583
 -0.27550589 -0.29132239 -0.30872444 -0.25487814 -0.2971447  -0.19488455
 -0.27419698 -0.32548551]
--------------------------------------------------------------------------------
최적파라미터: {'max_depth': 11}
--------------------------------------------------------------------------------
최고성능: -0.1948845537557967
MAE: 4.854523381118124
MAPE: 0.11605782108550565


## DeapLearning

#### 모델 구조 생성