# T-Money taxi 수요 LassoRegression 예측  
__2020.04.28__
__유휘근__  
  
LassoRegression을 이용하여 선릉역 반경 150m에서 2018.10 ~ 2019.03 까지의 시간대(1h단위) 택시 승차 데이터로 수요 예측하기  

1. 방법론 : Regression (회귀)
2. 방법론에 사용할 알고리즘 및 라이브러리 : LassoRegression(라쏘회귀) Python scikit-learn의 Lasso
3. 알고리즘에 사용되는 파라미터 : alpha (라쏘회귀의 L1 규제(Regularization) 계수)
4. 평가방법  
  - 4폴드 교차검증 (4-fold cross-validation)
  - MSE(Mean Squared Error) : 실제값과 예측값의 차이를 제곱해 평균한 것.
  - RMSE(Root Mean Squared Error) : MSE에 루트를 씌운 것.

## 모듈 import

In [43]:
import math
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import Lasso,Ridge,ElasticNet
from sklearn.model_selection import cross_val_score

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.font_manager as fm
from matplotlib import rc

# 시각화 그래프 '-' & 한글 깨짐 현상 처리
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from matplotlib import rc

import os

# Draw the minus sign as a plain hyphen so it is not rendered as a broken
# glyph once a Korean font is selected.
mpl.rcParams['axes.unicode_minus'] = False

# Malgun Gothic is addressed by its Windows install path. Only switch to it
# when that file actually exists, so the notebook still runs on machines
# without it (plots then keep matplotlib's default font, and Korean labels
# may not render correctly).
font_path = 'c:\\windows\\fonts\\malgun.ttf'
if os.path.isfile(font_path):
    font_name = fm.FontProperties(fname=font_path).get_name()
    rc('font', family=font_name)
else:
    print('Font file not found, keeping the default matplotlib font:', font_path)

## 데이터 준비

### 데이터 로드

In [2]:
# Load the Seolleung taxi dataset from CSV; the bare name on the last line
# makes the notebook display the resulting DataFrame.
taxi = pd.read_csv('data/T-Money/seolleung_dataset.csv')
taxi

Unnamed: 0,o_time,yesterday,today,tomorrow,temp,windspeed,humidity,cloud,precipitation,snowcover,visibility,user,count
0,2018-10-01 00:00:00,1,0,0,14.1,2.6,55,0.0,0.0,0.0,2000,1.0,52
1,2018-10-01 01:00:00,1,0,0,13.9,2.0,57,2.0,0.0,0.0,2000,1.0,17
2,2018-10-01 02:00:00,1,0,0,13.9,2.4,56,4.0,0.0,0.0,2000,1.0,10
3,2018-10-01 03:00:00,1,0,0,13.6,1.6,56,8.0,0.0,0.0,2000,1.0,13
4,2018-10-01 04:00:00,1,0,0,13.7,1.9,56,10.0,0.0,0.0,2000,1.0,12
5,2018-10-01 05:00:00,1,0,0,13.7,1.8,57,8.0,0.0,0.0,1999,1.0,8
6,2018-10-01 06:00:00,1,0,0,13.3,2.3,56,7.0,0.0,0.0,2000,1.0,14
7,2018-10-01 07:00:00,1,0,0,13.1,2.1,58,3.0,0.0,0.0,1997,1.0,31
8,2018-10-01 08:00:00,1,0,0,13.7,2.4,54,7.0,0.0,0.0,1960,1.0,148
9,2018-10-01 09:00:00,1,0,0,14.9,3.4,50,1.0,0.0,0.0,1989,1.0,130


### 년,월,일,시간 변수 추가

In [3]:
# Convert the o_time column from strings to datetimes so the date parts can
# be extracted. A single vectorized pd.to_datetime call replaces the per-row
# parsing that Series.apply(pd.to_datetime) performed.
taxi['o_time'] = pd.to_datetime(taxi['o_time'])

# Derive year, month, day and hour columns from the parsed timestamps.
taxi['year'] = taxi['o_time'].dt.year
taxi['month'] = taxi['o_time'].dt.month
taxi['day'] = taxi['o_time'].dt.day
taxi['hour'] = taxi['o_time'].dt.hour
taxi

Unnamed: 0,o_time,yesterday,today,tomorrow,temp,windspeed,humidity,cloud,precipitation,snowcover,visibility,user,count,year,month,day,hour
0,2018-10-01 00:00:00,1,0,0,14.1,2.6,55,0.0,0.0,0.0,2000,1.0,52,2018,10,1,0
1,2018-10-01 01:00:00,1,0,0,13.9,2.0,57,2.0,0.0,0.0,2000,1.0,17,2018,10,1,1
2,2018-10-01 02:00:00,1,0,0,13.9,2.4,56,4.0,0.0,0.0,2000,1.0,10,2018,10,1,2
3,2018-10-01 03:00:00,1,0,0,13.6,1.6,56,8.0,0.0,0.0,2000,1.0,13,2018,10,1,3
4,2018-10-01 04:00:00,1,0,0,13.7,1.9,56,10.0,0.0,0.0,2000,1.0,12,2018,10,1,4
5,2018-10-01 05:00:00,1,0,0,13.7,1.8,57,8.0,0.0,0.0,1999,1.0,8,2018,10,1,5
6,2018-10-01 06:00:00,1,0,0,13.3,2.3,56,7.0,0.0,0.0,2000,1.0,14,2018,10,1,6
7,2018-10-01 07:00:00,1,0,0,13.1,2.1,58,3.0,0.0,0.0,1997,1.0,31,2018,10,1,7
8,2018-10-01 08:00:00,1,0,0,13.7,2.4,54,7.0,0.0,0.0,1960,1.0,148,2018,10,1,8
9,2018-10-01 09:00:00,1,0,0,14.9,3.4,50,1.0,0.0,0.0,1989,1.0,130,2018,10,1,9


### o_time 컬럼 제거

In [4]:
# The date parts now live in their own columns, so remove the raw timestamp
# column from the frame in place.
taxi.drop(columns=['o_time'], inplace=True)

## 모델 생성 및 평가

### train, test DataSet 분리

In [5]:
# Separate the feature columns from the prediction target (the `count` column).
X_data = taxi.drop(columns=['count'])
y_target = taxi['count']

# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_target,
    test_size=0.3,
    random_state=156,
)

### 라쏘회귀 모델 4폴드 교차검증

In [16]:
# Lasso with regularization strength alpha=0.05, scored on the full data set
# with 4-fold cross-validation.
lasso = Lasso(alpha=0.05)
neg_mse_scores = cross_val_score(
    estimator=lasso,
    X=X_data,
    y=y_target,
    scoring='neg_mean_squared_error',
    cv=4,
)
# The scorer returns negated MSE values, so flip the sign before taking the
# square root to obtain the RMSE of each fold.
rmse_scores = np.sqrt(-neg_mse_scores)
avg_rmse = rmse_scores.mean()

### 평가

In [17]:
# Report the per-fold scores rounded to three decimals, followed by the mean
# RMSE over the four folds.
neg_mse_rounded = np.round(neg_mse_scores, 3)
rmse_rounded = np.round(rmse_scores, 3)
print('4 folds의 개별 Negative MSE scores : ', neg_mse_rounded)
print('4 folds의 개별 RMSE scores : ', rmse_rounded)
print('4 folds의 평균 RMSE : {0:.3f} '.format(avg_rmse))

4 folds의 개별 Negative MSE scores :  [-1556.031 -1608.149 -1570.547 -1562.39 ]
4 folds의 개별 RMSE scores :  [39.447 40.102 39.63  39.527]
4 folds의 평균 RMSE : 39.676 


### 자동화 함수 생성

In [46]:
def get_linear_reg_eval(model_name, params=None, X_data_n=None, y_target_n=None, verbose=True):
    """Evaluate a regularized linear model over several alpha values.

    For every alpha in ``params`` the mean RMSE of a 4-fold cross-validation
    is printed, then the model is refit on all of ``X_data_n`` /
    ``y_target_n`` to collect its regression coefficients.

    Args:
        model_name: 'Ridge', 'Lasso' or 'ElasticNet'. ElasticNet is built
            with ``l1_ratio=0.9``.
        params: Iterable of alpha values to evaluate.
        X_data_n: Feature data used for both cross-validation and the refit.
        y_target_n: Target values matching ``X_data_n``.
        verbose: When true, print a header line naming the model. The
            per-alpha RMSE lines are printed regardless of this flag.

    Returns:
        A DataFrame with one ``'alpha:<value>'`` column per evaluated alpha
        holding the fitted coefficients. Rows are labelled with
        ``X_data_n.columns`` when that attribute exists, otherwise with a
        default integer index.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Factories (rather than instances) so each alpha gets a fresh estimator.
    model_factories = {
        'Ridge': lambda alpha: Ridge(alpha=alpha),
        'Lasso': lambda alpha: Lasso(alpha=alpha),
        'ElasticNet': lambda alpha: ElasticNet(alpha=alpha, l1_ratio=0.9),
    }
    if model_name not in model_factories:
        raise ValueError(
            "model_name must be one of {0}, got {1!r}".format(
                sorted(model_factories), model_name
            )
        )
    make_model = model_factories[model_name]

    coeff_df = pd.DataFrame()
    if verbose:
        print('####### ', model_name, ' #######')

    # Label coefficients with the feature names of the data that was passed
    # in, not with a notebook-level global.
    feature_names = getattr(X_data_n, 'columns', None)

    for param in params:
        model = make_model(param)
        neg_mse_scores = cross_val_score(model, X_data_n, y_target_n,
                                         scoring='neg_mean_squared_error', cv=4)
        avg_rmse = np.mean(np.sqrt(-1 * neg_mse_scores))
        print('alpha {0}일 때 4폴드 세트의 평균 RMSE : {1:.3f}'.format(param, avg_rmse))
        # cross_val_score only returns the evaluation metric, so refit the
        # model on the supplied data to read its coefficients.
        model.fit(X_data_n, y_target_n)
        # One column of per-feature coefficients for this alpha.
        colname = 'alpha:' + str(param)
        coeff_df[colname] = pd.Series(data=model.coef_, index=feature_names)
    return coeff_df

#### 최적의 파라미터 값 탐색

In [47]:
# Candidate alpha values to evaluate, listed in increasing order.
lasso_alphas = [0.009,0.1,0.15,0.5,1,3,10,50,100]
# NOTE(review): this notebook is about Lasso regression and the result is
# stored in a lasso-named variable, yet the call selects 'ElasticNet'.
# Confirm which model is intended before relying on these numbers.
coeff_lasso_df = get_linear_reg_eval('ElasticNet', params=lasso_alphas, X_data_n=X_data, y_target_n=y_target)

#######  ElasticNet  #######
alpha 0.009일 때 4폴드 세트의 평균 RMSE : 39.743
alpha 0.1일 때 4폴드 세트의 평균 RMSE : 39.648
alpha 0.15일 때 4폴드 세트의 평균 RMSE : 39.662
alpha 0.5일 때 4폴드 세트의 평균 RMSE : 39.838
alpha 1일 때 4폴드 세트의 평균 RMSE : 40.202
alpha 3일 때 4폴드 세트의 평균 RMSE : 41.626
alpha 10일 때 4폴드 세트의 평균 RMSE : 43.365
alpha 50일 때 4폴드 세트의 평균 RMSE : 43.826
alpha 100일 때 4폴드 세트의 평균 RMSE : 44.318


#### 파라미터 값에 따른 회귀계수 확인

In [41]:
# Rank the features by their coefficient at the first alpha in the list,
# largest value first; the bare expression displays the sorted table.
first_alpha = lasso_alphas[0]
sort_column = 'alpha:' + str(first_alpha)
coeff_lasso_df.sort_values(by=sort_column, ascending=False)

Unnamed: 0,alpha:0.089,alpha:0.1,alpha:0.15,alpha:0.5,alpha:1,alpha:3,alpha:10,alpha:50,alpha:100
user,123.339813,123.102035,122.032683,115.03768,106.330787,81.620521,45.001894,12.567134,6.560852
tomorrow,4.390567,4.39157,4.396076,4.425263,4.460804,4.554725,4.649051,4.429911,4.095871
hour,1.405137,1.405099,1.40493,1.403826,1.402456,1.398605,1.393075,1.388358,1.386956
windspeed,0.329286,0.329851,0.33239,0.348906,0.369212,0.424811,0.497313,0.544702,0.567554
day,0.06429,0.06426,0.064124,0.063242,0.062163,0.059271,0.056037,0.059916,0.067541
snowcover,0.044246,0.044337,0.044749,0.047432,0.050751,0.059809,0.068149,0.000648,-0.103005
visibility,-0.003513,-0.003512,-0.003509,-0.003487,-0.00346,-0.003369,-0.003172,-0.002775,-0.002593
cloud,-0.046719,-0.046672,-0.046463,-0.045122,-0.043523,-0.039561,-0.036663,-0.042565,-0.045088
humidity,-0.069819,-0.069787,-0.06964,-0.068666,-0.06742,-0.06361,-0.056484,-0.044644,-0.039526
temp,-0.21763,-0.217629,-0.217625,-0.217564,-0.217391,-0.216106,-0.210038,-0.191072,-0.182428
