In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import sklearn

# 1. 데이터 준비

### 1.1 데이터 불러오기

In [3]:
# Read the Seoul bike-sharing CSV, decoding it as cp949, and display the frame.
df = pd.read_csv(
    '../data/SeoulBikeData_labeling_fin.csv',
    encoding='cp949',
)
df

Unnamed: 0,Date,Year,Month,Day,Rented Bike Count,Hour,Temperature(℃),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(℃),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,2017-12-01,2017,12,1,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,2017-12-01,2017,12,1,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,2017-12-01,2017,12,1,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,2017-12-01,2017,12,1,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,2017-12-01,2017,12,1,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2018-11-30,2018,11,30,1003,19,4.2,34,2.6,1894,-10.3,0.0,0.0,0.0,Autumn,No Holiday,Yes
8756,2018-11-30,2018,11,30,764,20,3.4,37,2.3,2000,-9.9,0.0,0.0,0.0,Autumn,No Holiday,Yes
8757,2018-11-30,2018,11,30,694,21,2.6,39,0.3,1968,-9.9,0.0,0.0,0.0,Autumn,No Holiday,Yes
8758,2018-11-30,2018,11,30,712,22,2.1,41,1.0,1859,-9.8,0.0,0.0,0.0,Autumn,No Holiday,Yes


### 1.2 날짜 데이터 형식 변환하기

In [4]:
# Convert the 'Date' column to datetimes, parsing with the explicit
# year-month-day format rather than letting pandas infer it.
df['Date'] = pd.to_datetime(
    df['Date'],
    format='%Y-%m-%d',
)

### 1.3 범주형 데이터를 더미변수로 변경하기

In [5]:
# One-hot encode the three categorical columns.
# dtype=int is passed explicitly because pandas >= 2.0 emits boolean dummy
# columns by default. Integer 0/1 columns keep the frame purely numeric, so the
# later np.array(X) conversion yields a float matrix instead of an object array
# (which is what mixing bool and numeric columns produces).
df = pd.get_dummies(
    df,
    columns=['Seasons', 'Holiday', 'Functioning Day'],
    dtype=int,
)
df

Unnamed: 0,Date,Year,Month,Day,Rented Bike Count,Hour,Temperature(℃),Humidity(%),Wind speed (m/s),Visibility (10m),...,Rainfall(mm),Snowfall (cm),Seasons_Autumn,Seasons_Spring,Seasons_Summer,Seasons_Winter,Holiday_Holiday,Holiday_No Holiday,Functioning Day_No,Functioning Day_Yes
0,2017-12-01,2017,12,1,254,0,-5.2,37,2.2,2000,...,0.0,0.0,0,0,0,1,0,1,0,1
1,2017-12-01,2017,12,1,204,1,-5.5,38,0.8,2000,...,0.0,0.0,0,0,0,1,0,1,0,1
2,2017-12-01,2017,12,1,173,2,-6.0,39,1.0,2000,...,0.0,0.0,0,0,0,1,0,1,0,1
3,2017-12-01,2017,12,1,107,3,-6.2,40,0.9,2000,...,0.0,0.0,0,0,0,1,0,1,0,1
4,2017-12-01,2017,12,1,78,4,-6.0,36,2.3,2000,...,0.0,0.0,0,0,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2018-11-30,2018,11,30,1003,19,4.2,34,2.6,1894,...,0.0,0.0,1,0,0,0,0,1,0,1
8756,2018-11-30,2018,11,30,764,20,3.4,37,2.3,2000,...,0.0,0.0,1,0,0,0,0,1,0,1
8757,2018-11-30,2018,11,30,694,21,2.6,39,0.3,1968,...,0.0,0.0,1,0,0,0,0,1,0,1
8758,2018-11-30,2018,11,30,712,22,2.1,41,1.0,1859,...,0.0,0.0,1,0,0,0,0,1,0,1


# 2. 데이터 분할

### 2.1 데이터 분할하기

In [17]:
# Target: the rental count column.
y = df['Rented Bike Count']

# Features: every remaining column except the target itself and the calendar
# columns (Date, Year, Month, Day).
X = df.drop(columns=['Date', 'Year', 'Month', 'Day', 'Rented Bike Count'])

In [18]:
# Copy features and target into NumPy arrays so the fold index arrays produced
# by the splitter can be used for positional indexing; then display both.
X, y = np.array(X), np.array(y)
X, y

(array([[ 0. , -5.2, 37. , ...,  1. ,  0. ,  1. ],
        [ 1. , -5.5, 38. , ...,  1. ,  0. ,  1. ],
        [ 2. , -6. , 39. , ...,  1. ,  0. ,  1. ],
        ...,
        [21. ,  2.6, 39. , ...,  1. ,  0. ,  1. ],
        [22. ,  2.1, 41. , ...,  1. ,  0. ,  1. ],
        [23. ,  1.9, 43. , ...,  1. ,  0. ,  1. ]]),
 array([254, 204, 173, ..., 694, 712, 584]))

# 3. 교차검증

### 3.1 라이브러리 임포트

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

from sklearn.metrics import r2_score

from sklearn.model_selection import train_test_split

### 3.2 모델 

In [20]:
# Linear regression estimator with default settings; it is fitted on each
# training split inside the cross-validation loop.
lr = LinearRegression()

In [21]:
# 5-fold splitter. Shuffling is left at its default (off), so each test fold is
# a consecutive range of row indices, as the listing produced below shows.
# NOTE(review): the rows appear to be in date order, so every fold would be one
# contiguous time span -- confirm this is intended before comparing scores
# (the alternative is shuffle=True with a fixed random_state).
kfold = KFold(n_splits=5)
list(kfold.split(X))

[(array([1752, 1753, 1754, ..., 8757, 8758, 8759]),
  array([   0,    1,    2, ..., 1749, 1750, 1751])),
 (array([   0,    1,    2, ..., 8757, 8758, 8759]),
  array([1752, 1753, 1754, ..., 3501, 3502, 3503])),
 (array([   0,    1,    2, ..., 8757, 8758, 8759]),
  array([3504, 3505, 3506, ..., 5253, 5254, 5255])),
 (array([   0,    1,    2, ..., 8757, 8758, 8759]),
  array([5256, 5257, 5258, ..., 7005, 7006, 7007])),
 (array([   0,    1,    2, ..., 7005, 7006, 7007]),
  array([7008, 7009, 7010, ..., 8757, 8758, 8759]))]

In [22]:
# Manual k-fold evaluation: fit on each training split, predict the held-out
# split, and collect one R2 score per fold.
r2_scores = []

for train_idx, test_idx in kfold.split(X):
    # Slice out this fold's training and held-out rows.
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # `reg` is whatever lr.fit returns (scikit-learn estimators return
    # themselves), so the same `lr` object is refitted on every fold.
    reg = lr.fit(X_train, y_train)

    # Negative predictions are replaced with 0 before scoring.
    y_pred = reg.predict(X_test)
    y_pred = np.where(y_pred < 0, 0., y_pred)

    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

In [23]:
# Display the raw R2 score recorded for each fold.
r2_scores

[-1.21758702875037,
 0.17844906573736996,
 0.24893756360651287,
 -0.08154391987952891,
 0.5717370649436059]

In [24]:
# Print each fold's R2 (fold numbers start at 1, three decimals), followed by
# the mean over all folds rounded to three decimals.
for fold_no, fold_r2 in enumerate(r2_scores, start=1):
    print(fold_no, f'- R2 = {fold_r2:.3f}')

mean_r2 = np.round(np.mean(r2_scores), 3)
print(f'average R2 = {mean_r2}')

1 - R2 = -1.218
2 - R2 = 0.178
3 - R2 = 0.249
4 - R2 = -0.082
5 - R2 = 0.572
average R2 = -0.06
