In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import sklearn

# 1.데이터 준비

### 1.1 데이터 불러오기

In [43]:
# Load the labeled Seoul bike rental CSV; the file is decoded with the cp949 codec.
df = pd.read_csv('../data/SeoulBikeData_labeling_fin.csv', encoding='cp949')
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0,Date,Year,Month,Day,Rented Bike Count,Hour,Temperature(℃),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(℃),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,2017-12-01,2017,12,1,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,2017-12-01,2017,12,1,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,2017-12-01,2017,12,1,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,2017-12-01,2017,12,1,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,2017-12-01,2017,12,1,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2018-11-30,2018,11,30,1003,19,4.2,34,2.6,1894,-10.3,0.0,0.0,0.0,Autumn,No Holiday,Yes
8756,2018-11-30,2018,11,30,764,20,3.4,37,2.3,2000,-9.9,0.0,0.0,0.0,Autumn,No Holiday,Yes
8757,2018-11-30,2018,11,30,694,21,2.6,39,0.3,1968,-9.9,0.0,0.0,0.0,Autumn,No Holiday,Yes
8758,2018-11-30,2018,11,30,712,22,2.1,41,1.0,1859,-9.8,0.0,0.0,0.0,Autumn,No Holiday,Yes


### 1.2 날짜 데이터 형식 변환하기

In [44]:
# Parse the 'Date' column (YYYY-MM-DD text) into datetime values and store it back in the same column.
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

### 1.3 범주형 데이터를 더미변수로 변경하기

In [45]:
# Categorical columns that are expanded into one indicator column per category.
categorical_columns = ['Seasons', 'Holiday', 'Functioning Day']

# Note: this rebinds `df`; the original categorical columns are gone afterwards,
# so the cell is meant to be run once per fresh load of the data.
df = pd.get_dummies(df, columns=categorical_columns)
df

Unnamed: 0,Date,Year,Month,Day,Rented Bike Count,Hour,Temperature(℃),Humidity(%),Wind speed (m/s),Visibility (10m),...,Rainfall(mm),Snowfall (cm),Seasons_Autumn,Seasons_Spring,Seasons_Summer,Seasons_Winter,Holiday_Holiday,Holiday_No Holiday,Functioning Day_No,Functioning Day_Yes
0,2017-12-01,2017,12,1,254,0,-5.2,37,2.2,2000,...,0.0,0.0,0,0,0,1,0,1,0,1
1,2017-12-01,2017,12,1,204,1,-5.5,38,0.8,2000,...,0.0,0.0,0,0,0,1,0,1,0,1
2,2017-12-01,2017,12,1,173,2,-6.0,39,1.0,2000,...,0.0,0.0,0,0,0,1,0,1,0,1
3,2017-12-01,2017,12,1,107,3,-6.2,40,0.9,2000,...,0.0,0.0,0,0,0,1,0,1,0,1
4,2017-12-01,2017,12,1,78,4,-6.0,36,2.3,2000,...,0.0,0.0,0,0,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2018-11-30,2018,11,30,1003,19,4.2,34,2.6,1894,...,0.0,0.0,1,0,0,0,0,1,0,1
8756,2018-11-30,2018,11,30,764,20,3.4,37,2.3,2000,...,0.0,0.0,1,0,0,0,0,1,0,1
8757,2018-11-30,2018,11,30,694,21,2.6,39,0.3,1968,...,0.0,0.0,1,0,0,0,0,1,0,1
8758,2018-11-30,2018,11,30,712,22,2.1,41,1.0,1859,...,0.0,0.0,1,0,0,0,0,1,0,1


# 2.데이터 분할

### 2.1 데이터 분할하기

In [46]:
# Target: the rental count column.
y = df['Rented Bike Count']
# Features: every remaining column except the target and the raw calendar fields.
X = df.drop(columns=['Date', 'Year', 'Month', 'Day', 'Rented Bike Count'])

### 2.2 MSE, RMSE, 결정계수(R²) 평가 함수 정의

In [47]:
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_score(y_test, y_pred):
    """Print the MSE, RMSE and R^2 score of predictions against true values.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted values, aligned element-wise with ``y_test``.

    Returns:
        None. The three metrics are written to stdout, one per line,
        each formatted to four decimal places.
    """
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mse)  # RMSE is derived from the MSE computed above

    # The ' ' flag in the rmse/r2 specs reserves a leading space for
    # non-negative values (a minus sign takes that slot otherwise).
    report_lines = (
        f'mse - {mse:.4f}',
        f'rmse - {rmse: .4f}',
        f'r2 - {r2: .4f}',
    )
    print('\n'.join(report_lines))

In [48]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state=42 pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [61]:
# Overall size of the training split.
print(X_train.shape)

# Shape of the training subset belonging to each season indicator.
for season_column in ('Seasons_Spring', 'Seasons_Summer', 'Seasons_Autumn', 'Seasons_Winter'):
    print(X_train.loc[X_train[season_column] == 1].shape)

print('-' * 10)

# Same check for the holiday / non-holiday indicators.
for holiday_column in ('Holiday_Holiday', 'Holiday_No Holiday'):
    print(X_train.loc[X_train[holiday_column] == 1].shape)

(7008, 17)
(1790, 17)
(1754, 17)
(1753, 17)
(1711, 17)
----------
(352, 17)
(6656, 17)


(1753, 17)

# 3.모델 학습

### 3.1 다중 선형회귀 모델

#### 3.1.1 모델 구축하기

In [38]:
from sklearn.linear_model import LinearRegression

# Unfitted linear regression estimator; no arguments are passed, so library defaults apply.
lr = LinearRegression()

#### 3.1.2 모델 학습하기

In [39]:
# Fit on the training split; the object returned by fit() is kept as `reg`,
# which the following cells use for coefficients and predictions.
reg = lr.fit(X_train, y_train)
# Bare last expression so the notebook displays the fitted estimator.
reg

In [40]:
# Pull the learned per-feature weights and the intercept off the fitted model.
coef, inter = reg.coef_, reg.intercept_

# Display both together as a (coefficients, intercept) tuple.
(coef, inter)

(array([ 2.83062326e+01,  1.74536992e+01, -1.02133086e+01,  1.84308059e+01,
         1.16730645e-02,  9.80200377e+00, -7.70664597e+01, -6.16084667e+01,
         2.92488642e+01,  1.68554611e+02,  2.44460147e+01,  3.22911819e+00,
        -1.96229744e+02, -6.28110374e+01,  6.28110374e+01, -4.74047034e+02,
         4.74047034e+02]),
 209.84658706454206)

In [41]:
# Predict rental counts for the held-out test split.
y_pred = reg.predict(X_test)

# Print MSE, RMSE and R^2 of the test-set predictions.
evaluate_score(y_test, y_pred)

mse - 194288.2051
rmse -  440.7814
r2 -  0.5337


### 3.2 Ridge 모델