# 데이터 로드

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# 데이터 불러오기
data = pd.read_csv("./data/kc_house_data.csv") 
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated,zipcode,lat,long
0,7129300520,20141013T000000,221900.0,3,1.0,1.0,0,3,7,1955,0,98178,47.5112,-122.257
1,6414100192,20141209T000000,538000.0,3,2.25,2.0,0,3,7,1951,1991,98125,47.721,-122.319
2,5631500400,20150225T000000,180000.0,2,1.0,1.0,0,3,6,1933,0,98028,47.7379,-122.233
3,2487200875,20141209T000000,604000.0,4,3.0,1.0,0,5,7,1965,0,98136,47.5208,-122.393
4,1954400510,20150218T000000,510000.0,3,2.0,1.0,0,3,8,1987,0,98074,47.6168,-122.045


In [3]:
'''
id: 집 고유아이디
date: 집이 팔린 날짜 
price: 집 가격 (타겟변수)
bedrooms: 주택 당 침실 개수
bathrooms: 주택 당 화장실 개수
floors: 전체 층 개수
waterfront: 해변이 보이는지 (0, 1)
condition: 집 청소상태 (1~5)
grade: King County grading system 으로 인한 평점 (1~13)
yr_built: 집이 지어진 년도
yr_renovated: 집이 리모델링 된 년도
zipcode: 우편번호
lat: 위도
long: 경도
'''

'\nid: 집 고유아이디\ndate: 집이 팔린 날짜 \nprice: 집 가격 (타겟변수)\nbedrooms: 주택 당 침실 개수\nbathrooms: 주택 당 화장실 개수\nfloors: 전체 층 개수\nwaterfront: 해변이 보이는지 (0, 1)\ncondition: 집 청소상태 (1~5)\ngrade: King County grading system 으로 인한 평점 (1~13)\nyr_built: 집이 지어진 년도\nyr_renovated: 집이 리모델링 된 년도\nzipcode: 우편번호\nlat: 위도\nlong: 경도\n'

In [4]:
ncar = data.shape[0]
nvar = data.shape[1]
print(ncar)
print(nvar)

21613
14


## 의미가 없다고 판단되는 변수 제거

In [5]:
data = data.drop(['id', 'date', 'zipcode', 'lat', 'long'], axis=1)

## 범주형 변수를 이진형 변수로 변환

범주형 변수는 waterfront 컬럼 뿐이며, 이진 분류이기 때문에 0, 1로 표현한다.  
데이터에서 0, 1로 표현되어 있으므로 과정 생략

## 설명변수와 타겟변수를 분리, 학습데이터와 평가데이터 분리

In [6]:
feature_columns = list(data.columns.difference(['price']))
X = data[feature_columns]
y = data['price']
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=42) # 학습데이터와 평가데이터의 비율은 7:3
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

(15129, 8) (6484, 8) (15129,) (6484,)


# Linear Regression을 활용한 Ensemble 적합

## 단일한 Linear Regression 적합 후, 평가 데이터로 검증

In [7]:
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

In [8]:
sm_train_x = sm.add_constant(train_x, has_constant="add")
sm_model = sm.OLS(train_y, sm_train_x)
fitted_sm_model = sm_model.fit()
fitted_sm_model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.595
Model:,OLS,Adj. R-squared:,0.595
Method:,Least Squares,F-statistic:,2776.0
Date:,"Fri, 25 Jun 2021",Prob (F-statistic):,0.0
Time:,14:42:20,Log-Likelihood:,-208260.0
No. Observations:,15129,AIC:,416500.0
Df Residuals:,15120,BIC:,416600.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.186e+06,1.73e+05,41.548,0.000,6.85e+06,7.52e+06
bathrooms,1.303e+05,3960.833,32.889,0.000,1.23e+05,1.38e+05
bedrooms,-2224.7910,2382.356,-0.934,0.350,-6894.497,2444.915
condition,1.641e+04,3169.013,5.178,0.000,1.02e+04,2.26e+04
floors,1946.3052,4336.838,0.449,0.654,-6554.422,1.04e+04
grade,1.956e+05,2199.540,88.924,0.000,1.91e+05,2e+05
waterfront,7.555e+05,2.26e+04,33.479,0.000,7.11e+05,8e+05
yr_built,-4300.7865,88.073,-48.832,0.000,-4473.420,-4128.153
yr_renovated,12.7325,5.043,2.525,0.012,2.847,22.618

0,1,2,3
Omnibus:,13447.374,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1684794.827
Skew:,3.763,Prob(JB):,0.0
Kurtosis:,54.147,Cond. No.,182000.0


In [9]:
sm_test_x = sm.add_constant(test_x, has_constant="add")
sm_model_predict = fitted_sm_model.predict(sm_test_x)

In [10]:
sqrt(mean_squared_error(sm_model_predict, test_y))  # RMSE

239804.2967085816

## 직접 for문 작성하여 Bagging 적용

직접 for문을 사용하면 원하는대로 custom이 가능한 장점이 있다

In [11]:
import random
bagging_predict_result = []
for _ in range(10):
    data_index = [data_index for data_index in range(train_x.shape[0])]
    random_data_index = np.random.choice(data_index, train_x.shape[0])  # 복원 추출
    print(len(set(random_data_index)))  # unique한 데이터의 개수
    sm_train_x = train_x.iloc[random_data_index]
    sm_train_y = train_y.iloc[random_data_index]
    sm_train_x = sm.add_constant(sm_train_x, has_constant="add")
    sm_model = sm.OLS(sm_train_y, sm_train_x)
    fitted_sm_model = sm_model.fit()
    pred = fitted_sm_model.predict(sm_test_x)
    bagging_predict_result.append(pred)
    print(sqrt(mean_squared_error(pred, test_y)))
    

9617
239998.6115262969
9607
239976.1770765595
9484
239961.72346060554
9525
240046.514077585
9622
239844.62782326413
9558
241500.36343433516
9625
241082.397016957
9518
239743.27807700992
9569
240970.4892744418
9662
240052.14420730728


In [12]:
pd.DataFrame(bagging_predict_result[3]).head()   

Unnamed: 0,0
735,561017.1
2830,712057.7
4106,1105009.0
16218,1458761.0
19964,694714.1


각각의 bootstrap에서 추출한 데이터를 바탕으로 모델을 적합, 해당 모델을 이용한 예측값이 담겨 있음

In [13]:
bagging_predict = []
for lst2_index in range(test_x.shape[0]):
    temp_predict = []
    for lst_index in range(len(bagging_predict_result)):
        temp_predict.append(bagging_predict_result[lst_index].values[lst2_index])
    bagging_predict.append(np.mean(temp_predict))

In [14]:
sqrt(mean_squared_error(bagging_predict, test_y))

239972.5520599593

오히려 RMSE가 크게 나타났다. 이는 Linear Regression이 과적합이 심한 모델이 아니기 때문이다.  
Tree 모델 같은 Learner를 사용하면 RMSE가 감소한다

## sklearn 사용하여 Bagging 적용

In [15]:
from sklearn.linear_model import LinearRegression
regression_model = LinearRegression()
from sklearn.ensemble import BaggingRegressor
bagging_model = BaggingRegressor(base_estimator=regression_model, n_estimators=5)
linear_model2 = bagging_model.fit(train_x, train_y)
predict2 = linear_model2.predict(test_x)

print(sqrt(mean_squared_error(predict2, test_y)))

239880.6835216398


### Sampling 횟수 증가시

In [16]:
from sklearn.ensemble import BaggingRegressor
bagging_model2 = BaggingRegressor(base_estimator=regression_model, n_estimators=5)
linear_model3 = bagging_model2.fit(train_x, train_y)
predict3 = linear_model3.predict(test_x)

print(sqrt(mean_squared_error(predict3, test_y)))

239946.19700319946


# Decision Tree를 활용한 Ensemble 적합


## 단일한 의사결정 나무 모형에 적합 후, 평가 데이터로 검증

In [17]:
from sklearn.tree import DecisionTreeRegressor
decision_tree_model = DecisionTreeRegressor()
tree_model = decision_tree_model.fit(train_x, train_y)
predict_tree = tree_model.predict(test_x)
print(sqrt(mean_squared_error(predict_tree, test_y)))

300621.35354675754


선형회귀에 비해 성능이 훨씬 안 좋아졌다

## 직접 for문 작성하여 Bagging 적용

In [18]:
bagging_predict_result = []
for _ in range(30):
    data_index = [data_index for data_index in range(train_x.shape[0])]
    random_data_index = np.random.choice(data_index, train_x.shape[0])
    print(len(set(random_data_index)))
    sm_train_x = train_x.iloc[random_data_index]
    sm_train_y = train_y.iloc[random_data_index]
    decision_tree_model = DecisionTreeRegressor()
    tree_model1 = decision_tree_model.fit(sm_train_x, sm_train_y)
    
    predict1 = tree_model1.predict(test_x)
    bagging_predict_result.append(predict1)
    print(sqrt(mean_squared_error(predict1, test_y)))

9624
281221.8757851833
9618
298112.9139665333
9593
301577.85822834505
9459
319701.8299395819
9604
293704.11888328526
9585
297863.8252031202
9539
307198.75611637556
9662
284435.6671380531
9612
301684.52238905814
9515
287771.41843129834
9568
290103.8924156566
9556
295113.6630195127
9549
292229.9332012473
9541
292048.34514683136
9639
286880.3404885375
9628
279406.1991935257
9590
277998.08958938
9499
285658.7525909235
9505
294650.52315285493
9547
292527.47882245627
9593
290087.3686675864
9602
290344.6279824194
9554
291991.81670391676
9587
282872.1715034723
9581
307109.16120225645
9593
277818.10124120506
9576
294410.0623297406
9569
276277.62636122474
9576
280638.5188360814
9510
284580.4658417395


In [19]:
bagging_predict = []
for lst2_index in range(test_x.shape[0]):
    temp_predict = []
    for lst_index in range(len(bagging_predict_result)):
        temp_predict.append(bagging_predict_result[lst_index][lst2_index])
    bagging_predict.append(np.mean(temp_predict))

In [20]:
sqrt(mean_squared_error(bagging_predict, test_y))

232882.09876938316

RMSE가 크게 줄었음, 성능이 향상됨

## sklearn 사용하여 Bagging 적용

In [21]:
bagging_model = BaggingRegressor(base_estimator=decision_tree_model, n_estimators=10)
tree_model2 = bagging_model.fit(train_x, train_y)
predict2 = tree_model2.predict(test_x)
print(sqrt(mean_squared_error(predict2, test_y)))

241383.378245587


### Sampling 횟수 증가시

In [22]:
bagging_model = BaggingRegressor(base_estimator=decision_tree_model, n_estimators=30)
tree_model2 = bagging_model.fit(train_x, train_y)
predict2 = tree_model2.predict(test_x)
print(sqrt(mean_squared_error(predict2, test_y)))

238069.02727560262


Sampling 횟수를 늘리면 평균적으로 RMSE가 작게 나온다  
but, 30번보다 많이 한다고 해서 성능이 크게 향상되지는 않는다.