### 주택 가격 예측 (House Price Regression)

In [1]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warning messages for the rest of the notebook
import warnings
warnings.filterwarnings('ignore')

# Cap how many rows / columns pandas renders when displaying a DataFrame
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 30)

### 데이터 불러오기

In [2]:
from sklearn.datasets import fetch_openml

# Download the "house_prices" dataset from OpenML; as_frame=True requests pandas objects
housing = fetch_openml(
    "house_prices",
    as_frame=True,
)

# Show which entries the returned container exposes
housing.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

- 데이터 상위 행 살펴보기

In [3]:
# Full table (features and target together) as a single DataFrame
data = housing.frame
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Hand-picked feature subset: OverallQual, GrLivArea, GarageCars
features = ['OverallQual', 'GrLivArea', 'GarageCars']
target = 'SalePrice'

# The three-column subset is stored under its own name: the following cell
# rebuilds the modelling matrix `X` from every numeric column, so assigning
# the subset to `X` here would be a dead store that is silently overwritten.
X_selected = data[features]

# Regression target used by all models below
y = data[target]

In [5]:
# Keep only numeric columns, then remove the row identifier and the target
numeric_cols = data.select_dtypes(include=['number'])
X = numeric_cols.drop(columns=['Id', 'SalePrice'])
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,...,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,150,856,856,854,0,...,1,8,0,2003.0,2,548,0,61,0,0,0,0,0,2,2008
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,284,1262,1262,0,0,...,1,6,1,1976.0,2,460,298,0,0,0,0,0,0,5,2007
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,434,920,920,866,0,...,1,6,1,2001.0,2,608,0,42,0,0,0,0,0,9,2008
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,540,756,961,756,0,...,1,7,1,1998.0,3,642,0,35,272,0,0,0,0,2,2006
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,490,1145,1145,1053,0,...,1,9,1,2000.0,3,836,192,84,0,0,0,0,0,12,2008


In [6]:
# Missing-value handling: fill each column's NaNs with that column's mean.
# NOTE(review): the means are computed on the full table, before the
# train/validation split in the next cell, so validation rows influence the
# imputed values — consider computing them on the training split only.
column_means = X.mean()
X = X.fillna(column_means)

# Per-column count of remaining missing values, largest first
X.isna().sum().sort_values(ascending=False)

MSSubClass     0
LotFrontage    0
LotArea        0
OverallQual    0
OverallCond    0
              ..
ScreenPorch    0
PoolArea       0
MiscVal        0
MoSold         0
YrSold         0
Length: 36, dtype: int64

In [7]:
# Train / validation split (hold-out validation): 20% of rows are held out,
# stratified on OverallQual, with a fixed seed for reproducibility
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=X['OverallQual'],
    random_state=42,
)

# Shapes of the training split, then of the validation split
print(X_tr.shape, y_tr.shape)
print(X_val.shape, y_val.shape)

(1168, 36) (1168,)
(292, 36) (292,)


### 피처 스케일링 (Feature Scaling)

- MinMaxScaler

    - 데이터를 0 과 1 사이 범위로 변환한다.

    - 각 값은 최소값과 최대값 사이의 상대적 위치로 스케일링된다.

    - 모든 특성값이 동일한 범위를 가지게 되므로 특정 특성이 다른 특성보다 모델에 더 큰 영향을 미치는 것을 방지한다.

    - 이상치(outlier) 에 민감하다. (공식 참조)

    - 사용 사례 :

        - KNN (K-Nearest Neighbors)

        - K-Means

        - 신경망(Neural Networks)

    - 공식 :
$$
X_{\text{scaled}} = \frac{X - X_{\text{min}}}{X_{\text{max}} - X_{\text{min}}}
$$

- StandardScaler 

    - 각 특성에서 평균을 빼고 표준편차로 나누어 표준화(standardization) 한다. 분포의 모양 자체가 정규 분포로 바뀌는 것은 아니다.

    - 평균은 0, 표준 편차는 1 이 된다.

    - MinMaxScaler 에 비해 이상치의 영향을 덜 받지만 제거되는 것은 아니다.

    - 사용 사례 :

        - 선형 회귀

        - 로지스틱 회귀

        - SVM(Support Vector Machine)

        - PCA(주성분 분석) 과 같은 차원 축소 기법

    - 공식 :
$$
X_{\text{scaled}} = \frac{X - \mu}{\sigma}
$$

$$
\mu : \text{평균}
$$
$$
\sigma : \text{표준편차}
$$

- RobustScaler

    - 중앙값과 IQR(사분위수 범위)을 사용해 이상치에 덜 민감하다.

$$
X_{\text{scaled}} = \frac{X - Q_2}{Q_3 - Q_1}
$$

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply the same
# transformation to both the training and the validation features
scaler = MinMaxScaler().fit(X_tr)
X_tr_scaled = scaler.transform(X_tr)
X_val_scaled = scaler.transform(X_val)

In [9]:
from sklearn.neighbors import KNeighborsRegressor

# 3-nearest-neighbour regressor trained on the min-max scaled features
# (fit() returns the estimator itself, so construction and fitting are chained)
knn = KNeighborsRegressor(n_neighbors=3).fit(X_tr_scaled, y_tr)

# score() reports R^2 for regressors: training split first, then validation
print(f"훈련 셋: {knn.score(X_tr_scaled, y_tr)}")
print(f"검증 셋: {knn.score(X_val_scaled, y_val)}")

훈련 셋: 0.8741587673389153
검증 셋: 0.69929404464142


In [10]:
from sklearn.svm import SVR

# RBF-kernel support vector regressor trained on the min-max scaled features
# (fit() returns the estimator itself, so construction and fitting are chained).
# NOTE(review): the recorded R^2 is negative on both splits. The features are
# scaled but the target is not, and SVR's default C / epsilon are very small
# relative to sale prices — presumably the cause; consider scaling y (e.g.
# sklearn.compose.TransformedTargetRegressor) or tuning C. TODO confirm.
svr = SVR(kernel='rbf').fit(X_tr_scaled, y_tr)

# score() reports R^2 for regressors: training split first, then validation
print(f"훈련 셋: {svr.score(X_tr_scaled, y_tr)}")
print(f"검증 셋: {svr.score(X_val_scaled, y_val)}")

훈련 셋: -0.05039100813187658
검증 셋: -0.04805251541693867
