## **1. Total imports**

In [13]:
## data
import pandas as pd
import numpy as np

## model
import sklearn.linear_model
import statsmodels.api as sm
from pygam import LinearGAM, s, f
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.decomposition import KernelPCA, PCA

## cv
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from skopt import BayesSearchCV

## evaluation
from sklearn.metrics import RocCurveDisplay, confusion_matrix
from sklearn.inspection import DecisionBoundaryDisplay

## visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

## utilities
import os
import pickle
import warnings
import json

## 한글 폰트 표시
plt.rcParams['font.family'] ='NanumGothic'
plt.rcParams['axes.unicode_minus'] = False

## 렌더링 설정
pio.templates.default = 'plotly_white'
pio.renderers.default = "vscode"

## warnings 처리
warnings.filterwarnings("ignore", category=UserWarning)

## **2. Data**

### **A. 데이터 불러오기**

In [8]:
housing_price = pd.read_csv("kc_house_data.csv")

## 전처리
df_preprocessed = housing_price.drop(["id", "date", "price"], axis = 1)\
.assign(date = pd.to_datetime(housing_price.date)).assign(price = housing_price.price)

## scores recoding
scoring_dict = dict()

### **B. 자료 분할**

In [9]:
def month_days_split(df_train : pd.DataFrame, df_test : pd.DataFrame) :
    """
    train/test set에서 `date` 열을 월별과 일별로 따로 분석하기 위한 함수
    """
    test_month = df_test.date.dt.month.astype(str)
    test_days = df_test.date.map(lambda x : x - df_preprocessed.date.min()).dt.days
    train_month = df_train.date.dt.month.astype(str)
    train_days = df_train.date.map(lambda x : x - df_preprocessed.date.min()).dt.days
    
    return [train_month, train_days], [test_month, test_days]

df_train, df_test = train_test_split(df_preprocessed, test_size = 0.3, shuffle = True, random_state = 14107)
df_train = df_train.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

train_date, test_date = month_days_split(df_train, df_test)

## **3. Visualization & EDA**

### **A. 예측변수 주택 판매 가격의 분포**

In [None]:
fig, axs = plt.subplots(1, 2, figsize = (12,5))

axs[0].hist(df_train.price, bins = 100)
axs[0].set_title("주택 가격 히스토그램")
axs[0].set_xlabel("price(100,000$)")
axs[0].set_ylabel("count")
axs[1].boxplot(df_train.price, flierprops={'marker': 'o', 'markersize': 1, 'markerfacecolor': 'fuchsia'})
axs[1].text(1.1, 0, f"min = {df_train.price.min():.0f}", fontsize=10)
axs[1].text(1.1, 375000, f"med = {df_train.price.median():.0f}", fontsize=10)
axs[1].text(1.1, 7650000, f"max = {df_train.price.max():.0f}", fontsize=10)
plt.show()

### **B. 각 설명변수들과 예측변수 간 관계 파악**

`-` 개별 설명변수(구매일자 제외)와 예측변수 간 산점도

In [None]:
## 열 설명 딕셔너리 작성
description_list = ["ID", "주택 거래일", "주택 판매가격", "침실의 수", "화장실의 수", "주택 내부 생활공간 면적",
                    "가용 토지공간 면적", "주택 층수", "물가뷰", "전망 평가지수", "주택상태 평가지수", "전반적 주택품질 평가지수",
                    "지상 내부공간 면적", "지하 내부공간 면적", "주택 완공년도", "마지막 주택 보수년도", "지역 우편번호",
                    "위도", "경도", "인접 15개 가구 내부 생활공간 면적", "인접 15개 가구 가용 토지공간 면적"]

data_dict = {v:d for v, d in zip(housing_price.columns, description_list)}
col_list = df_train.columns

## 산점도
fig, axs = plt.subplots(5, 4, figsize = (12,12))

for i in range(5) :
    for j in range(4) :
        if (i*4+j >= 18) :
            break
            
        axs[i, j].scatter(df_train.iloc[:, i*4+j], df_train.price, s = 0.5, alpha = 0.3)
        axs[i, j].set_title(data_dict[col_list[i*4+j]])

        if col_list[i*4+j] in ("long", "sqft_lot15") :
            axs[i, j].tick_params(axis='x', rotation=-30)

axs[4, 2].remove()
axs[4, 3].remove()
fig.tight_layout()
plt.show()

`-` 구매 일자 관련 시각화

In [None]:
## 월별 주택 판매가격 상자 그림
fig = go.Figure()
fig.add_trace(
    go.Box(
        x = train_date[0].astype(int), y = df_train.price,
        notched=True
    )
)

fig.update_layout(title=dict(text="월별 주택 판매 가격에 대한 상자 그림", font=dict(size=30), x = 0.5, y = 0.95), height = 800)
fig.update_xaxes(dtick=1)
fig.show()

In [None]:
## 주택 판매일과 주택 가격 간 산점도
plt.scatter(train_date[1], df_train.price, s = 0.5, alpha = 0.3)
plt.title("일별 거래량과 가격 산점도")
plt.xlabel("Days(Starting from 2014-05-02)")
plt.ylabel("Price")
plt.show()

### **C. 실제 위치 별 주택 가격 시각화**

In [None]:
df_feature = df_train.loc[:, ["lat", "long", "price"]]

with open('/root/ML2024/hw/기말 발표/graphics/King County.geojson', 'r') as f:
    king_county_boundary = json.load(f)

fig = px.density_mapbox(
    data_frame = df_feature,
    lat = 'lat',
    lon = 'long',
    radius = 9,  ## 줌 스케일과 무관하게 크기가 상대적으로 설정됨
    center = {'lat' : 47.4421, 'lon' : -121.8089},
    z = 'price',  ## 색상으로 표시할 변수
    #---#
    mapbox_style = 'carto-positron',
    zoom = 8.9,
    width = 1200,
    height = 900
)

# King County 경계선 추가
fig.add_trace(
    go.Choroplethmapbox(
        geojson=king_county_boundary,
        locations=[feature['id'] for feature in king_county_boundary['features']], # geojson ID와 매핑
        z=[1] * len(king_county_boundary['features']), # 동일 값을 지정
        colorscale=[[0, "rgba(0,0,0,0)"], [1, "red"]], # 투명도 + 빨간색 경계
        showscale=False, # 컬러바 숨기기
        marker_opacity=0.2, # 경계 영역 투명도 설정
        marker_line_width=2 # 경계선 두께 설정
    )
)

fig.show(config = {'scrollZoom' : False})

### **D. 변수 간 선형 상관계수 히트맵(절대값 스케일)**

In [None]:
## generating tidy data
tidy_data = df_train.corr().stack().reset_index().rename({"level_0" : "var1", "level_1" : "var2", 0 : "correlation"}, axis = 1)\
.assign(abs_corr = lambda _df : _df.correlation.map(lambda x : abs(x)))
tidy_data = pd.concat([tidy_data.loc[tidy_data.var1 == 'bedrooms'][::-1], tidy_data.iloc[20:, :]], axis = 0)

fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        x = tidy_data.var1, y = tidy_data.var2, z = tidy_data.abs_corr,
        text = tidy_data.correlation, texttemplate="%{text:.2f}"
    )
)

fig.update_layout(title=dict(text="20개 변수 간 상관계수", font=dict(size=30), x = 0.5, y = 0.95), height = 800)
fig.show()

In [16]:
## 상관계수의 절대값이 0.7 이상인 변수들
tidy_data.loc[(tidy_data.abs_corr > 0.7) & (tidy_data.var1 != tidy_data.var2)]\
.assign(combine_set = lambda _df : (_df.var1 + " " + _df.var2).str.split().map(lambda x : set(x)))\
.loc[lambda _df : _df.combine_set.drop_duplicates().index].drop("combine_set", axis = 1)

Unnamed: 0,var1,var2,correlation,abs_corr,combine_set
22,bathrooms,sqft_living,0.756053,0.756053,"{sqft_living, bathrooms}"
48,sqft_living,grade,0.764164,0.764164,"{sqft_living, grade}"
49,sqft_living,sqft_above,0.878998,0.878998,"{sqft_living, sqft_above}"
56,sqft_living,sqft_living15,0.753548,0.753548,"{sqft_living, sqft_living15}"
59,sqft_living,price,0.701565,0.701565,"{sqft_living, price}"
77,sqft_lot,sqft_lot15,0.710831,0.710831,"{sqft_lot15, sqft_lot}"
169,grade,sqft_above,0.755626,0.755626,"{grade, sqft_above}"
176,grade,sqft_living15,0.713762,0.713762,"{sqft_living15, grade}"
196,sqft_above,sqft_living15,0.728683,0.728683,"{sqft_living15, sqft_above}"


## **4. 다중선형회귀모형 적용**

In [None]:
#-----일별 분석-----
## 1. data : 일별 분석
X = df_train.drop(["date", "price"], axis = 1).assign(days = train_date[1])
y = df_train.price

XX = df_test.drop(["date", "price"], axis = 1).assign(days = test_date[1])
yy = df_test.price

## 2. predictor
predictr = sklearn.linear_model.LinearRegression()
predictr.fit(X, y)

## 3. prediction
yyhat = predictr.predict(XX)

## 4. evaluation
scoring_dict["Linear Regression with days"] = np.mean((yy - yyhat)**2)**0.5
print(f"RMSE = {np.mean((yy - yyhat)**2)**0.5:.4f}")

#-----월별 분석-----
## 1. data
X = pd.get_dummies(df_train.drop(["date", "price"], axis = 1).assign(month = train_date[0]), drop_first = True, dtype = int)
y = df_train.price

XX = pd.get_dummies(df_test.drop(["date", "price"], axis = 1).assign(month = test_date[0]), drop_first = True, dtype = int)
yy = df_test.price


## 2. predictor
predictr = sklearn.linear_model.LinearRegression()
predictr.fit(X, y)

## 3. prediction
yyhat = predictr.predict(XX)

## 4. evaluation
scoring_dict["Linear Regression with month"] = np.mean((yy - yyhat)**2)**0.5
print(f"RMSE = {np.mean((yy - yyhat)**2)**0.5:.4f}")

## ANOVA table
model = sm.OLS(y, pd.concat([pd.DataFrame({'intercept' : np.ones(X.shape[0])}), X], axis = 1))
results = model.fit()

print(results.summary().tables[1])

## **5. 벌점 함수 모형 적용**

### **A. Ridge**

`-` 변수 스케일링 / 하이퍼 파라미터 튜닝

In [25]:
##------범주형 반응변수는 전부 스케일링 하지 않음------

## fitting scaler with train set
numeric_features = list(set(df_train.columns) - set(["price", "date", "waterfront"])) ## date : 선형이므로 월별로 설정(더미변수 처리)
scalr = StandardScaler()
y_scalr = PowerTransformer()
standarized_features = scalr.fit_transform(df_train.loc[:, numeric_features])

## train data
X_trans = np.concatenate([standarized_features, np.asarray(pd.get_dummies(train_date[0], drop_first = True, dtype = int).assign(waterfront = df_train.waterfront))], axis = 1) # features
y_trans = np.asarray(y_scalr.fit_transform(pd.DataFrame(df_train.price))).reshape(-1) ## scaled target

## test data
XX_trans = np.concatenate([scalr.transform(df_test.loc[:, numeric_features]), np.asarray(pd.get_dummies(test_date[0], drop_first = True, dtype = int).assign(waterfront = df_test.waterfront))], axis = 1)
yy_trans = np.asarray(y_scalr.transform(pd.DataFrame(df_test.price))).reshape(-1)


## fitting and cross validation(10-fold)
lambdas = 10**np.linspace(1, -5, 100) ## setting grid for plotting
kfold = KFold(10, random_state = 14107, shuffle = True)
ridgeCV = sklearn.linear_model.ElasticNetCV(alphas = lambdas, l1_ratio = 0, cv = kfold)
ridgeCV.fit(X_trans, y)

## fitting and cross validation(10-fold)
lambdas = 10**np.linspace(1, -5, 100) ## setting grid for plotting
kfold = KFold(10, random_state = 14107, shuffle = True)
ridgeCV_trans = sklearn.linear_model.ElasticNetCV(alphas = lambdas, l1_ratio = 0, cv = kfold)
ridgeCV_trans.fit(X_trans, y_trans)

##----------prediction----------

## CV optimized predictor : raw price
predictr_optim = sklearn.linear_model.ElasticNet(alpha = ridgeCV.alpha_, l1_ratio = 0)
predictr_optim.fit(X_trans, y)
scoring_dict["Ridge Regression"] = np.mean((yy - predictr_optim.predict(XX_trans))**2)**0.5
print(f"raw price RMSE = {np.mean((yy - predictr_optim.predict(XX_trans))**2)**0.5:.4f}")

## CV optimized predictor : transformed price
predictr_optim_trans = sklearn.linear_model.ElasticNet(alpha = ridgeCV_trans.alpha_, l1_ratio = 0)
predictr_optim_trans.fit(X_trans, y_trans)
scoring_dict["Ridge Regression with transformed price"] = np.mean((yy - y_scalr.inverse_transform(predictr_optim.predict(XX_trans).reshape(-1,1)).reshape(-1))**2)**0.5
print(f"transformed price RMSE = {np.mean((yy - y_scalr.inverse_transform(predictr_optim.predict(XX_trans).reshape(-1,1)).reshape(-1))**2)**0.5:.4f}")

## 1-se-rule predictor : raw price
mse_list = ridgeCV.mse_path_.mean(1)
indx = np.where(mse_list == np.min(mse_list))
min_mse = mse_list[indx]
min_std = ridgeCV.mse_path_.std(1)[indx]
simple_indx = np.min(np.where(mse_list <= min_mse + min_std / np.sqrt(10)))
predictr_1se = sklearn.linear_model.ElasticNet(alpha = ridgeCV.alphas_[simple_indx], l1_ratio = 0)
predictr_1se.fit(X_trans, y)
yyhat = predictr_1se.predict(XX_trans)
scoring_dict["Ridge Regression with 1-se rule"] = np.mean((yy - yyhat)**2)**0.5
print(f"raw price 1se rule RMSE = {np.mean((yy - yyhat)**2)**0.5:.4f}")

## 1-se-rule predictor : transformed price
mse_list_trans = ridgeCV_trans.mse_path_.mean(1)
indx_trans = np.where(mse_list_trans == np.min(mse_list_trans))
min_mse_trans = mse_list_trans[indx_trans]
min_std_trans = ridgeCV_trans.mse_path_.std(1)[indx]
simple_indx_trans = np.min(np.where(mse_list_trans <= min_mse_trans + min_std_trans / np.sqrt(10)))
predictr_1se = sklearn.linear_model.ElasticNet(alpha = ridgeCV_trans.alphas_[simple_indx_trans], l1_ratio = 0)
predictr_1se.fit(X_trans, y_trans)
yyhat = y_scalr.inverse_transform(predictr_1se.predict(XX_trans).reshape(-1,1)).reshape(-1)
scoring_dict["Ridge Regression with transformed price & 1-se rule"] = np.mean((yy - yyhat)**2)**0.5
print(f"transformed price 1se rule RMSE = {np.mean((yy - yyhat)**2)**0.5:.4f}")

raw price RMSE = 196381.8609
transformed price RMSE = 221208.7680
raw price 1se rule RMSE = 200062.3972
transformed price 1se rule RMSE = 240907.1497


`-` 시각화 및 $\lambda$값 식별

In [None]:
## plotting
ridgeCV_fig, axs = plt.subplots(1, 2, figsize=(16,8))
axs[0].errorbar(-np.log(ridgeCV.alphas_), ridgeCV.mse_path_.mean(1),
            yerr=ridgeCV.mse_path_.std(1) / np.sqrt(10))
axs[0].axvline(-np.log(ridgeCV.alpha_), c='k', ls='--')
axs[0].axhline(min_mse + min_std / np.sqrt(10), c = 'k', ls = '--')
axs[0].set_xlabel('$-\log(\lambda)$', fontsize=20)
axs[0].set_ylabel('Cross-validated MSE', fontsize=20)
axs[0].scatter(-np.log(ridgeCV.alphas_[simple_indx]), mse_list[simple_indx],
           color = "red", s = 15, label = "1-se rule selection", zorder = 5)
axs[0].set_title("반응변수 변환이 없는 경우 CV MSE와 그 표준오차")
axs[0].legend()

axs[1].errorbar(-np.log(ridgeCV_trans.alphas_), ridgeCV_trans.mse_path_.mean(1),
            yerr=ridgeCV_trans.mse_path_.std(1) / np.sqrt(10))
axs[1].axvline(-np.log(ridgeCV_trans.alpha_), c='k', ls='--')
axs[1].axhline(min_mse_trans + min_std_trans / np.sqrt(10), c = 'k', ls = '--')
axs[1].set_xlabel('$-\log(\lambda)$', fontsize=20)
axs[1].set_ylabel('Cross-validated MSE', fontsize=20)
axs[1].scatter(-np.log(ridgeCV_trans.alphas_[simple_indx_trans]), mse_list_trans[simple_indx_trans],
           color = "red", s = 15, label = "1-se rule selection", zorder = 5)
axs[1].set_title("반응응변수를 파워변환 한 경우 CV MSE와 그 표준오차")
axs[1].legend()

plt.show()

## showing parameter
print(f'선택된 lambda(raw) = {ridgeCV.alpha_ : .5f}, {ridgeCV.alphas_[simple_indx] : .5f}')
print(f"선택된 lambda(powertransform) = {ridgeCV_trans.alpha_ : .5f}, {ridgeCV_trans.alphas_[simple_indx_trans] : .5f}")

`-` 범주형 반응변수까지 전부 스케일링하는 경우(성능 진짜 조금 감소)

In [17]:
##------범주형 반응변수까지 전부 스케일링함------

##----------data----------

## fitting scaler with train set
numeric_features = list(set(df_train.columns) - set(["price", "date"])) ## date : 선형이므로 월별로 설정(더미변수 처리)
scalr = StandardScaler()

## train data
X_trans = scalr.fit_transform(pd.concat([df_train.loc[:, numeric_features], pd.get_dummies(train_date[0], drop_first = True, dtype = int)], axis = 1))

## test data
XX_trans = scalr.transform(pd.concat([df_test.loc[:, numeric_features], pd.get_dummies(test_date[0], drop_first = True, dtype = int)], axis = 1))

##----------tuning----------

## fitting and cross validation(10-fold)
lambdas = 10**np.linspace(2, -4, 100) ## setting grid for plotting
kfold = KFold(10, random_state = 14107, shuffle = True)
ridgeCV = sklearn.linear_model.ElasticNetCV(alphas = lambdas, l1_ratio = 0, cv = kfold)
ridgeCV.fit(X_trans, y)

## fitting and cross validation(10-fold)
lambdas = 10**np.linspace(3, -5, 100) ## setting grid for plotting
kfold = KFold(10, random_state = 14107, shuffle = True)
ridgeCV_trans = sklearn.linear_model.ElasticNetCV(alphas = lambdas, l1_ratio = 0, cv = kfold)
ridgeCV_trans.fit(X_trans, y_trans)

##----------prediction----------

## CV optimized predictor : raw price
predictr_optim = sklearn.linear_model.ElasticNet(alpha = ridgeCV.alpha_, l1_ratio = 0)
predictr_optim.fit(X_trans, y)
print(f"raw price RMSE = {np.mean((yy - predictr_optim.predict(XX_trans))**2)**0.5:.4f}")

## CV optimized predictor : transformed price
predictr_optim_trans = sklearn.linear_model.ElasticNet(alpha = ridgeCV_trans.alpha_, l1_ratio = 0)
predictr_optim_trans.fit(X_trans, y_trans)
print(f"transformed price RMSE = {np.mean((yy - y_scalr.inverse_transform(predictr_optim.predict(XX_trans).reshape(-1,1)).reshape(-1))**2)**0.5:.4f}")

## 1-se-rule predictor : raw price
mse_list = ridgeCV.mse_path_.mean(1)
indx = np.where(mse_list == np.min(mse_list))
min_mse = mse_list[indx]
min_std = ridgeCV.mse_path_.std(1)[indx]
simple_indx = np.min(np.where(mse_list <= min_mse + min_std / np.sqrt(10)))
predictr_1se = sklearn.linear_model.ElasticNet(alpha = ridgeCV.alphas_[simple_indx], l1_ratio = 0)
predictr_1se.fit(X_trans, y)
yyhat = predictr_1se.predict(XX_trans)
scoring_dict["Ridge Regression with 1-se rule"] = np.mean((yy - yyhat)**2)**0.5
print(f"raw price 1se rule RMSE = {np.mean((yy - yyhat)**2)**0.5:.4f}")

## 1-se-rule predictor : transformed price
mse_list_trans = ridgeCV_trans.mse_path_.mean(1)
indx_trans = np.where(mse_list_trans == np.min(mse_list_trans))
min_mse_trans = mse_list_trans[indx_trans]
min_std_trans = ridgeCV_trans.mse_path_.std(1)[indx]
simple_indx_trans = np.min(np.where(mse_list_trans <= min_mse_trans + min_std_trans / np.sqrt(10)))
predictr_1se = sklearn.linear_model.ElasticNet(alpha = ridgeCV_trans.alphas_[simple_indx_trans], l1_ratio = 0)
predictr_1se.fit(X_trans, y_trans)
yyhat = y_scalr.inverse_transform(predictr_1se.predict(XX_trans).reshape(-1,1)).reshape(-1)
scoring_dict["Ridge Regression with transformed price & 1-se rule"] = np.mean((yy - yyhat)**2)**0.5
print(f"transformed price 1se rule RMSE = {np.mean((yy - yyhat)**2)**0.5:.4f}")

raw price RMSE = 196385.8038
transformed price RMSE = 218913.7806
raw price 1se rule RMSE = 202735.1071
transformed price 1se rule RMSE = 283355.9570


### **B. Lasso**

`-` 변수 스케일링/하이퍼 파라미터 튜닝

In [None]:
## fitting scaler with train set
numeric_features = list(set(df_train.columns) - set(["price", "date", "waterfront"])) ## date : 선형이므로 월별로 설정(더미변수 처리)
scalr = StandardScaler()
y_scalr = PowerTransformer()
standarized_features = scalr.fit_transform(df_train.loc[:, numeric_features])

## train data
X_trans = np.concatenate([standarized_features, np.asarray(pd.get_dummies(train_date[0], drop_first = True, dtype = int).assign(waterfront = df_train.waterfront))], axis = 1) # features
y_trans = np.asarray(y_scalr.fit_transform(pd.DataFrame(df_train.price))).reshape(-1) ## scaled target

## test data
XX_trans = np.concatenate([scalr.transform(df_test.loc[:, numeric_features]), np.asarray(pd.get_dummies(test_date[0], drop_first = True, dtype = int).assign(waterfront = df_test.waterfront))], axis = 1)
yy_trans = np.asarray(y_scalr.transform(pd.DataFrame(df_test.price))).reshape(-1)


## fitting and cross validation(10-fold)
lambdas = 10**np.linspace(1, -5, 100) ## setting grid for plotting
kfold = KFold(10, random_state = 14107, shuffle = True)
LassoCV = sklearn.linear_model.ElasticNetCV(alphas = lambdas, l1_ratio = 0, cv = kfold)
LassoCV.fit(X_trans, y)

## fitting and cross validation(10-fold)
lambdas = 10**np.linspace(0, -5, 100) ## setting grid for plotting
kfold = KFold(10, random_state = 14107, shuffle = True)
LassoCV_trans = sklearn.linear_model.ElasticNetCV(alphas = lambdas, l1_ratio = 0, cv = kfold)
LassoCV_trans.fit(X_trans, y_trans)

##----------prediction----------

## CV optimized predictor : raw price
predictr_optim = sklearn.linear_model.ElasticNet(alpha = LassoCV.alpha_, l1_ratio = 0)
predictr_optim.fit(X_trans, y)
scoring_dict["Lasso Regression"] = np.mean((yy - predictr_optim.predict(XX_trans))**2)**0.5
print(f"raw price RMSE = {np.mean((yy - predictr_optim.predict(XX_trans))**2)**0.5:.4f}")

## CV optimized predictor : transformed price
predictr_optim_trans = sklearn.linear_model.ElasticNet(alpha = LassoCV_trans.alpha_, l1_ratio = 0)
predictr_optim_trans.fit(X_trans, y_trans)
scoring_dict["Lasso Regression with transformed price"] = np.mean((yy - y_scalr.inverse_transform(predictr_optim.predict(XX_trans).reshape(-1,1)).reshape(-1))**2)**0.5
print(f"transformed price RMSE = {np.mean((yy - y_scalr.inverse_transform(predictr_optim.predict(XX_trans).reshape(-1,1)).reshape(-1))**2)**0.5:.4f}")

## 1-se-rule predictor : raw price
mse_list = LassoCV.mse_path_.mean(1)
indx = np.where(mse_list == np.min(mse_list))
min_mse = mse_list[indx]
min_std = LassoCV.mse_path_.std(1)[indx]
simple_indx = np.min(np.where(mse_list <= min_mse + min_std / np.sqrt(10)))
predictr_1se = sklearn.linear_model.ElasticNet(alpha = LassoCV.alphas_[simple_indx], l1_ratio = 0)
predictr_1se.fit(X_trans, y)
yyhat = predictr_1se.predict(XX_trans)
scoring_dict["Lasso Regression with 1-se rule"] = np.mean((yy - yyhat)**2)**0.5
print(f"raw price 1se rule RMSE = {np.mean((yy - yyhat)**2)**0.5:.4f}")

## 1-se-rule predictor : transformed price
mse_list_trans = LassoCV_trans.mse_path_.mean(1)
indx_trans = np.where(mse_list_trans == np.min(mse_list_trans))
min_mse_trans = mse_list_trans[indx_trans]
min_std_trans = LassoCV_trans.mse_path_.std(1)[indx]
simple_indx_trans = np.min(np.where(mse_list_trans <= min_mse_trans + min_std_trans / np.sqrt(10)))
predictr_1se = sklearn.linear_model.ElasticNet(alpha = LassoCV_trans.alphas_[simple_indx_trans], l1_ratio = 0)
predictr_1se.fit(X_trans, y_trans)
yyhat = y_scalr.inverse_transform(predictr_1se.predict(XX_trans).reshape(-1,1)).reshape(-1)
scoring_dict["Lasso Regression with transformed price & 1-se rule"] = np.mean((yy - yyhat)**2)**0.5
print(f"transformed price 1se rule RMSE = {np.mean((yy - yyhat)**2)**0.5:.4f}")

raw price RMSE = 196381.7875
transformed price RMSE = 221208.6226
raw price 1se rule RMSE = 200062.3972
transformed price 1se rule RMSE = 241576.0184


`-` 시각화

In [None]:
## plotting
LassoCV_fig, axs = plt.subplots(1, 2, figsize=(16,8))
axs[0].errorbar(-np.log(LassoCV.alphas_), LassoCV.mse_path_.mean(1),
            yerr=LassoCV.mse_path_.std(1) / np.sqrt(10))
axs[0].axvline(-np.log(LassoCV.alpha_), c='k', ls='--')
axs[0].axhline(min_mse + min_std / np.sqrt(10), c = 'k', ls = '--')
axs[0].set_xlabel('$-\log(\lambda)$', fontsize=20)
axs[0].set_ylabel('Cross-validated MSE', fontsize=20)
axs[0].scatter(-np.log(LassoCV.alphas_[simple_indx]), mse_list[simple_indx],
           color = "red", s = 15, label = "1-se rule selection", zorder = 5)
axs[0].set_title("반응변수 변환이 없는 경우 CV MSE와 그 표준오차")
axs[0].legend()

axs[1].errorbar(-np.log(LassoCV_trans.alphas_), LassoCV_trans.mse_path_.mean(1),
            yerr=LassoCV_trans.mse_path_.std(1) / np.sqrt(10))
axs[1].axvline(-np.log(LassoCV_trans.alpha_), c='k', ls='--')
axs[1].axhline(min_mse_trans + min_std_trans / np.sqrt(10), c = 'k', ls = '--')
axs[1].set_xlabel('$-\log(\lambda)$', fontsize=20)
axs[1].set_ylabel('Cross-validated MSE', fontsize=20)
axs[1].scatter(-np.log(LassoCV_trans.alphas_[simple_indx_trans]), mse_list_trans[simple_indx_trans],
           color = "red", s = 15, label = "1-se rule selection", zorder = 5)
axs[1].set_title("반응변수를 파워변환 한 경우 CV MSE와 그 표준오차")
axs[1].legend()

plt.show()

## showing parameter
print(f'선택된 lambda(raw) = {LassoCV.alpha_ : .5f}, {LassoCV.alphas_[simple_indx] : .5f}')
print(f"선택된 lambda(powertransform) = {LassoCV_trans.alpha_ : .5f}, {LassoCV_trans.alphas_[simple_indx_trans] : .5f}")

### **C. 선형 모형 간 성능 비교**

In [None]:
linear_dict = {k:float(v) for k, v in scoring_dict.items() if (("Linear" in k) or ("Ridge" in k) or ("Lasso" in k)) and ("Reduction" not in k)}
df_score = pd.DataFrame({"method" : linear_dict.keys(), "score" : linear_dict.values()}).iloc[[1,0,2,4,3,5,6,8,7,9]]

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x = df_score.score,
        y = df_score.method,
        text = df_score.score,
        orientation = "h",
        texttemplate = "%{text:.4f}",
        marker_line=dict(width=5, color='black')
    )
)

fig.update_layout(height = 800, title = dict(text="선형 모형 적합결과의 비교(RMSE)", font=dict(size=30), x = 0.5, y = 0.95))
fig["data"][0]["marker"]["color"] = ["skyblue"]*2 + ["green"]*4 + ["blue"]*4
fig["data"][0]["marker"]["line"]["color"] = ["red", "skyblue"] + ["red"] + ["green"]*3 + ["red"] + ["blue"]*3

fig.show()

## **6. 비선형 모형 적용**

`-` 시각화 및 $\lambda$ 탐색

In [None]:
## data
X = df_train.drop(["date", "price"], axis = 1).assign(days = train_date[1]) ## 일별 자료
y = df_train.price

XX = df_test.drop(["date", "price"], axis = 1).assign(days = test_date[1])
yy = df_test.price

## lambda grid
lams = 10**np.linspace(-1, 4, 50)
kfold = KFold(5, random_state = 14107, shuffle = True)

## 초기화
mean_scores = []
std_scores = []

## CV
for i, lam in enumerate(lams):
    scores = []
    
    for train_idx, valid_idx in kfold.split(X):
        X_train, X_valid = X.to_numpy()[train_idx], X.to_numpy()[valid_idx]
        y_train, y_valid = y.to_numpy()[train_idx], y.to_numpy()[valid_idx]

        gam = LinearGAM(lam=lam).fit(X_train, y_train)
        yyhat = gam.predict(X_valid)
        rmse = np.mean((y_valid - yyhat)**2)**0.5
        scores.append(rmse)

    mean_scores.append(np.mean(scores))
    std_scores.append(np.std(scores))

    print(f"{i} cycle rooped")

In [None]:
## optimized value
indx = np.where(mean_scores == np.min(mean_scores))[0][0]
min_mse = mean_scores[indx]
min_std = std_scores[indx]
simple_indx = np.max(np.where(mean_scores <= min_mse + min_std / np.sqrt(5)))

## plotting
fig, ax = plt.subplots(figsize=(8,8))
ax.errorbar(-np.log(lams), mean_scores, yerr = std_scores/np.sqrt(5))
ax.axvline(-np.log(lams[indx]), c='k', ls='--')
ax.axhline(min_mse + min_std / np.sqrt(5), c = 'k', ls = '--')
ax.set_xlabel('$-\log(\lambda)$', fontsize=20)
ax.set_ylabel('Cross-validated MSE', fontsize=20)
ax.scatter(-np.log(lams[simple_indx]), mean_scores[simple_indx],
           color = "red", s = 15, label = "1-se rule selection", zorder = 5)
ax.set_title(f"GAM에서 $\lambda$ 값을 변환시켜갈 때의 CV MSE")

plt.legend()
plt.show()

## optimal lambda
optim_lambda = lams[indx]
optim_1se_lambda = lams[simple_indx]
print(f"Optimal : {optim_lambda:.4f}")
print(f"1-se rule : {optim_1se_lambda:.4f}")

In [67]:
## test RMSE 산출
gam = LinearGAM(lam = optim_lambda)
gam.fit(X, y)
scoring_dict["GAM"] = np.mean((yy - gam.predict(XX))**2)**0.5
print(f"RMSE = {np.mean((yy - gam.predict(XX))**2)**0.5:.4f}")

gam = LinearGAM(lam = optim_1se_lambda)
gam.fit(X, y)
scoring_dict["GAM with 1-se rule"] = np.mean((yy - gam.predict(XX))**2)**0.5
print(f"RMSE = {np.mean((yy - gam.predict(XX))**2)**0.5:.4f}")

RMSE = 156864.5731
RMSE = 165829.6970


`-` 범주형 설명변수 `waterfront`를 따로 처리하는 경우(성능이 더 떨어졌음.)

In [None]:
## lambda grid
lams = 10**np.linspace(-1, 4, 50)
kfold = KFold(5, random_state = 14107, shuffle = True)

## 초기화
mean_scores = []
std_scores = []

## CV
for i, lam in enumerate(lams):
    scores = []
    
    for train_idx, valid_idx in kfold.split(X):
        X_train, X_valid = X_trans[train_idx], X_trans[valid_idx]
        y_train, y_valid = y.to_numpy()[train_idx], y.to_numpy()[valid_idx]

        ## 6번째 변수, waterfront를 범주형으로 처리
        gam = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + f(5) + s(6) + s(7) + s(8) + s(9) + s(10) + s(11) + s(12) + s(13) + s(14) + s(15) + s(16) + s(17) + s(18), lam = lam)
        gam.fit(X_train, y_train)
        yyhat = gam.predict(X_valid)
        rmse = np.mean((y_valid - yyhat)**2)**0.5
        scores.append(rmse)

    mean_scores.append(np.mean(scores))
    std_scores.append(np.std(scores))

    print(f"{i} cycle rooped")

In [None]:
## optimized value
indx = np.where(mean_scores == np.min(mean_scores))[0][0]
min_mse = mean_scores[indx]
min_std = std_scores[indx]
simple_indx = np.max(np.where(mean_scores <= min_mse + min_std / np.sqrt(5)))

## plotting
fig, ax = plt.subplots(figsize=(8,8))
ax.errorbar(-np.log(lams), mean_scores, yerr = std_scores/np.sqrt(5))
ax.axvline(-np.log(lams[indx]), c='k', ls='--')
ax.axhline(min_mse + min_std / np.sqrt(5), c = 'k', ls = '--')
ax.set_xlabel('$-\log(\lambda)$', fontsize=20)
ax.set_ylabel('Cross-validated MSE', fontsize=20)
ax.scatter(-np.log(lams[simple_indx]), mean_scores[simple_indx],
           color = "red", s = 15, label = "1-se rule selection", zorder = 5)
ax.set_title(f"GAM에서 $\lambda$ 값을 변환시켜갈 때의 CV MSE")

plt.legend()
plt.show()

## optimal lambda
optim_lambda = lams[indx]
optim_1se_lambda = lams[simple_indx]
print(f"Optimal : {optim_lambda:.4f}")
print(f"1-se rule : {optim_1se_lambda:.4f}")

In [58]:
## test RMSE 산출
gam = LinearGAM(lam = optim_lambda)
gam.fit(X, y)
print(f"RMSE = {np.mean((yy - gam.predict(XX))**2)**0.5:.4f}")

gam = LinearGAM(lam = optim_1se_lambda)
gam.fit(X, y)
print(f"RMSE = {np.mean((yy - gam.predict(XX))**2)**0.5:.4f}")

RMSE = 159278.7315
RMSE = 163501.3002


## **7. 나무 모형**

`-` 훈련 데이터에서 CV로 최적의 prunning을 선택

In [None]:
## data
X = df_train.drop(["date", "price"], axis = 1).assign(days = train_date[1]) ## 일별 자료
y = df_train.price

XX = df_test.drop(["date", "price"], axis = 1).assign(days = test_date[1])
yy = df_test.price

## predictor
predictr = DecisionTreeRegressor() ## criterion = squared_error
ccp_path = predictr.cost_complexity_pruning_path(X, y)
kfold = KFold(5, random_state = 14107, shuffle = True)

validatr = GridSearchCV(predictr, {'ccp_alpha' : ccp_path.ccp_alphas},
                        refit = True, cv = kfold, scoring = "neg_root_mean_squared_error", verbose = 2)
validatr.fit(X, y)

In [20]:
best_predictr = validatr.best_estimator_
scoring_dict["Tree CV"] = np.mean((yy - best_predictr.predict(XX))**2)**0.5
print(f"RMSE = {np.mean((yy - best_predictr.predict(XX))**2)**0.5}")
print(f"number of leaves = {best_predictr.get_n_leaves()}")
print(f"best param : {validatr.best_params_}")

RMSE = 172828.85210206837
number of leaves = 436
best param : {'ccp_alpha': 11967239.867225198}


`-` 최적모형과 제일 복잡한 모형과의 성능지표 비교

In [None]:
full_predictr = DecisionTreeRegressor()
full_predictr.fit(X, y)
rmse = np.mean((yy - full_predictr.predict(XX))**2)**0.5

df_score = pd.DataFrame({"Method" : ["Tree CV", "Full Tree", "GAM"], "score" : [scoring_dict["Tree CV"], rmse, scoring_dict["GAM"]]})

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x = df_score.score,
        y = df_score.Method,
        text = df_score.score,
        texttemplate = "%{text:.4f}",
        orientation = "h",
        marker_line=dict(width=5, color='black')
    )
)

fig.update_yaxes(categoryorder = "total descending")
fig["data"][0]["marker"]["color"] = ["skyblue", "skyblue", "skyblue"]
fig["data"][0]["marker"]["line"]["color"] = ["red", "skyblue", "skyblue"]

fig.show()

`-` 적합된 트리 모형 시각화(max_depth = 3)

In [None]:
## data
X = df_train.drop(["date", "price"], axis = 1).assign(days = train_date[1]) ## 일별 자료
y = df_train.price

XX = df_test.drop(["date", "price"], axis = 1).assign(days = test_date[1])
yy = df_test.price

predictr_for_viz = DecisionTreeRegressor(max_depth = 3)
predictr_for_viz.fit(X, y)
viz = dtreeviz.model(predictr_for_viz, X, y, target_name = "price", feature_names = X.columns)
viz.view(fontname = "NanumGothic")

## **8. 부스팅**

In [None]:
## data
X = df_train.drop(["date", "price"], axis = 1).assign(days = train_date[1]) ## 일별 자료
y = df_train.price

XX = df_test.drop(["date", "price"], axis = 1).assign(days = test_date[1])
yy = df_test.price

## params
fit_params = {
    'early_stopping_rounds': 10, ## 성능 개선이 없을 시 종료 라운드
    'verbose': False
}

search_space = {
    'learning_rate': (0.001, 0.1), ## 학습률
    'n_estimators': (100, 1000), ## 트리 수
    'gamma': (1e-1, 10000, "log-uniform"), ## 노드 분할 시 최소 손실 감소량
    'max_depth': (1, 10), ## 트리 깊이
    'min_child_weight': (1e-2, 100, "log-uniform"), ## 헤시안의 최소값
    'colsample_bytree': (0.2, 0.7), ## 각 트리 구성 시 사용하는 열의 비율
    'subsample': (0.5, 1.0), ## 데이터 서브 샘플링
    'sampling_method': ["uniform", "gradient_based"], ## 서브 샘플링 방법
    'lambda': (1e-1, 10000, "log-uniform"), ## 가중치 L2 표준화 계수
    'alpha': (1e-2, 1000, "log-uniform") ## 가중치 L1 표준화 계수
}

kfold = KFold(4, random_state = 14107, shuffle = True)

## optimizr
predictr = xgb.XGBRegressor(tree_method = "gpu_hist", device = "cuda")
optimizr = BayesSearchCV(
    estimator=predictr,
    search_spaces=search_space,
    fit_params=fit_params,
    cv=kfold,
    scoring="neg_mean_squared_error",
    random_state=14107,
    verbose=1
)

optimizr.fit(X, y)

In [None]:
## best parameter set 확인
# print(optimizr.best_estimator_.get_params())

## rmse 계산
best_predictr = xgb.XGBRegressor(**optimizr.best_estimator_.get_params())
best_predictr.fit(X, y)
scoring_dict["XGBoost CV"] = np.mean((yy - best_predictr.predict(XX))**2)**0.5
print(f"RMSE = {np.mean((yy - best_predictr.predict(XX))**2)**0.5}")

RMSE = 107541.7772014463


`-` Features importance 산출 및 시각화

In [None]:
importances = pd.DataFrame({"features" : best_predictr.feature_names_in_, "importances" : best_predictr.feature_importances_})

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x = importances.importances,
        y = importances.features,
        text = importances.importances,
        texttemplate = "%{text:.4f}",
        orientation = "h"
    )
)

fig.update_layout(width = 1200, height = 900, title=dict(text="최적 모형의 Feature Importances", font=dict(size=30), x = 0.5, y = 0.95))
fig.update_yaxes(categoryorder = "total ascending")

fig.show()

## **9. SVM**

`-` 첫 번째 그리드 서치

In [None]:
## 반응변수 구간 분할
X = df_train[["long", "lat"]]
y = df_train.price.map(lambda x : x >= df_train.price.median()).astype(int)

XX = df_test[["long", "lat"]]
yy = df_test.price.map(lambda x : x >= df_train.price.median()).astype(int)

## hyperparameter tuning
svm_rbf = SVC()
kfold = KFold(5, random_state=0, shuffle=True)

grid = GridSearchCV(svm_rbf, {'C':[0.1,1,10,100,1000], 'gamma':[0.5,1,2,3,4]}, refit=True, cv=kfold, scoring='accuracy')
grid.fit(X, y)
print(grid.best_params_)

## evaluation
best_svm = grid.best_estimator_

{'C': 1000, 'gamma': 4}

`-` 두 번째 그리드 서치

In [None]:
## hyperparameter tuning
svm_rbf = SVC()
kfold = KFold(5, random_state=0, shuffle=True)

grid = GridSearchCV(svm_rbf, {'C': [1000, 5000, 10000], 'gamma': 10**np.linspace(0, 3, 5)}, refit=True, cv=kfold, scoring='accuracy')
grid.fit(X, y)
print(grid.best_params_)

## evaluation
best_svm = grid.best_estimator_
yyhat = best_svm.predict(XX)
confusion_matrix(yyhat, yy)

{'C': 10000, 'gamma': 1000.0}

`-` 시각화

In [None]:
## 시각화 1 : c = 1000
gammas = [1, 10, 100, 1000]
models = [SVC(kernel='rbf', gamma = i, C = 1000) for i in gammas]
models = [clf.fit(X, y) for clf in models]
titles = [f"gamma = {i}" for i in gammas]

fig, axs = plt.subplots(1, 4, figsize = (16, 3.5))

for clf, title, ax in zip(models, titles, axs.flatten()) :
        disp = DecisionBoundaryDisplay.from_estimator(
                clf,
                X,
                response_method="predict",
                cmap=plt.cm.coolwarm,
                alpha=0.8,
                ax=ax
        )

        ax.scatter(X.long, X.lat, c = y, cmap = plt.cm.coolwarm, s = 20, edgecolors = "k")
        ax.set_xlim([X.long.min(), X.long.max()])
        ax.set_ylim([X.lat.min(), X.lat.max()])
        ax.set_title(title)

In [None]:
## 시각화 2 : c = 10000
gammas = [1, 10, 100, 1000]
models = [SVC(kernel='rbf', gamma = i, C = 10000) for i in gammas]
models = [clf.fit(X, y) for clf in models]
titles = [f"gamma = {i}" for i in gammas]

fig, axs = plt.subplots(1, 4, figsize = (16, 3.5))

for clf, title, ax in zip(models, titles, axs.flatten()) :
        disp = DecisionBoundaryDisplay.from_estimator(
                clf,
                X,
                response_method="predict",
                cmap=plt.cm.coolwarm,
                alpha=0.8,
                ax=ax
        )

        ax.scatter(X.long, X.lat, c = y, cmap = plt.cm.coolwarm, s = 20, edgecolors = "k")
        ax.set_xlim([X.long.min(), X.long.max()])
        ax.set_ylim([X.lat.min(), X.lat.max()])
        ax.set_title(title)
        
fig.show()

## **10. 비지도학습 : PCA**

`-` 위도/경도를 KPCA 해본 결과 시각화(components = 2)

In [None]:
gammas = np.logspace(-3, 4, 8)
fig = make_subplots(rows = 2, cols = 4, subplot_titles = [f"gamma = {gamma}" for gamma in gammas])

for i, gamma in enumerate(gammas) :
    kpca = KernelPCA(kernel = "rbf", gamma = gamma, n_components = 2)
    trans_location = kpca.fit_transform(X[["long", "lat"]])
    
    fig.add_trace(
        go.Scatter(
            x = trans_location[:, 0],
            y = trans_location[:, 1],
            marker_color = df_train.price,
            mode = "markers",
            marker=dict(size=2),
            marker_colorscale = "bluyl"
        ),
        row = 1+i//4, col = 1+i%4
    )
    
fig.show() ## 망함, 애초에 구조가 애매함

`-` 중심지(최대 거래가격)와의 거리를 설명변수로 설정

In [None]:
## 변환
central_location = df_train.loc[df_train.price == df_train.price.max(), ["long", "lat"]]
central_long, central_lat = central_location.iloc[0, 0], central_location.iloc[0, 1]
location = df_train[["long", "lat"]]

df_distance = location.assign(long_sq = location.long.map(lambda x : (x-central_long)**2)).assign(lat_sq = location.lat.map(lambda x : (x-central_lat)**2))\
    .assign(distance = lambda _df : (_df.long_sq + _df.lat_sq)**0.5)

## 시각화
fig = make_subplots(rows = 1, cols = 2, subplot_titles = ["distance", "price"])

fig.add_trace(
    go.Scatter(
        x = df_distance.long,
        y = df_distance.lat,
        marker_color = -df_distance.distance, ## 색상 통일시키기 위해 음수로
        mode = "markers",
        marker=dict(size=2),
        marker_colorscale = "bluyl"
    ),
    row = 1, col = 1
)

fig.add_trace(
    go.Scatter(
        x = df_distance.long,
        y = df_distance.lat,
        marker_color = df_train.price,
        mode = "markers",
        marker=dict(size=2),
        marker_colorscale = "bluyl"
    ),
    row = 1, col = 2
)

fig.show()

In [41]:
## 설명변수 차원 축소(위도/경도 -> 중심으로부터의 거리)
X = pd.get_dummies(df_train.drop(["date", "price", "lat", "long"], axis = 1).assign(month = train_date[0]).assign(distance = (df_train.long.map(lambda x : (x - central_long)**2) + df_train.lat.map(lambda x : (x - central_lat)**2))**0.5), drop_first = True, dtype = int)
y = df_train.price

XX = pd.get_dummies(df_test.drop(["date", "price", "lat", "long"], axis = 1).assign(month = test_date[0]).assign(distance = (df_test.long.map(lambda x : (x - central_long)**2) + df_test.lat.map(lambda x : (x - central_lat)**2))**0.5), drop_first = True, dtype = int)
yy = df_test.price

## 선형 모형 적합

## predictor
predictr = sklearn.linear_model.LinearRegression()
predictr.fit(X, y)

## prediction
yyhat = predictr.predict(XX)

## evaluation
scoring_dict["Linear Regression with Reduction"] = np.mean((yy - yyhat)**2)**0.5
print(f"RMSE = {np.mean((yy - yyhat)**2)**0.5:.4f}")

RMSE = 192870.6356


In [12]:
## 변수 제거로 인해 발생한 효과는 아닌지 확인
X = pd.get_dummies(df_train.drop(["date", "price", "lat", "long"], axis = 1).assign(month = train_date[0]), drop_first = True, dtype = int)
y = df_train.price

XX = pd.get_dummies(df_test.drop(["date", "price", "lat", "long"], axis = 1).assign(month = test_date[0]), drop_first = True, dtype = int)
yy = df_test.price

## 선형 모형 적합

## predictor
predictr = sklearn.linear_model.LinearRegression()
predictr.fit(X, y)

## prediction
yyhat = predictr.predict(XX)

## evaluation
rmse2 = np.mean((yy - yyhat)**2)**0.5
print(f"RMSE = {np.mean((yy - yyhat)**2)**0.5:.4f}")

RMSE = 211999.1422


`-` PCA 및 PCR / 사용할 주성분의 수 교차검증

In [22]:
## data
X = pd.get_dummies(df_train.drop(["date", "price", "lat", "long"], axis = 1).assign(month = train_date[0]).assign(distance = (df_train.long.map(lambda x : (x - central_long)**2) + df_train.lat.map(lambda x : (x - central_lat)**2))**0.5), drop_first = True, dtype = int)
y = df_train.price

XX = pd.get_dummies(df_test.drop(["date", "price", "lat", "long"], axis = 1).assign(month = test_date[0]).assign(distance = (df_test.long.map(lambda x : (x - central_long)**2) + df_test.lat.map(lambda x : (x - central_lat)**2))**0.5), drop_first = True, dtype = int)
yy = df_test.price

## 초기화
mean_scores = []
std_scores = []

## CV
for i in range(28) :
    scores = []
    pca = PCA(n_components = i+1)
    pca.fit(X)
    
    X_reduction = pca.transform(X)
    XX_reduction = pca.transform(XX)
    
    kfold = KFold(10, random_state = 14107, shuffle = True)
    
    for train_idx, valid_idx in kfold.split(X_reduction):
        X_train, X_valid = X_reduction[train_idx], X_reduction[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        predictr = sklearn.linear_model.LinearRegression()
        predictr.fit(X_train, y_train)
        yyhat = predictr.predict(X_valid)
        rmse = np.mean((y_valid - yyhat)**2)**0.5
        scores.append(rmse)
    
    mean_scores.append(np.mean(scores))
    std_scores.append(np.std(scores))

In [None]:
## optimized value
indx = np.where(mean_scores == np.min(mean_scores))[0][0]
min_mse = mean_scores[indx]
min_std = std_scores[indx]
simple_indx = np.min(np.where(mean_scores <= min_mse + min_std / np.sqrt(5)))

## plotting
fig, ax = plt.subplots(figsize=(8,8))
ax.errorbar(list(range(1, 29)), mean_scores, yerr = std_scores/np.sqrt(5))
ax.axvline(list(range(1, 29))[indx], c='k', ls='--')
ax.axhline(min_mse + min_std / np.sqrt(5), c = 'k', ls = '--')
ax.set_xlabel('PC$_i$', fontsize=20)
ax.set_ylabel('Cross-validated MSE', fontsize=20)
ax.scatter(list(range(1, 29))[simple_indx], mean_scores[simple_indx],
           color = "red", s = 15, label = "1-se rule selection", zorder = 5)
ax.set_title(f"PC$_i$에서의 CV MSE")

plt.legend()
plt.show()

## optimal lambda
optim_lambda = list(range(1, 29))[indx] ## 26
optim_1se_lambda = list(range(1, 29))[simple_indx] ## 25
print(f"Optimal : {optim_lambda:.4f}")
print(f"1-se rule : {optim_1se_lambda:.4f}")

In [42]:
## data
X = pd.get_dummies(df_train.drop(["date", "price", "lat", "long"], axis = 1).assign(month = train_date[0]).assign(distance = (df_train.long.map(lambda x : (x - central_long)**2) + df_train.lat.map(lambda x : (x - central_lat)**2))**0.5), drop_first = True, dtype = int)
y = df_train.price

XX = pd.get_dummies(df_test.drop(["date", "price", "lat", "long"], axis = 1).assign(month = test_date[0]).assign(distance = (df_test.long.map(lambda x : (x - central_long)**2) + df_test.lat.map(lambda x : (x - central_lat)**2))**0.5), drop_first = True, dtype = int)
yy = df_test.price

pca1 = PCA(n_components = 26)
X_reduction1 = pca1.fit_transform(X)
XX_reduction1 = pca1.transform(XX)

pca2 = PCA(n_components = 25)
X_reduction2 = pca2.fit_transform(X)
XX_reduction2 = pca2.transform(XX)

## 적합 및 평가
predictr1 = sklearn.linear_model.LinearRegression()
predictr1.fit(X_reduction1, y)
yyhat1 = predictr1.predict(XX_reduction1)
scoring_dict["PCR with 26th PC"] = np.mean((yy-yyhat1)**2)**0.5
print(f"RMSE = {np.mean((yy-yyhat1)**2)**0.5}")

predictr2 = sklearn.linear_model.LinearRegression()
predictr2.fit(X_reduction2, y)
yyhat2 = predictr2.predict(XX_reduction2)
print(f"RMSE = {np.mean((yy-yyhat2)**2)**0.5}")

RMSE = 192850.14039133323
RMSE = 195149.43103947132


`-` 선형 모형 간 비교

In [None]:
df_scoring = pd.DataFrame({"Method" : scoring_dict.keys(), "test MSE" : scoring_dict.values()}).sort_values("test MSE", ascending = False).reset_index(drop = True)
tidy = df_scoring.iloc[[4, 5, 6, 7], :]

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x = tidy["test MSE"],
        y = tidy.Method,
        text = tidy["test MSE"],
        texttemplate = "%{text:.4f}",
        orientation = "h",
        marker_line=dict(width=5, color='black')
    )
)

fig.update_yaxes(categoryorder = "total ascending")
fig["data"][0]["marker"]["color"] = ["skyblue", "skyblue", "skyblue", "skyblue"]
fig["data"][0]["marker"]["line"]["color"] = ["skyblue", "skyblue", "red", "red"]

fig.show()

## **11. 결론**

In [None]:
## 각 방법론 별 성능 지표 비교 : RMSE
df_score = pd.DataFrame({"method" : scoring_dict.keys(), "score" : scoring_dict.values()}).iloc[[1, 2, 12, 6, 9, 8]]

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x = df_score.score,
        y = df_score.method,
        text = df_score.score,
        orientation = "h",
        texttemplate = "%{text:.4f}"
    )
)

fig.update_yaxes(categoryorder = "total ascending")
fig.update_layout(height = 800, title = dict(text="각 방법 별 최적 모형에서의 적합 결과 비교(RMSE)", font=dict(size=30), x = 0.5, y = 0.95))
fig["data"][0]["marker"]["color"] = ["skyblue"]*5 + ["orange"]

fig.show()