### King County Data

In [45]:
import warnings

import pandas as pd
import numpy as np

import plotly.graph_objects as go
from plotly.subplots import make_subplots

%matplotlib inline

from sklearn.decomposition import KernelPCA
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Load the raw house-sales table from disk.
housing_price = pd.read_csv("kc_house_data.csv")

## Preprocessing: drop `id`, parse `date` into datetimes, and move `date`
## and `price` to the end of the column order (in that order).
df_preprocessed = (
    housing_price
    .drop(columns=["id", "date", "price"])
    .assign(
        date=pd.to_datetime(housing_price["date"]),
        price=housing_price["price"],
    )
)

## Container for recording model scores.
scoring_dict = {}

In [3]:
def month_days_split(df_train: pd.DataFrame, df_test: pd.DataFrame, origin=None):
    """
    Derive month and day-offset features from the `date` column of the
    train and test sets, so the two can be analysed separately.

    Parameters
    ----------
    df_train, df_test : pd.DataFrame
        Frames that each have a datetime-typed `date` column.
    origin : timestamp-like, optional
        Reference date the day offsets are counted from. When omitted, the
        earliest `date` in the module-level `df_preprocessed` frame is used,
        so train and test offsets share the same zero point.

    Returns
    -------
    tuple of two lists
        `[train_month, train_days], [test_month, test_days]`. The month
        Series holds the month number as a string; the days Series holds
        the whole number of days between each date and `origin`.
    """
    # Resolve the reference date once up front; it is the same for every row,
    # so there is no reason to recompute it inside a per-row callback.
    if origin is None:
        origin = df_preprocessed.date.min()
    else:
        origin = pd.Timestamp(origin)

    def _split(df):
        # One [month, days] pair for a single frame, using vectorised
        # datetime arithmetic instead of an element-wise map.
        dates = df.date
        return [dates.dt.month.astype(str), (dates - origin).dt.days]

    return _split(df_train), _split(df_test)

# Hold out 30% of the rows (shuffled, fixed seed) and renumber the index of
# both parts from zero.
df_train, df_test = (
    subset.reset_index(drop=True)
    for subset in train_test_split(
        df_preprocessed, test_size=0.3, shuffle=True, random_state=14107
    )
)

# Each result is a [month, days] pair of Series derived from the `date` column.
train_date, test_date = month_days_split(df_train, df_test)

In [4]:
# Training features: everything except `date`/`price`, plus the day-offset
# column (element 1 of train_date, i.e. the day-level view of the date).
X = df_train.drop(columns=["date", "price"]).assign(days=train_date[1])
y = df_train["price"]

# Test features and target, built the same way from the held-out rows.
XX = df_test.drop(columns=["date", "price"]).assign(days=test_date[1])
yy = df_test["price"]

In [None]:
# Reference point: the long/lat of the training row(s) with the maximum price.
# Only the first matching row is used below if several rows tie.
central_location = X.loc[df_train.price == df_train.price.max(), ["long", "lat"]]
location = X[["long", "lat"]]

# Squared coordinate offsets from the reference point and the resulting
# straight-line distance in raw long/lat units (not a geodesic distance).
# The reference coordinates are looked up once and applied with vectorised
# Series arithmetic rather than once per row inside a lambda.
df_distance = location.assign(
    long_sq=(location["long"] - central_location.iloc[0, 0]) ** 2,
    lat_sq=(location["lat"] - central_location.iloc[0, 1]) ** 2,
).assign(distance=lambda _df: (_df.long_sq + _df.lat_sq) ** 0.5)