In [None]:
# King County의 집값 예측

딥러닝에서 배운 내용을 토대로 King County의 집값을 예측합니다.

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) as integers. Comparing the raw version strings is
# lexicographic, so for example "0.9" >= "0.20" would wrongly pass.
assert tuple(map(int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())) >= (0, 20)

try:
    # %tensorflow_version only exists in Colab.
    # Outside Colab the magic is unavailable and the attempt fails.
    %tensorflow_version 2.x
except Exception:
    # Best-effort: whatever the magic raises is swallowed here so the rest of
    # the cell still runs.
    pass

# TensorFlow ≥2.0 is required
import re
import tensorflow as tf
from tensorflow import keras
# Compare (major, minor) as integers. Comparing the raw version strings is
# lexicographic, so for example "10.0" >= "2.0" evaluates to False.
assert tuple(map(int, re.match(r"(\d+)\.(\d+)", tf.__version__).groups())) >= (2, 0)

%load_ext tensorboard

# Common imports
import numpy as np
import os

# NOTE: no random seed is set in this cell. The NumPy and TensorFlow seeds are
# set in the cross-validation cell, just before its fold loop starts.

# To plot pretty figures: render inline and set the axis / tick label font sizes
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures: <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>.
# The directory is created when this cell runs; exist_ok=True makes re-running safe.
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "deep"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current pyplot figure under ``IMAGES_PATH``.

    The file is written as ``<fig_id>.<fig_extension>``.

    Args:
        fig_id: File name stem; also echoed in the progress message.
        tight_layout: When true, ``plt.tight_layout()`` is applied before saving.
        fig_extension: Used both as the file suffix and as the ``format``
            argument of ``plt.savefig``.
        resolution: Forwarded to ``plt.savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [2]:
import pandas as pd
from sklearn.utils import shuffle

# Read the raw table and shuffle its rows with a fixed random_state, so the
# position-based validation folds sliced later are the same on every run.
input_file = "kc_house_data.csv"
df = shuffle(pd.read_csv(input_file), random_state=42)

In [3]:
# Encode the sale date as an integer of the form YYYYMMDD so it can be used as
# a numeric feature.
# Casting to str first keeps this cell safe to re-run: once the column holds
# integers such as 20141006, pd.to_datetime would otherwise read them as
# nanoseconds since the epoch and every date would collapse to 19700101.
df['date'] = (
    pd.to_datetime(df['date'].astype(str))
    .dt.strftime('%Y%m%d')
    .astype('int64')
)

In [4]:
# Target: an independent copy of the price column.
y_train = df["price"].copy()
# Features: every remaining column of the frame.
X_train = df.drop(columns=["price"])

In [5]:
X_train

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
735,2591820310,20141006,4,2.25,2070,8893,2.0,0,0,4,8,2070,0,1986,0,98058,47.4388,-122.162,2390,7700
2830,7974200820,20140821,5,3.00,2900,6730,1.0,0,0,5,8,1830,1070,1977,0,98115,47.6784,-122.285,2370,6283
4106,7701450110,20140815,4,2.50,3770,10893,2.0,0,2,3,11,3770,0,1997,0,98006,47.5646,-122.129,3710,9685
16218,9522300010,20150331,3,3.50,4560,14608,2.0,0,2,3,12,4560,0,1990,0,98034,47.6995,-122.228,4050,14226
19964,9510861140,20140714,3,2.50,2550,5376,2.0,0,0,3,9,2550,0,2004,0,98052,47.6647,-122.083,2250,4050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11964,5272200045,20141113,3,1.50,1000,6914,1.0,0,0,3,7,1000,0,1947,0,98125,47.7144,-122.319,1000,6947
21575,9578500790,20141111,3,2.50,3087,5002,2.0,0,0,3,8,3087,0,2014,0,98023,47.2974,-122.349,2927,5183
5390,7202350480,20140930,3,2.50,2120,4780,2.0,0,0,3,7,2120,0,2004,0,98053,47.6810,-122.032,1690,2650
860,1723049033,20140620,1,0.75,380,15000,1.0,0,0,3,5,380,0,1963,0,98168,47.4810,-122.323,1170,15000


In [6]:
def build_model(n_features=20, n_hidden=5, n_units=400):
    """Build and compile the fully connected regression network.

    Layout: ``Flatten`` input -> ``BatchNormalization`` -> ``n_hidden`` x
    (``Dense(n_units, selu)`` -> ``BatchNormalization``) -> ``Dense(1)``.
    Called with no arguments it reproduces the original fixed architecture
    (20 inputs, five hidden layers of 400 units).

    Args:
        n_features: Number of input values per sample.
        n_hidden: Number of hidden Dense/BatchNormalization pairs.
        n_units: Width of each hidden Dense layer.

    Returns:
        A ``keras.models.Sequential`` compiled with MSE loss, the Adam
        optimizer, and MSE reported again as a metric.
    """
    layers = [
        keras.layers.Flatten(input_shape=[n_features]),
        keras.layers.BatchNormalization(),
    ]
    for _ in range(n_hidden):
        layers.append(keras.layers.Dense(n_units, activation="selu"))
        layers.append(keras.layers.BatchNormalization())
    # One output unit with no activation: a single continuous prediction.
    layers.append(keras.layers.Dense(1))

    model = keras.models.Sequential(layers)

    # metrics=["mse"] repeats the loss, but it must stay: evaluate() then
    # returns [loss, mse], and the cross-validation cell unpacks exactly
    # two values from it.
    model.compile(loss="mse", optimizer="adam", metrics=["mse"])

    return model

In [7]:
# Manual k-fold cross-validation of the network returned by build_model().
k = 10
num_val = len(X_train) // k  # rows per validation fold; the trailing len % k rows are never validated on
num_epochs = 100
all_scores = []  # one validation MSE per fold

# Seed NumPy and TensorFlow once, before the first model is created.
np.random.seed(42)
tf.random.set_seed(8)

for i in range(k):
    print('processing fold #', i)

    # Positional bounds of the current validation fold.
    lo, hi = i * num_val, (i + 1) * num_val

    X_val = X_train[lo:hi]
    y_val = y_train[lo:hi]

    # Everything outside [lo, hi) is used for training in this fold.
    X_train_part = np.concatenate([X_train[:lo], X_train[hi:]], axis = 0)
    y_train_part = np.concatenate([y_train[:lo], y_train[hi:]], axis = 0)

    # A fresh model per fold, so no weights carry over between folds.
    model = build_model()
    history = model.fit(
        X_train_part,
        y_train_part,
        epochs = num_epochs,
        verbose = 0,
    )

    # evaluate() returns [loss, metric]. build_model compiles with loss="mse"
    # and metrics=["mse"], so despite its name `val_acc` holds an MSE.
    val_loss, val_acc = model.evaluate(X_val, y_val, verbose = 0)
    all_scores.append(val_acc)

# First line: the per-fold validation MSE values.
print(f"검증 결과 : {all_scores}")

# The label below reads "mean accuracy", but the number is the mean validation MSE.
print(f"평균 정확도 : {np.mean(all_scores)}")

processing fold # 0
processing fold # 1
processing fold # 2
processing fold # 3
processing fold # 4
processing fold # 5
processing fold # 6
processing fold # 7
processing fold # 8
processing fold # 9
검증 결과 : [17026025472.0, 13737507840.0, 13069842432.0, 18337751040.0, 16216544256.0, 12362467328.0, 13354914816.0, 12116746240.0, 15034728448.0, 15236427776.0]
평균 정확도 : 14649295564.8


In [8]:
# Check the value the same way as in the earlier machine-learning work:
# the square root of the mean per-fold MSE, i.e. an error in the units of price.
mean_mse = np.mean(all_scores)
print(f"RMSE값 : {np.sqrt(mean_mse)}")

RMSE값 : 121034.27433913089
