In [6]:
import platform
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Keep Korean (Hangul) text in plots from rendering as broken glyphs by
# choosing a font family according to the operating system.
_HANGUL_FONT_BY_OS = {
    'Darwin': 'AppleGothic',     # macOS
    'Windows': 'Malgun Gothic',  # Windows
}
_hangul_font = _HANGUL_FONT_BY_OS.get(platform.system())
if _hangul_font is not None:
    # On any other platform the font family is left unchanged.
    plt.rcParams["font.family"] = _hangul_font

# Keep the minus sign (-) from rendering as a broken glyph.
plt.rcParams['axes.unicode_minus'] = False

In [7]:
# Check which files are present in the data directory.
import os
data_dir_listing = os.listdir("./input_2019-2nd-ml-month-with-kakr/")
print(data_dir_listing)

['sample_submission.csv', 'test.csv', 'train.csv']


In [8]:
# Read the train and test CSV files into DataFrames (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./input_2019-2nd-ml-month-with-kakr/train.csv",
        "./input_2019-2nd-ml-month-with-kakr/test.csv",
    )
)

In [9]:
# Show the (rows, columns) shape of the training DataFrame.
train.shape

(15035, 21)

In [10]:
# Show the (rows, columns) shape of the test DataFrame.
test.shape

(6468, 20)

In [11]:
# Build the feature DataFrame used for training: every column of `train`
# except the target column "price".
X_train = train.drop(columns=["price"])

In [12]:
# Remove the "id" and "date" columns from the training features.
# X_train is rebuilt from `train` rather than having columns deleted in
# place, so this cell is idempotent: deleting with `del` raises KeyError
# as soon as the cell is executed a second time.
X_train = train.drop(columns=["price", "id", "date"])

In [13]:
# Print a summary of the feature DataFrame (column names, non-null counts, dtypes).
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15035 entries, 0 to 15034
Data columns (total 18 columns):
bedrooms         15035 non-null int64
bathrooms        15035 non-null float64
sqft_living      15035 non-null int64
sqft_lot         15035 non-null int64
floors           15035 non-null float64
waterfront       15035 non-null int64
view             15035 non-null int64
condition        15035 non-null int64
grade            15035 non-null int64
sqft_above       15035 non-null int64
sqft_basement    15035 non-null int64
yr_built         15035 non-null int64
yr_renovated     15035 non-null int64
zipcode          15035 non-null int64
lat              15035 non-null float64
long             15035 non-null float64
sqft_living15    15035 non-null int64
sqft_lot15       15035 non-null int64
dtypes: float64(4), int64(14)
memory usage: 2.1 MB


In [14]:
# Pull out the label column ("price") on its own; it is the target the
# models are checked against.
Y_train = train.loc[:, "price"]

In [16]:
# Build the feature DataFrame used for prediction the same way: a new frame
# holding every column of `test` except "id" and "date" (`test` itself is
# left untouched).
X_test = test.drop(columns=["id", "date"])

In [18]:
# Train a Decision Tree Regressor and measure its RMSE on the training data.
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# random_state fixes the seed so the fitted tree, and every score derived
# from it, is reproducible across runs.
decition_tree_regressor = DecisionTreeRegressor(random_state=42)
decition_tree_regressor.fit(X_train, Y_train)
Y_pred = decition_tree_regressor.predict(X_train)
# Argument order is (y_true, y_pred).
decition_tree_regressor_mse = mean_squared_error(Y_train, Y_pred)
decition_tree_regressor_rmse = np.sqrt(decition_tree_regressor_mse)
# NOTE: this score is computed on the same rows the tree was fitted on, so
# it does not measure generalisation (the recorded value is 0.0).
decition_tree_regressor_rmse

0.0

In [20]:
# Check the decision tree by splitting the data set: 10-fold cross-validation.
from sklearn.model_selection import cross_val_score

# The "neg_mean_squared_error" scorer yields negated MSE values, so the sign
# is flipped before taking the square root to obtain per-fold RMSE.
scores = cross_val_score(
    estimator=decition_tree_regressor,
    X=X_train,
    y=Y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
tree_rmse_scores = np.sqrt(np.negative(scores))

In [21]:
def display_scores(scores):
    """Print a collection of scores followed by its mean and standard deviation.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` methods, e.g. a
            NumPy array of per-fold scores.

    Returns:
        None. Three lines are written to standard output.
    """
    print("Scores:", scores)
    # Each statistic is looked up and computed right before it is printed.
    for label, stat_name in (("Mean:", "mean"), ("Standard deviation:", "std")):
        print(label, getattr(scores, stat_name)())
# Report the cross-validated RMSE scores of the decision tree.
display_scores(tree_rmse_scores)

Scores: [205520.20561967 220088.44806548 166783.37254482 175455.98495662
 160691.10992116 212725.79719518 186913.44208443 177910.98358101
 172228.86857461 168374.84304586]
Mean: 184669.30555888402
Standard deviation: 19817.09665965277


In [22]:
# Train a Random Forest Regressor and measure its RMSE on the training data.
from sklearn.ensemble import RandomForestRegressor

# n_estimators is pinned because the library default changed from 10 to 100
# trees in scikit-learn 0.22; with the newer default this cell would be
# identical to the 100-tree cell that follows.
# NOTE(review): 10 is assumed to be the default the recorded output was
# produced with — confirm against the scikit-learn version used.
# random_state fixes the seed so the score is reproducible across runs.
forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg.fit(X_train, Y_train)
Y_pred = forest_reg.predict(X_train)
# Argument order is (y_true, y_pred).
forest_mse = mean_squared_error(Y_train, Y_pred)
forest_rmse = np.sqrt(forest_mse)
forest_rmse



58148.507239315346

In [23]:
# Train a random forest with 100 trees and measure its RMSE on the training data.
# random_state fixes the seed so the score is reproducible across runs.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(X_train, Y_train)
Y_pred = forest_reg.predict(X_train)
# Argument order is (y_true, y_pred).
forest_mse = mean_squared_error(Y_train, Y_pred)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

49133.96193069751

In [24]:
# Evaluate the random forest with 10-fold cross-validation.
scores = cross_val_score(forest_reg, X_train, Y_train,
                         scoring="neg_mean_squared_error", cv=10)
# The scorer yields negated MSE values; flip the sign before the square root.
forest_rmse_scores = np.sqrt(-scores)
# display_scores is defined in an earlier cell of this notebook. The
# identical second definition that used to sit here was dropped so the
# helper has a single definition.
display_scores(forest_rmse_scores)

Scores: [145570.57677713 154015.39311522 115760.05801892 146252.34764347
 123739.46041841 145119.36836838 138940.27642737 119394.54176353
 129007.70398718 128687.66019058]
Mean: 134648.73867102028
Standard deviation: 12380.99283883374


In [25]:
# Build the submission DataFrame from Random Forest predictions on the test features.

# random_state fixes the seed so the submitted predictions are reproducible
# across runs.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(X_train, Y_train)
# From here on Y_pred holds predictions for the test rows.
Y_pred = forest_reg.predict(X_test)

submission = pd.DataFrame({
        "id": test["id"],
        "price": Y_pred
    })
submission.shape

(6468, 2)

In [26]:
# Preview the first 10 rows of the submission DataFrame.
submission.head(10)

Unnamed: 0,id,price
0,15035,486588.0
1,15036,494155.25
2,15037,1316290.2
3,15038,289291.2
4,15039,319590.45
5,15040,326994.06
6,15041,464808.34
7,15042,711404.71
8,15043,308039.98
9,15044,645416.75


In [27]:
# Write the submission DataFrame to a CSV file, leaving out the row index.
submission.to_csv(path_or_buf='submission_HPP_rf.csv', index=False)