## 2nd level. Zillow Prize: Zillow's Home Value Prediction (Zestimate)

- [자료1](https://www.kaggle.com/sudalairajkumar/simple-exploration-notebook-zillow-prize)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Turn off pandas' chained-assignment (SettingWithCopy) warning.
# NOTE(review): this only hides the warning; it does not make chained writes reliable.
pd.options.mode.chained_assignment = None
# Allow up to 999 columns when a DataFrame is displayed.
pd.options.display.max_columns = 999
%matplotlib inline
# Current seaborn colour palette; later cells pick single colours from it by index.
color = sns.color_palette()

In [None]:
from subprocess import check_output

# Run `ls` on the competition input directory and print its decoded output.
listing = check_output(["ls", "../input/zillow-prize-1"])
print(listing.decode("utf8"))

### train_2016_v2

In [None]:
# Load the 2016 training table; transactiondate is parsed as a datetime column
# so that the .dt accessor can be used on it later.
train = pd.read_csv("../input/zillow-prize-1/train_2016_v2.csv", parse_dates=["transactiondate"])
# Display (rows, columns).
train.shape

In [None]:
# Display the first rows of the training table.
train.head()

아직 정확히 뭔지 모르겠지만, log + error라는 특이한 항목의 float 값들이 보인다. 그려보자.

In [None]:
# Scatter the logerror values in ascending order against their rank,
# which makes the extreme tails easy to spot.
sorted_logerror = np.sort(train["logerror"].values)
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(np.arange(len(sorted_logerror)), sorted_logerror)
# ax.set_xlabel("index", fontsize=12)
ax.set_ylabel("logerror", fontsize=12)
plt.show()

In [None]:
# Clip logerror to its 1st / 99th percentiles.
# Both bounds are computed before anything is modified, and the column is
# reassigned as a whole. The previous chained form
# `train.logerror[mask] = value` writes into a temporary object when pandas
# Copy-on-Write is active, leaving `train` unchanged.
llimit, ulimit = np.percentile(train["logerror"].values, [1, 99])
train["logerror"] = train["logerror"].clip(lower=llimit, upper=ulimit)

In [None]:
# 50-bin histogram of train.logerror.
# sns.histplot is the replacement for the deprecated sns.distplot(..., kde=False).
plt.figure(figsize=(12, 8))
sns.histplot(train["logerror"].values, bins=50)
plt.xlabel("logerror", fontsize=12)
plt.show()

양쪽 끝 1%(1·99 백분위수 바깥)의 값을 outlier로 보고, 해당 경계값으로 잘라내어 boundary 안에 들어오게 바꿨다.

다음으로는 해당 데이터의 생성 날짜로 보이는 데이터를 시각화해서 확인하자.

In [None]:
# Month number of each transaction, extracted from the parsed transactiondate.
train["transaction_month"] = train.transactiondate.dt.month

In [None]:
# Bar chart of the number of transactions in each month.
cnt = train["transaction_month"].value_counts()
plt.figure(figsize=(12, 6))
# x / y are given by keyword: recent seaborn releases reject positional data arguments.
sns.barplot(x=cnt.index, y=cnt.values, alpha=.8, color=color[4])

plt.xticks(rotation="vertical")
plt.xlabel("Month of transaction", fontsize=12)  # label typo ("transcation") fixed
plt.ylabel("Number of Occurrences", fontsize=12)
plt.show()

### properties 2016

In [None]:
# Load the 2016 properties table.
prop = pd.read_csv("../input/zillow-prize-1/properties_2016.csv")

In [None]:
# Display (rows, columns) of the properties table.
prop.shape

In [None]:
# Display the first rows of the properties table.
prop.head()

In [None]:
# Per-column count of missing values in `prop`, restricted to columns that
# have at least one missing value and ordered from fewest to most.
null_counts = prop.isnull().sum(axis=0)
missing = pd.DataFrame({"column_name": null_counts.index,
                        "missing_count": null_counts.values})
missing = missing.loc[missing["missing_count"] > 0].sort_values(by="missing_count")

In [None]:
# Horizontal bar chart with one bar per row of `missing`, sized by its missing_count.
ind = np.arange(len(missing))
fig, ax = plt.subplots(figsize=(12, 18))
rects = ax.barh(ind, missing["missing_count"].values, color="blue")

ax.set_yticks(ind)
ax.set_yticklabels(missing["column_name"].values, rotation="horizontal")
ax.set_xlabel("Count of missing values")
ax.set_title("Number of missing values in each column")
plt.show()

#### missingno 이용하기 (사용법이 정확히 기억나지 않아 아래 코드는 주석 처리해 두었다)

In [None]:
# Disabled attempt at visualising missing values with the missingno package.
# NOTE(review): missingno does not appear to provide a `plot` function, and
# `missing` is the summary table rather than the raw data; `msno.matrix(prop)`
# or `msno.bar(prop)` is presumably what was intended — confirm before enabling.
# import missingno as msno
# msno.plot(missing)
# plt.show()

In [None]:
# Joint distribution of the parcel coordinates.
# sns.jointplot builds its own figure, so the earlier plt.figure(...) call only
# left an extra empty figure behind; the size is given through `height`
# (the current name of the former `size` argument).
grid = sns.jointplot(x=prop.latitude.values, y=prop.longitude.values, height=10)
# Label the joint axes directly instead of whichever axes pyplot considers current.
grid.set_axis_labels("Latitude", "Longitude", fontsize=12)
plt.show()

### train + prop

In [None]:
# Left join on parcelid: every training row is kept and gets the matching property columns.
train = pd.merge(train, prop, on="parcelid", how="left")

In [None]:
# Display the first rows of the merged table.
train.head()

In [None]:
# Allow up to 65 rows when a DataFrame is displayed.
pd.options.display.max_rows = 65

In [None]:
# Two-column table: the column names of `train` (under the header "Count")
# and the dtype of each column (under "Column Type").
dtype_df = train.dtypes.rename_axis("Count").reset_index(name="Column Type")
dtype_df

In [None]:
# Number of columns for each dtype.
dtype_df.groupby("Column Type").aggregate("count").reset_index()

In [None]:
# Missing-value count and rate for every column of the merged table,
# then display the columns that are missing in more than 99.9% of the rows.
null_counts = train.isnull().sum(axis=0)
missing = pd.DataFrame({"column_name": null_counts.index,
                        "missing_count": null_counts.values})
missing["missing_rate"] = missing["missing_count"] / len(train)
missing.loc[missing["missing_rate"] > 0.999]

### Univariate Analysis

In [None]:
# Fill missing values with the column mean, then take an independent copy
# for the correlation analysis.
# numeric_only=True limits the mean to numeric columns: from pandas 2.0 on,
# the default raises a TypeError when the frame also holds non-numeric
# (e.g. string) columns.
mean_values = train.mean(axis=0, numeric_only=True)
train.fillna(mean_values, inplace=True)
train_new = train.copy(deep=True)

In [None]:
# Names of all float64 columns except the target, in column order.
x_cols = [
    name
    for name, dtype in train_new.dtypes.items()
    if name != "logerror" and dtype == "float64"
]

In [None]:
# Pearson correlation of every column in x_cols with logerror, collected in a
# table sorted by ascending correlation.
labels = list(x_cols)
target = train_new["logerror"].values
values = [np.corrcoef(train_new[name].values, target)[0, 1] for name in labels]
corr_df = pd.DataFrame({"col_labels": labels, "corr_values": values})
corr_df = corr_df.sort_values(by="corr_values")

In [None]:
# Horizontal bar chart of the sorted correlation coefficients, one bar per feature.
bar_positions = np.arange(len(labels))
fig, ax = plt.subplots(figsize=(12, 40))
rects = ax.barh(bar_positions, corr_df["corr_values"].to_numpy(), color='y')

ax.set_yticks(bar_positions)
ax.set_yticklabels(corr_df["col_labels"].values, rotation="horizontal")
ax.set_xlabel("Correlation coefficient")
ax.set_title("Correlation coefficient of the variables (logerror)")
plt.show()

In [None]:
# Print how many distinct values (NaN counted as one value) each listed column holds.
# NOTE(review): presumably these are the columns with no usable correlation
# in the chart above — confirm.
corr_zero_cols = [
    "assessmentyear", "storytypeid", "pooltypeid2", "pooltypeid7",
    "pooltypeid10", "poolcnt", "decktypeid", "buildingclasstypeid",
]
for col in corr_zero_cols:
    print(col, train_new[col].nunique(dropna=False))

상관계수 값이 높은 feature를 가져와서 확인해보자.

In [None]:
# Keep the features whose correlation with logerror is above 0.02 or below -0.01.
corr = corr_df["corr_values"]
keep = (corr < -0.01) | (corr > 0.02)
corr_df_sel = corr_df.loc[keep]
corr_df_sel

In [None]:
# Spearman rank-correlation matrix among the selected features.
selected_cols = list(corr_df_sel["col_labels"])
temp_df = train[selected_cols]
corrmat = temp_df.corr(method="spearman")

logerror와의 상관계수가 상대적으로 큰 변수들끼리 서로 얼마나 관련되어 있는지 확인한다.

In [None]:
# Heat map of the correlation matrix on a fixed [-1, 1] colour scale.
f, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(corrmat, vmin=-1, vmax=1, square=True, ax=ax)
ax.set_title("Important variables correlation map", fontsize=15)
plt.show()

#### Finished SquareFeet 12

In [None]:
# Clip finishedsquarefeet12 to its 0.5th / 99.5th percentiles.
# The column is reassigned as a whole: the chained form
# `train[col][mask] = value` writes into a temporary object when pandas
# Copy-on-Write is active and leaves `train` unchanged.
col = "finishedsquarefeet12"
llimit, ulimit = np.percentile(train[col].values, [0.5, 99.5])
train[col] = train[col].clip(lower=llimit, upper=ulimit)

In [None]:
# Joint distribution of finishedsquarefeet12 and logerror.
# sns.jointplot creates its own figure, so no plt.figure(...) call is needed
# (it only produced an extra empty figure); the size is given through `height`
# (the current name of the former `size` argument).
grid = sns.jointplot(x=train.finishedsquarefeet12.values, y=train.logerror.values,
                     height=10, color=color[4])

# Set labels and title on explicit axes of the grid rather than on pyplot's
# current axes, which is not guaranteed to be the joint axes.
grid.set_axis_labels("Finished square feet 12", "Log error", fontsize=12)
grid.ax_marg_x.set_title("Finished square feet 12 Vs Log error", fontsize=12)
plt.show()

#### Calculated finished square feet:

In [None]:
# Clip calculatedfinishedsquarefeet to its 0.5th / 99.5th percentiles.
# The column is reassigned as a whole: the chained form
# `train[col][mask] = value` writes into a temporary object when pandas
# Copy-on-Write is active and leaves `train` unchanged.
col = "calculatedfinishedsquarefeet"
llimit, ulimit = np.percentile(train[col].values, [0.5, 99.5])
train[col] = train[col].clip(lower=llimit, upper=ulimit)

In [None]:
# Joint distribution of calculatedfinishedsquarefeet and logerror.
# sns.jointplot creates its own figure, so no plt.figure(...) call is needed
# (it only produced an extra empty figure); the size is given through `height`
# (the current name of the former `size` argument).
grid = sns.jointplot(x=train.calculatedfinishedsquarefeet.values, y=train.logerror.values,
                     height=10, color=color[5])

# Set labels and title on explicit axes of the grid rather than on pyplot's
# current axes, which is not guaranteed to be the joint axes.
grid.set_axis_labels("Calculated finished square feet", "Log Error", fontsize=12)
grid.ax_marg_x.set_title("Calculated finished square feet Vs Log error", fontsize=15)
plt.show()

#### Bathroom Count:

In [None]:
# Frequency of each bathroomcnt value.
plt.figure(figsize=(12, 8))
# The column is passed as x=...: recent seaborn releases reject positional
# data arguments, and this matches the bedroom count plot.
sns.countplot(x="bathroomcnt", data=train)

plt.xticks(rotation="vertical")
plt.xlabel("Bathroom", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.title("Frequency of Bathroom count", fontsize=12)
plt.show()

In [None]:
# Box plot of logerror for each bathroomcnt value.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x="bathroomcnt", y="logerror", data=train, ax=ax)

plt.xticks(rotation="vertical")
ax.set_xlabel("Bathroom Count", fontsize=12)
ax.set_ylabel("Log error", fontsize=12)
ax.set_title("How log error changes with bathroom count?", fontsize=12)
plt.show()

#### Bedroom count:

In [None]:
# Frequency of each bedroomcnt value.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x="bedroomcnt", data=train, ax=ax)

plt.xticks(rotation="vertical")
ax.set_xlabel("Bedroom Count", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
ax.set_title("Frequency of Bedroom count", fontsize=15)
plt.show()

대강 평균을 3.03으로 하셨답니다. 근데 업데이트 되면서 평균이 3이 된 거 같은데.<br />
아니다, 이상한 값이 사라진 것 같기도 하다.

In [None]:
# Cap bedroomcnt at 7. The column is reassigned as a whole because the chained
# form `train["bedroomcnt"][mask] = 7` does not modify `train` when pandas
# Copy-on-Write is active.
train["bedroomcnt"] = train["bedroomcnt"].clip(upper=7)

In [None]:
# Violin plot of logerror for each bedroomcnt value.
fig, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(x="bedroomcnt", y="logerror", data=train, ax=ax)
ax.set_xlabel("Bedroom count", fontsize=12)
ax.set_ylabel("Log Error", fontsize=12)
plt.show()

In [None]:
# Clip taxamount to its 0.5th / 99.5th percentiles.
# The column is reassigned as a whole: the chained form
# `train[col][mask] = value` writes into a temporary object when pandas
# Copy-on-Write is active and leaves `train` unchanged.
col = "taxamount"
llimit, ulimit = np.percentile(train[col].values, [0.5, 99.5])
train[col] = train[col].clip(lower=llimit, upper=ulimit)

In [None]:
# Joint distribution of taxamount and logerror.
# sns.jointplot creates its own figure, so no plt.figure(...) call is needed
# (it only produced an extra empty figure); the size is given through `height`
# (the current name of the former `size` argument).
grid = sns.jointplot(x=train.taxamount.values, y=train.logerror.values, height=10, color='g')

# Set labels and title on explicit axes of the grid rather than on pyplot's
# current axes, which is not guaranteed to be the joint axes.
grid.set_axis_labels("Tax Amount", "Log Error", fontsize=12)
grid.ax_marg_x.set_title("Tax Amount Vs Log error", fontsize=15)
plt.show()

#### YearBuilt:

In [None]:
from ggplot import *

In [None]:
# logerror against yearbuilt as points plus a smoothing layer.
# NOTE(review): depends on the wildcard-imported `ggplot` package — confirm it
# is still installable in the target environment.
(
    ggplot(aes("yearbuilt", "logerror"), data=train)
    + geom_point(color="steelblue", size=1)
    + stat_smooth()
)

In [None]:
# Parcel coordinates as points, coloured by logerror on a red-to-blue gradient.
(
    ggplot(aes("latitude", "longitude", color="logerror"), data=train)
    + geom_point()
    + scale_color_gradient(low="red", high="blue")
)

In [None]:
# taxamount against finishedsquarefeet12 as semi-transparent points,
# coloured by logerror on a pink-to-blue gradient.
(
    ggplot(aes("finishedsquarefeet12", "taxamount", color="logerror"), data=train)
    + geom_point(alpha=.7)
    + scale_color_gradient(low="pink", high="blue")
)

In [None]:
# Same aesthetics as the previous plot, rendered with geom_now_its_art.
# NOTE(review): what this layer draws is not evident from this file — confirm
# it is intended as an analysis step.
(
    ggplot(aes("finishedsquarefeet12", "taxamount", color="logerror"), data=train)
    + geom_now_its_art()
)

뭐지, 위 그림에서 가장 가까운 그림을 찾아주는 거 같은데 원래 커널로 봐도 새가 안 되는 거 같은데 관련된 그림이 없던 건가<br />
뭐지, 원본 커널은 nice pattern을 찾았다고 좋아하네?<br />
설마 새가 뜻하는 게 지수적 관계를 말하는 건가?

In [None]:
# Split off the target, then drop the identifier, the target, the date fields
# and the columns listed in cat_cols (presumably the non-numeric ones — confirm)
# so that `train` holds only model inputs.
train_y = train["logerror"].values
cat_cols = [
    "hashottuborspa", "propertycountylandusecode", "propertyzoningdesc",
    "fireplaceflag", "taxdelinquencyflag",
]
non_feature_cols = ["parcelid", "logerror", "transactiondate", "transaction_month"]
train = train.drop(columns=non_feature_cols + cat_cols)
feat_names = train.columns.values

In [None]:
from sklearn import ensemble

# Fit an extra-trees regressor on the prepared features with a fixed seed.
et_params = dict(n_estimators=25, max_depth=30, max_features=0.3,
                 n_jobs=-1, random_state=0)
model = ensemble.ExtraTreesRegressor(**et_params)
model.fit(train, train_y)

In [None]:
# Importance of each feature, its standard deviation across the individual
# trees, and the positions of the 20 most important features (descending).
importances = model.feature_importances_
per_tree = np.array([tree.feature_importances_ for tree in model.estimators_])
std = per_tree.std(axis=0)
descending = np.argsort(importances)[::-1]
indices = descending[:20]

In [None]:
# Bar chart of the selected importances, with the across-tree standard
# deviation drawn as error bars.
positions = range(len(indices))
plt.figure(figsize=(12, 12))
plt.bar(positions, importances[indices], yerr=std[indices], color='r',
        align="center")

plt.xticks(positions, feat_names[indices], rotation="vertical")
plt.xlim([-1, len(indices)])
plt.title("Feature importances")
plt.show()

In [None]:
import xgboost as xgb
# Booster parameters for the XGBoost model.
# - "reg:squarederror" is the current name of the squared-error objective;
#   the former alias "reg:linear" is deprecated.
# - "verbosity": 0 replaces the deprecated "silent": 1 flag.
xgb_params = {"eta": 0.05, "max_depth": 8, "subsample": 0.7, "colsample_bytree": 0.7,
              "objective": "reg:squarederror", "verbosity": 0, "seed": 0}

In [None]:
# Train a 50-round booster on the prepared feature matrix.
# - Logging is controlled through `verbosity` (1 = warnings); a legacy `silent`
#   key, if present in xgb_params, is dropped because current XGBoost does not
#   use it.
# - feature_names is passed as a plain list of str rather than a NumPy array.
train_params = {key: value for key, value in xgb_params.items() if key != "silent"}
train_params["verbosity"] = 1
dtrain = xgb.DMatrix(train, label=train_y, feature_names=train.columns.tolist())
model = xgb.train(train_params, dtrain, num_boost_round=50)

In [None]:
# Plot the booster's feature importances (at most 50 features) on a tall axes.
fig, ax = plt.subplots(figsize=(12, 18))
xgb.plot_importance(model, ax=ax, height=0.8, max_num_features=50)
plt.show()