# Data Analysis

In [2]:
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from quantile_forest import RandomForestQuantileRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder


# Fixed seed so NumPy-based randomness in this notebook is repeatable.
random_state = 0
np.random.seed(random_state)  # seeds NumPy's legacy global RNG only

# Competition variables.
# Directory holding the competition CSV files. The trailing slash matters:
# file names are appended to this string by plain concatenation when loading.
base_path = "../prediction-interval-competition-ii-house-price/"
# NOTE(review): presumably the miscoverage level of the prediction intervals
# (0.1 -> 90% intervals) -- confirm against the competition rules.
alpha = 0.1

In [3]:
# Load the two competition tables. Both use their "id" column as the index
# and have "sale_date" parsed into datetimes while reading.
df = pd.read_csv(
    f"{base_path}dataset.csv",
    index_col="id",
    parse_dates=["sale_date"],
)
df_test = pd.read_csv(
    f"{base_path}test.csv",
    index_col="id",
    parse_dates=["sale_date"],
)

## Visualization

In [5]:
# List the column names of the frame loaded from dataset.csv.
print(df.columns)

       'join_year', 'latitude', 'longitude', 'area', 'city', 'zoning',
       'subdivision', 'present_use', 'land_val', 'imp_val', 'year_built',
       'year_reno', 'sqft_lot', 'sqft', 'sqft_1', 'sqft_fbsmt', 'grade',
       'fbsmt_grade', 'condition', 'stories', 'beds', 'bath_full', 'bath_3qtr',
       'bath_half', 'garb_sqft', 'gara_sqft', 'wfnt', 'golf', 'greenbelt',
       'noise_traffic', 'view_rainier', 'view_olympics', 'view_cascades',
       'view_territorial', 'view_skyline', 'view_sound', 'view_lakewash',
       'view_lakesamm', 'view_otherwater', 'view_other', 'submarket'],
      dtype='object')


In [9]:
# Number of rows per city.
# Counter tallies the column in a single pass and, like the explicit loop it
# replaces, keeps keys in order of first appearance. The tally is wrapped in
# defaultdict(int) so `dict_cnt` keeps the same type, display and
# missing-key behaviour (a lookup of an unknown city yields 0) as before.
target = df['city']
dict_cnt = defaultdict(int, Counter(target))

display(dict_cnt)

defaultdict(int,
            {'FEDERAL WAY': 7616,
             'KIRKLAND': 8563,
             'RENTON': 9368,
             'BURIEN': 3823,
             'KING COUNTY': 27127,
             'SEATTLE': 59001,
             'KENT': 9429,
             'SAMMAMISH': 9563,
             'AUBURN': 6249,
             'COVINGTON': 3104,
             'ISSAQUAH': 3671,
             'BELLEVUE': 10691,
             'MAPLE VALLEY': 4958,
             'BOTHELL': 2124,
             'SeaTac': 1571,
             'REDMOND': 4782,
             'LAKE FOREST PARK': 1410,
             'KENMORE': 2561,
             'NORTH BEND': 1033,
             'NEWCASTLE': 1438,
             'SHORELINE': 5142,
             'WOODINVILLE': 1251,
             'MERCER ISLAND': 2200,
             'BLACK DIAMOND': 786,
             'DES MOINES': 2620,
             'SEA-TAC': 160,
             'SNOQUALMIE': 2576,
             'TUKWILA': 1102,
             'DUVALL': 1485,
             'PACIFIC': 602,
             'CARNATION': 354,
  

In [8]:
# Total sale price per city.
# A vectorised groupby replaces the row-by-row iterrows() loop, which builds a
# Series object for every row and is very slow on large frames.
#   * sort=False keeps cities in order of first appearance, matching the
#     insertion order the loop produced.
#   * Rows whose city is missing are skipped (groupby's default dropna=True)
#     and sum() ignores missing prices.
#   * astype(float) keeps the totals as floats, as the defaultdict(float)
#     accumulator did.
# NOTE(review): the result is still bound to the name `dict`, which shadows
# the builtin, because a later cell reads it under that name. Rename both
# cells together when convenient.
target = df['city']  # not used below; kept so the notebook-global `target` is unchanged
dict = defaultdict(
    float,
    df.groupby('city', sort=False)['sale_price'].sum().astype(float).to_dict(),
)

display(dict)

defaultdict(float,
            {'FEDERAL WAY': 2636423788.0,
             'KIRKLAND': 6275040576.0,
             'RENTON': 4101012748.0,
             'BURIEN': 1529973484.0,
             'KING COUNTY': 15160733148.0,
             'SEATTLE': 37070301267.0,
             'KENT': 3464675325.0,
             'SAMMAMISH': 7362169691.0,
             'AUBURN': 2235434831.0,
             'COVINGTON': 1086358346.0,
             'ISSAQUAH': 2706800190.0,
             'BELLEVUE': 8952149663.0,
             'MAPLE VALLEY': 2034117294.0,
             'BOTHELL': 1317291529.0,
             'SeaTac': 493835565.0,
             'REDMOND': 3445516216.0,
             'LAKE FOREST PARK': 829376157.0,
             'KENMORE': 1465381635.0,
             'NORTH BEND': 676486038.0,
             'NEWCASTLE': 1168353904.0,
             'SHORELINE': 2511506996.0,
             'WOODINVILLE': 824492503.0,
             'MERCER ISLAND': 2672095751.0,
             'BLACK DIAMOND': 422692789.0,
             'DES MOINES': 

In [12]:
# Mean sale price per city: each city's price total divided by its row count.
# `dict` here is the per-city price-total mapping built in an earlier cell
# (it shadows the builtin of the same name); `dict_cnt` holds the row counts.
dict_avg = defaultdict(float)
for city, price_total in dict.items():
    dict_avg[city] = price_total / dict_cnt[city]

display(dict_avg)

defaultdict(float,
            {'FEDERAL WAY': 346169.0898109244,
             'KIRKLAND': 732808.6623846783,
             'RENTON': 437768.2267292912,
             'BURIEN': 400202.3238294533,
             'KING COUNTY': 558879.8299848859,
             'SEATTLE': 628299.5418213251,
             'KENT': 367448.8625517022,
             'SAMMAMISH': 769859.8442957231,
             'AUBURN': 357726.8092494799,
             'COVINGTON': 349986.58054123714,
             'ISSAQUAH': 737346.8237537455,
             'BELLEVUE': 837353.8175100551,
             'MAPLE VALLEY': 410269.7244856797,
             'BOTHELL': 620193.7518832391,
             'SeaTac': 314344.7262889879,
             'REDMOND': 720517.8201589293,
             'LAKE FOREST PARK': 588210.0404255319,
             'KENMORE': 572191.1889886763,
             'NORTH BEND': 654875.1577928364,
             'NEWCASTLE': 812485.3296244785,
             'SHORELINE': 488429.9875534811,
             'WOODINVILLE': 659066.7490007994,
 