In [1]:
import warnings


def ignore_warn(*args, **kwargs):
    """Accept any arguments and do nothing; a silent stand-in for ``warnings.warn``."""
    return None


# Swap the stdlib entry point for the no-op above so that warnings issued
# through `warnings.warn` (e.g. from sklearn and seaborn) are never shown.
setattr(warnings, "warn", ignore_warn)

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt  # Matlab-style plotting
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')

from scipy import stats
from scipy.stats import norm, skew #for some statistics

In [2]:
## Load the data
# Read the train and test CSVs with identical options; low_memory=False makes
# pandas parse each file in one pass instead of in chunks.
train_org, test_org = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ("../data/train_set.csv", "../data/test_set.csv")
)

In [3]:
# ## Outlier removal (currently disabled; kept commented out for reference)
# train = train_org.drop(
#     train_org[(train_org["PRICE"]>=100000000) | (train_org["PRICE"]<=500)].index
# )

In [4]:
## Prepare the data
# Stack the train and test rows under a fresh 0..n-1 index, then drop the
# PRICE column from the combined frame.
df_all = (
    pd.concat([train_org, test_org], axis=0)
    .reset_index(drop=True)
    .drop(["PRICE"], axis=1)
)

# Split the combined frame by the value of its SOURCE column.
all_r = df_all.query('SOURCE=="Residential"')
all_c = df_all.query('SOURCE=="Condominium"')

In [5]:
## Check the missing-value rate
# Per-column percentage of null entries: Residential rows first, then a blank
# line, then Condominium rows.
display(all_r.isna().sum().div(len(all_r)).mul(100))
print()
display(all_c.isna().sum().div(len(all_c)).mul(100))

AC                      0.000000
ASSESSMENT_NBHD         0.000000
ASSESSMENT_SUBNBHD     13.823834
AYB                     0.164076
BATHRM                  0.000000
BEDRM                   0.000000
BLDG_NUM                0.000000
CENSUS_BLOCK            0.571675
CENSUS_TRACT            0.000000
CITY                    0.571675
CMPLX_NUM             100.000000
CNDTN                   0.000000
EXTWALL                 0.000000
EYB                     0.000000
FIREPLACES              0.000000
FULLADDRESS             0.578584
GBA                     0.000000
GIS_LAST_MOD_DTTM       0.000000
GRADE                   0.000000
HEAT                    0.000000
HF_BATHRM               0.000000
INTWALL                 0.000000
Id                      0.000000
KITCHENS                0.001727
LANDAREA                0.000000
LATITUDE                0.000000
LIVING_GBA            100.000000
LONGITUDE               0.000000
NATIONALGRID            0.571675
NUM_UNITS               0.000000
QUADRANT  




AC                      0.000000
ASSESSMENT_NBHD         0.000000
ASSESSMENT_SUBNBHD     30.387439
AYB                     0.042167
BATHRM                  0.000000
BEDRM                   0.000000
BLDG_NUM                0.000000
CENSUS_BLOCK          100.000000
CENSUS_TRACT            0.000000
CITY                  100.000000
CMPLX_NUM               0.000000
CNDTN                 100.000000
EXTWALL               100.000000
EYB                     0.000000
FIREPLACES              0.000000
FULLADDRESS           100.000000
GBA                   100.000000
GIS_LAST_MOD_DTTM       0.000000
GRADE                 100.000000
HEAT                    0.000000
HF_BATHRM               0.000000
INTWALL               100.000000
Id                      0.000000
KITCHENS              100.000000
LANDAREA                0.000000
LATITUDE                0.000000
LIVING_GBA              0.000000
LONGITUDE               0.000000
NATIONALGRID          100.000000
NUM_UNITS             100.000000
QUADRANT  

In [6]:
## Impute missing values
# AYB: where it is null, take EYB from the same row. fillna aligns on the
# index, so each gap is filled from its own row.
df_all["AYB"] = df_all["AYB"].fillna(df_all["EYB"])
# YR_RMDL: where it is null, take the (already imputed) AYB from the same row.
df_all["YR_RMDL"] = df_all["YR_RMDL"].fillna(df_all["AYB"])

# Rebuild the per-source subsets so they reflect the imputed columns.
# .copy() makes each subset an independent frame, so later writes to all_r /
# all_c are ordinary assignments rather than chained assignments on a slice
# of df_all.
all_r = df_all.query('SOURCE=="Residential"').copy()
all_c = df_all.query('SOURCE=="Condominium"').copy()

In [7]:
## Check the missing-value rate
# Per-column percentage of null entries: Residential rows first, then a blank
# line, then Condominium rows.
display(all_r.isna().sum().div(len(all_r)).mul(100))
print()
display(all_c.isna().sum().div(len(all_c)).mul(100))

AC                      0.000000
ASSESSMENT_NBHD         0.000000
ASSESSMENT_SUBNBHD     13.823834
AYB                     0.000000
BATHRM                  0.000000
BEDRM                   0.000000
BLDG_NUM                0.000000
CENSUS_BLOCK            0.571675
CENSUS_TRACT            0.000000
CITY                    0.571675
CMPLX_NUM             100.000000
CNDTN                   0.000000
EXTWALL                 0.000000
EYB                     0.000000
FIREPLACES              0.000000
FULLADDRESS             0.578584
GBA                     0.000000
GIS_LAST_MOD_DTTM       0.000000
GRADE                   0.000000
HEAT                    0.000000
HF_BATHRM               0.000000
INTWALL                 0.000000
Id                      0.000000
KITCHENS                0.001727
LANDAREA                0.000000
LATITUDE                0.000000
LIVING_GBA            100.000000
LONGITUDE               0.000000
NATIONALGRID            0.571675
NUM_UNITS               0.000000
QUADRANT  




AC                      0.000000
ASSESSMENT_NBHD         0.000000
ASSESSMENT_SUBNBHD     30.387439
AYB                     0.000000
BATHRM                  0.000000
BEDRM                   0.000000
BLDG_NUM                0.000000
CENSUS_BLOCK          100.000000
CENSUS_TRACT            0.000000
CITY                  100.000000
CMPLX_NUM               0.000000
CNDTN                 100.000000
EXTWALL               100.000000
EYB                     0.000000
FIREPLACES              0.000000
FULLADDRESS           100.000000
GBA                   100.000000
GIS_LAST_MOD_DTTM       0.000000
GRADE                 100.000000
HEAT                    0.000000
HF_BATHRM               0.000000
INTWALL               100.000000
Id                      0.000000
KITCHENS              100.000000
LANDAREA                0.000000
LATITUDE                0.000000
LIVING_GBA              0.000000
LONGITUDE               0.000000
NATIONALGRID          100.000000
NUM_UNITS             100.000000
QUADRANT  

In [8]:
## Fill in missing STORIES from the STYLE text
# Rows of the Residential subset that have no STORIES value.
stories_missing = all_r["STORIES"].isnull()

# Strip spaces and the words "Story", "Fin" and "Unfin" from STYLE in a single
# regex pass, leaving what is presumably just the storey count
# (NOTE(review): assumes STYLE looks like "<number> Story [Fin|Unfin]" -- confirm).
story_tmp = all_r.loc[stories_missing, "STYLE"].replace(
    r" |Story|Fin|Unfin", "", regex=True
)

# Convert to numbers; text that is still non-numeric after stripping becomes
# NaN instead of raising and aborting the cell, so those rows stay missing.
story_tmp = pd.to_numeric(story_tmp, errors="coerce")

# Fill only the gaps (fillna aligns on the index) and rebind all_r to the new
# frame, which avoids a chained assignment into a slice of df_all.
all_r = all_r.assign(STORIES=all_r["STORIES"].fillna(story_tmp))