In [1]:
import warnings
def ignore_warn(*_args, **_kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None

# Swap the no-op in for warnings.warn to silence the noisy sklearn / seaborn warnings.
warnings.warn = ignore_warn

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt  # Matlab-style plotting
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')

from scipy import stats
from scipy.stats import norm, skew #for some statistics

In [2]:
## Load the data
# Read the train and test CSV files (in that order) with the same reader options.
train_org, test_org = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ("../data/train_set.csv", "../data/test_set.csv")
)

In [3]:
## Prepare the data
# Stack the train and test rows into one frame with a fresh 0..n-1 index,
# then remove the PRICE column from the combined frame.
df_all = (
    pd.concat([train_org, test_org], axis=0)
    .reset_index(drop=True)
    .drop(columns=["PRICE"])
)

# Split the rows by their SOURCE value. The query() results are selections of
# df_all; take explicit copies so that later cells which modify all_r / all_c
# work on independent frames rather than on a slice of df_all
# (avoids the SettingWithCopy situation).
all_r = df_all.query('SOURCE=="Residential"').copy()
all_c = df_all.query('SOURCE=="Condominium"').copy()

In [4]:
## Remove the columns that turned out to be unnecessary during missing-value imputation
useless_cols_r = ["Id","ASSESSMENT_SUBNBHD","FULLADDRESS","NATIONALGRID",
                  "CENSUS_BLOCK","CITY","STATE","X","Y","QUADRANT"]
useless_cols_c = ["Id","ASSESSMENT_SUBNBHD"]

# Drop each list in a single call instead of one in-place drop per column:
# if a listed column is missing, the KeyError is raised before anything is
# removed, so the frame is never left half-modified.
all_r = all_r.drop(columns=useless_cols_r)
all_c = all_c.drop(columns=useless_cols_c)

In [5]:
## Also check the number of unique values per column and remove every column
## that has at most one distinct non-missing value (all-missing columns included)
def _drop_constant_columns(frame):
    """Print and drop the columns of ``frame`` with <= 1 distinct non-null value.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the constant / all-missing columns.
    """
    # Decide first, drop afterwards: the frame is never mutated while its
    # columns are being iterated.
    constant_cols = [
        col for col in frame.columns
        if len(frame[col].dropna().unique()) <= 1
    ]
    for col in constant_cols:
        print(col)
    return frame.drop(columns=constant_cols)


all_r = _drop_constant_columns(all_r)

print("~"*30)

all_c = _drop_constant_columns(all_c)

CMPLX_NUM
GIS_LAST_MOD_DTTM
LIVING_GBA
SOURCE
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BLDG_NUM
CENSUS_BLOCK
CITY
CNDTN
EXTWALL
FULLADDRESS
GBA
GIS_LAST_MOD_DTTM
GRADE
INTWALL
KITCHENS
NATIONALGRID
NUM_UNITS
ROOF
SOURCE
STATE
STORIES
STRUCT
STYLE


In [6]:
## Sort the columns into numeric and categorical
# all_r.info()

## Columns stored as numbers that are really categorical
col_move = ["SALE_NUM", "USECODE", "ZIPCODE"]


def _split_numeric_categorical(frame, force_categorical):
    """Return ``(numeric_cols, categorical_cols)`` for ``frame``.

    A column is numeric when its dtype is int64 or float64; every other
    column is categorical. Names in ``force_categorical`` that landed in the
    numeric list are moved to the end of the categorical list, in the order
    given.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose columns are classified.
    force_categorical : list of str
        Column names to treat as categorical regardless of dtype.

    Returns
    -------
    tuple of (list, list)
        Numeric column names and categorical column names.
    """
    numeric_cols = []
    categorical_cols = []
    for col in frame.columns:
        if frame[col].dtype in (np.int64, np.float64):
            numeric_cols.append(col)
        else:
            categorical_cols.append(col)

    for col in force_categorical:
        # Move only what is actually in the numeric list. A column that is
        # already categorical (or absent from this frame) is left alone
        # instead of raising ValueError from list.remove part-way through.
        if col in numeric_cols:
            numeric_cols.remove(col)
            categorical_cols.append(col)

    return numeric_cols, categorical_cols


nc_resi, cc_resi = _split_numeric_categorical(all_r, col_move)
nc_cond, cc_cond = _split_numeric_categorical(all_c, col_move)

In [7]:
## Check the number of unique values in each categorical column
# Each entry: (heading to print, frame to inspect, categorical column names).
sections = (
    ("Resi", all_r, cc_resi),
    ("Cond", all_c, cc_cond),
)
for position, (heading, frame, cat_cols) in enumerate(sections):
    if position > 0:
        # Blank line between the two sections.
        print()
    print(heading)
    for col in cat_cols:
        # unique() keeps NaN as a value, so missing data counts as one entry.
        print(col, len(frame[col].unique()))

Resi
AC 3
ASSESSMENT_NBHD 55
CNDTN 7
EXTWALL 24
GRADE 13
HEAT 14
INTWALL 12
QUALIFIED 2
ROOF 16
SALEDATE 6440
SQUARE 3119
STRUCT 8
STYLE 18
WARD 8
SALE_NUM 15
USECODE 8
ZIPCODE 21

Cond
AC 3
ASSESSMENT_NBHD 47
HEAT 14
QUADRANT 4
QUALIFIED 2
SALEDATE 5623
SQUARE 998
WARD 8
SALE_NUM 10
USECODE 4
ZIPCODE 23


In [8]:
## For now, remove SALEDATE and SQUARE, which have too many distinct values
# NOTE(review): only the frames are changed here; the cc_resi / cc_cond lists
# built earlier are not updated by this cell.
for frame in (all_r, all_c):
    frame.drop(columns=["SALEDATE", "SQUARE"], inplace=True)

In [9]:
## Inspect the remaining columns
# list(all_r.columns)
# NOTE(review): cc_cond is the list built during the numeric/categorical split.
# The recorded output still shows SALEDATE and SQUARE, which the previous cell
# dropped from all_c, so this list no longer matches the frame's columns.
cc_cond

['AC',
 'ASSESSMENT_NBHD',
 'HEAT',
 'QUADRANT',
 'QUALIFIED',
 'SALEDATE',
 'SQUARE',
 'WARD',
 'SALE_NUM',
 'USECODE',
 'ZIPCODE']