## 1. 데이터, 패키지 불러오기

In [144]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer,IterativeImputer                          

from sklearn.compose import ColumnTransformer,make_column_transformer

from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge

In [145]:
train = pd.read_csv("PJT002_train.csv", encoding = 'utf8')
test = pd.read_csv("PJT002_test.csv", encoding = 'utf-8')
validation = pd.read_csv("PJT002_validation.csv", encoding = 'utf8')

  interactivity=interactivity, compiler=compiler, result=result)


## 2. 전처리

1. 건물 승인일자 년 단위로 자르기 함수 (소수점 포함 10자리인 글자만 찾아 자르기)

   그 후, 2019에서 뺀 값으로 대체

In [146]:
def cut_func(x):
    if len(str(x)) == 10:
        return str(x)[:4]

In [147]:
train['dt_of_athrztn'] = train['dt_of_athrztn'].apply(cut_func)
test['dt_of_athrztn'] = test['dt_of_athrztn'].apply(cut_func)
validation['dt_of_athrztn'] = validation['dt_of_athrztn'].apply(cut_func)

In [148]:
train['dt_of_athrztn'] = pd.to_numeric(train['dt_of_athrztn'])
test['dt_of_athrztn'] = pd.to_numeric(test['dt_of_athrztn'])
validation['dt_of_athrztn'] = pd.to_numeric(validation['dt_of_athrztn'])

In [149]:
train['dt_of_athrztn'] = 2019 - train['dt_of_athrztn']
test['dt_of_athrztn'] = 2019 - test['dt_of_athrztn']
validation['dt_of_athrztn'] = 2019 - validation['dt_of_athrztn']

2. 지상층, 지하층, 건물채수 카테고리화

In [150]:
train_data = [train, test, validation]

In [151]:
# 지상층 카테고리화

for dataset in train_data:
    dataset.loc[dataset['ttl_grnd_flr'] == 0, 'ttl_grnd_flr'] = 0,
    dataset.loc[(dataset['ttl_grnd_flr'] >= 1) &  (dataset['ttl_grnd_flr'] <= 10), 'ttl_grnd_flr'] = 10, 
    dataset.loc[(dataset['ttl_grnd_flr'] > 10) &  (dataset['ttl_grnd_flr'] <= 20), 'ttl_grnd_flr'] = 20,
    dataset.loc[(dataset['ttl_grnd_flr'] > 20) &  (dataset['ttl_grnd_flr'] <= 30), 'ttl_grnd_flr'] = 30,
    dataset.loc[(dataset['ttl_grnd_flr'] > 30) &  (dataset['ttl_grnd_flr'] <= 40), 'ttl_grnd_flr'] = 40,
    dataset.loc[(dataset['ttl_grnd_flr'] > 40) &  (dataset['ttl_grnd_flr'] <= 55), 'ttl_grnd_flr'] = 55,
    dataset.loc[dataset['ttl_grnd_flr'] > 55, 'ttl_grnd_flr'] = 56

In [152]:
# 지하층 카테고리화

for dataset in train_data:
    dataset.loc[dataset['ttl_dwn_flr'] == 0, 'ttl_dwn_flr'] = 0,
    dataset.loc[(dataset['ttl_dwn_flr'] >= 1) &  (dataset['ttl_dwn_flr'] <= 4), 'ttl_dwn_flr'] = 4, 
    dataset.loc[(dataset['ttl_dwn_flr'] > 4) &  (dataset['ttl_dwn_flr'] <= 10), 'ttl_dwn_flr'] = 10,
    dataset.loc[(dataset['ttl_dwn_flr'] > 10) &  (dataset['ttl_dwn_flr'] <= 20), 'ttl_dwn_flr'] = 20,
    dataset.loc[dataset['ttl_dwn_flr'] > 20, 'ttl_dwn_flr'] = 21

In [153]:
# 건물 채수 카테고리화

for dataset in train_data:
    dataset.loc[dataset['bldng_cnt'] == 0, 'bldng_cnt'] = 0,
    dataset.loc[(dataset['bldng_cnt'] >= 1) &  (dataset['bldng_cnt'] <= 5), 'bldng_cnt'] = 5, 
    dataset.loc[(dataset['bldng_cnt'] > 5) &  (dataset['bldng_cnt'] <= 10), 'bldng_cnt'] = 10,
    dataset.loc[(dataset['bldng_cnt'] > 10) &  (dataset['bldng_cnt'] <= 20), 'bldng_cnt'] = 20,
    dataset.loc[(dataset['bldng_cnt'] > 20) &  (dataset['bldng_cnt'] <= 30), 'bldng_cnt'] = 30,
    dataset.loc[dataset['bldng_cnt'] > 30, 'bldng_cnt'] = 31

### + nan 인지 아닌지 여부를 나타내는 column 추가 (그냥 해본 것) 

In [154]:
train.isnull().sum()

dt_of_fr                        0
fr_yn                           0
bldng_us                    27677
bldng_archtctr              27665
bldng_cnt                       0
                            ...  
us_yn                       49447
dngrs_thng_yn               49447
slf_fr_brgd_yn              49447
blk_dngrs_thng_mnfctr_yn    49447
cltrl_hrtg_yn               49447
Length: 180, dtype: int64

In [155]:
new_train = train.copy()

cols_with_missing = (col for col in new_train.columns 
                                 if new_train[col].isnull().any())
for col in cols_with_missing:
    new_train[col + '_was_missing'] = new_train[col].isnull()

In [156]:
new_train.head()

Unnamed: 0,dt_of_fr,fr_yn,bldng_us,bldng_archtctr,bldng_cnt,bldng_ar,ttl_ar,lnd_ar,dt_of_athrztn,ttl_grnd_flr,...,bldng_ar_prc_was_missing,fr_mn_cnt_was_missing,trgt_crtr_was_missing,fr_fghtng_fclt_spcl_css_5_yn_was_missing,fr_fghtng_fclt_spcl_css_6_yn_was_missing,us_yn_was_missing,dngrs_thng_yn_was_missing,slf_fr_brgd_yn_was_missing,blk_dngrs_thng_mnfctr_yn_was_missing,cltrl_hrtg_yn_was_missing
0,2017-10-20 05:54,Y,단독주택,블록구조,5,69.42,69.42,0.0,,10.0,...,True,False,True,True,True,True,True,True,True,True
1,2018-09-30 08:26,N,,,5,46.29,46.29,0.0,,10.0,...,False,False,True,True,True,True,True,True,True,True
2,2016-10-30 14:57,Y,공동주택,철근콘크리트구조,5,583.8,2516.76,1446.0,19.0,10.0,...,False,False,True,True,True,True,True,True,True,True
3,2016-06-14 05:23,N,단독주택,일반목구조,5,48.92,48.92,0.0,,10.0,...,False,False,True,True,True,True,True,True,True,True
4,2018-04-22 05:38,N,,,5,0.0,0.0,0.0,,,...,True,False,True,True,True,True,True,True,True,True


## 3. 최종 pipeline

In [157]:
building_categorical_feature_1 = ['bldng_us', 'bldng_archtctr', 'bldng_us_clssfctn']
building_categorical_feature_2 = ['ttl_grnd_flr', 'ttl_dwn_flr', 'bldng_cnt']
building_numeric_feature = ['bldng_ar', 'ttl_ar', 'lnd_ar']
building_date = ['dt_of_athrztn']

building_categorical_transformer_1 = Pipeline(steps = [
    ('imp', SimpleImputer(strategy = 'constant', fill_value = 'blank')),
    ('OneHotEncoer', OneHotEncoder(handle_unknown = 'ignore', sparse = False))
])

building_categorical_transformer_2 = Pipeline(steps = [
    ('zero', SimpleImputer(strategy = 'constant', fill_value = 0)),
    ('OneHotEncoer', OneHotEncoder(handle_unknown = 'ignore', sparse = False))
])

building_numeric_transformer = Pipeline(steps = [
    ('scaling', StandardScaler()),
    ('pca', PCA()),
])


building_date_transformer = Pipeline([
    ('imp', SimpleImputer(strategy = 'constant', fill_value = 0))
])


processor = ColumnTransformer(
    transformers = [
    ('cat1', building_categorical_transformer_1, building_categorical_feature_1),
    ('cat2', building_categorical_transformer_2, building_categorical_feature_2),
    ('num', building_numeric_transformer, building_numeric_feature),
    ('date', building_date_transformer, building_date)
])


pipe = Pipeline(steps = [('processor', processor)
])

In [158]:
pipe.fit_transform(train)

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -8.80615594e-03, -3.14938998e-02,  0.00000000e+00],
       [ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -8.54311329e-03, -3.32894353e-02,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
         6.89478937e-03,  4.49195465e-03,  1.90000000e+01],
       ...,
       [ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -3.66272430e-02,  1.50212322e-01,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -1.20899012e-02, -1.47619928e-02,  1.40000000e+01],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -1.00454212e-02, -2.39094719e-02,  1.70000000e+01]])

In [159]:
pipe.fit_transform(test)

array([[ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -3.55668817e-02, -3.02043555e-02,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
        -2.30308171e-02,  2.97517406e-02,  1.80000000e+01],
       [ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         2.02863515e-02,  3.01371394e-01,  0.00000000e+00],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -3.49151746e-02, -2.13867078e-02,  7.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -3.14238219e-02, -1.37903765e-02,  1.00000000e+01],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -2.53647625e-02,  7.63189983e-02,  2.10000000e+01]])

In [160]:
pipe.fit_transform(validation)

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -2.59325730e-02, -2.26087926e-02,  3.30000000e+01],
       [ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -2.65925114e-02, -2.09072409e-02,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -2.71128275e-02, -2.12964202e-02,  1.30000000e+01],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -2.58089422e-02, -2.14643832e-02,  1.90000000e+01],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -2.70887524e-02, -2.04769465e-02,  2.60000000e+01],
       [ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00, ...,
        -2.66056182e-02, -1.08727928e-02,  1.60000000e+01]])

## 추가: imputation하는 다른 방법들

참고: https://data-newbie.tistory.com/257

제가 맡은 변수에서는 nan 개수가 유의미하게 줄어들지 않았습니다. 효과는 잘 모르겠습니다...

### 1) using datawig

https://pypi.org/project/datawig/

In [19]:
from datawig import SimpleImputer
from datawig.utils import random_split
from sklearn.metrics import f1_score, classification_report
df_train, df_test = random_split(train, split_ratios=[0.8, 0.2])
# Initialize a SimpleImputer model
imputer = SimpleImputer(
input_columns=['bldng_cnt', 'bldng_ar','ttl_ar', 'lnd_ar', 'ground_floor', 'down_floor'],  # columns containing information about 
output_column='bldng_us',  # the column we'd like to impute values for
output_path='imputer_model'  # stores model data and metrics
)

# Fit an imputer model on the train data
imputer.fit(train_df= df_train, num_epochs=10)

# Impute missing values and return original dataframe with predictions
predictions = imputer.predict(df_test)

2019-11-30 11:49:28,103 [INFO]  CategoricalEncoder for column bldng_us                                found only 95 occurrences of value 업무시설
2019-11-30 11:49:28,104 [INFO]  CategoricalEncoder for column bldng_us                                found only 90 occurrences of value 위험물저장및처리시설
2019-11-30 11:49:28,106 [INFO]  CategoricalEncoder for column bldng_us                                found only 83 occurrences of value 자동차관련시설
2019-11-30 11:49:28,107 [INFO]  CategoricalEncoder for column bldng_us                                found only 42 occurrences of value 판매시설
2019-11-30 11:49:28,108 [INFO]  CategoricalEncoder for column bldng_us                                found only 38 occurrences of value 분뇨.쓰레기처리시설
2019-11-30 11:49:28,109 [INFO]  CategoricalEncoder for column bldng_us                                found only 34 occurrences of value 운동시설
2019-11-30 11:49:28,110 [INFO]  CategoricalEncoder for column bldng_us                                found only 27 occurrences of va

2019-11-30 11:49:45,452 [INFO]  Saved checkpoint to "imputer_model\model-0007.params"
2019-11-30 11:49:45,599 [INFO]  Epoch[7] Validation-cross-entropy=1.097753
2019-11-30 11:49:45,600 [INFO]  Epoch[7] Validation-bldng_us-accuracy=0.690951
2019-11-30 11:49:46,525 [INFO]  Epoch[8] Batch [0-708]	Speed: 12292.54 samples/sec	cross-entropy=1.058386	bldng_us-accuracy=0.691379
2019-11-30 11:49:47,356 [INFO]  Epoch[8] Train-cross-entropy=1.037747
2019-11-30 11:49:47,358 [INFO]  Epoch[8] Train-bldng_us-accuracy=0.700779
2019-11-30 11:49:47,360 [INFO]  Epoch[8] Time cost=1.759
2019-11-30 11:49:47,368 [INFO]  Saved checkpoint to "imputer_model\model-0008.params"
2019-11-30 11:49:47,502 [INFO]  Epoch[8] Validation-cross-entropy=1.097040
2019-11-30 11:49:47,503 [INFO]  Epoch[8] Validation-bldng_us-accuracy=0.690951
2019-11-30 11:49:48,359 [INFO]  Epoch[9] Batch [0-708]	Speed: 13292.21 samples/sec	cross-entropy=1.056362	bldng_us-accuracy=0.690762
2019-11-30 11:49:49,251 [INFO]  Epoch[9] Train-cross-

In [20]:
imputer.fit(train)

2019-11-30 11:49:58,282 [INFO]  
  allow_missing=allow_missing, force_init=force_init)
2019-11-30 11:49:59,824 [INFO]  Epoch[0] Batch [0-886]	Speed: 9235.20 samples/sec	cross-entropy=1.056065	bldng_us-accuracy=0.693842
2019-11-30 11:50:01,128 [INFO]  Epoch[0] Train-cross-entropy=1.044258
2019-11-30 11:50:01,130 [INFO]  Epoch[0] Train-bldng_us-accuracy=0.698446
2019-11-30 11:50:01,131 [INFO]  Epoch[0] Time cost=2.845
2019-11-30 11:50:01,140 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2019-11-30 11:50:01,346 [INFO]  Epoch[0] Validation-cross-entropy=1.089418
2019-11-30 11:50:01,348 [INFO]  Epoch[0] Validation-bldng_us-accuracy=0.675565
2019-11-30 11:50:02,593 [INFO]  Epoch[1] Batch [0-886]	Speed: 11425.64 samples/sec	cross-entropy=1.048191	bldng_us-accuracy=0.694687
2019-11-30 11:50:03,757 [INFO]  Epoch[1] Train-cross-entropy=1.039151
2019-11-30 11:50:03,758 [INFO]  Epoch[1] Train-bldng_us-accuracy=0.699294
2019-11-30 11:50:03,759 [INFO]  Epoch[1] Time cost=2.410
2019-1

2019-11-30 11:50:34,276 [INFO]  Epoch[13] Train-cross-entropy=1.011944
2019-11-30 11:50:34,278 [INFO]  Epoch[13] Train-bldng_us-accuracy=0.703884
2019-11-30 11:50:34,279 [INFO]  Epoch[13] Time cost=2.172
2019-11-30 11:50:34,287 [INFO]  Saved checkpoint to "imputer_model\model-0013.params"
2019-11-30 11:50:34,460 [INFO]  Epoch[13] Validation-cross-entropy=1.074693
2019-11-30 11:50:34,461 [INFO]  Epoch[13] Validation-bldng_us-accuracy=0.676193
2019-11-30 11:50:35,532 [INFO]  Epoch[14] Batch [0-886]	Speed: 13281.03 samples/sec	cross-entropy=1.024372	bldng_us-accuracy=0.696167
2019-11-30 11:50:36,627 [INFO]  Epoch[14] Train-cross-entropy=1.009973
2019-11-30 11:50:36,628 [INFO]  Epoch[14] Train-bldng_us-accuracy=0.703531
2019-11-30 11:50:36,629 [INFO]  Epoch[14] Time cost=2.167
2019-11-30 11:50:36,637 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2019-11-30 11:50:36,794 [INFO]  Epoch[14] Validation-cross-entropy=1.072872
2019-11-30 11:50:36,796 [INFO]  Epoch[14] Validation-b

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


<datawig.simple_imputer.SimpleImputer at 0x12a8cea7be0>

### 2) using fancyimpute

https://pypi.org/project/fancyimpute/

In [57]:
from fancyimpute import SoftImpute
import pandas as pd

class SoftImputeDf(SoftImpute):
    """DataFrame Wrapper around SoftImpute"""

    def __init__(self, shrinkage_value=None, convergence_threshold=0.001,
                 max_iters=100,max_rank=None,n_power_iterations=1,init_fill_method="zero",
                 min_value=None,max_value=None,normalizer=None,verbose=True):

        super(SoftImputeDf, self).__init__(shrinkage_value=shrinkage_value, 
                                           convergence_threshold=convergence_threshold,
                                           max_iters=max_iters,max_rank=max_rank,
                                           n_power_iterations=n_power_iterations,
                                           init_fill_method=init_fill_method,
                                           min_value=min_value,max_value=max_value,
                                           normalizer=normalizer,verbose=False)



    def fit_transform(self, X, y=None):

        assert isinstance(X, pd.DataFrame), "Must be pandas dframe"

        for col in X.columns:
            if X[col].isnull().sum() < 10:
                X[col].fillna(0.0, inplace=True)

        z = super(SoftImputeDf, self).fit_transform(X.values)
        df = pd.DataFrame(z, index=X.index, columns=X.columns)
        cats = list(X.select_dtypes(include='category'))
        df[cats] = df[cats].astype('category')

        # return pd.DataFrame(z, index=X.index, columns=X.columns)
        return df

In [58]:
SoftImputeDf(train)

SoftImputeDf(convergence_threshold=0.001, fill_method='zero', max_iters=100, max_rank=None, max_value=None, min_value=None, n_power_iterations=1, normalizer=None, verbose=False)