In [1]:
# setting
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import datetime

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer,IterativeImputer                          

from sklearn.compose import ColumnTransformer,make_column_transformer

from sklearn.decomposition import PCA



In [2]:
work_dir = '/Users/ieunpyo/PycharmProjects/Kaggle/gimhae_fire/'

train = pd.read_csv((work_dir + 'PJT002_train.csv'),encoding='utf-8' )
validation = pd.read_csv((work_dir + 'PJT002_validation .csv'),encoding='utf-8' )
test = pd.read_csv((work_dir + '/' + 'PJT002_test.csv'),encoding='utf-8')

  interactivity=interactivity, compiler=compiler, result=result)


### Feature List

* date_ var : 'date_of_fr'

* y_var : 'fr_yn'

* weather var : 'tmprtr','prcpttn','wnd_spd','wnd_drctn','hmdt'

* land_var : 'jmk','rgnl_ar_nm','rgnl_ar_nm2','lnd_us_sttn_nm','rd_sd_nm'



* 성호님 변수 :'fr_sttn_dstnc','bldng_ar_prc','fr_wthr_fclt_dstnc','fr_mn_cnt','mlt_us_yn','cctv_dstnc','fr_wthr_fclt_in_100m','cctv_in_100m',
'tbc_rtl_str_dstnc','sft_emrgnc_bll_dstnc','ahsm_dstnc','no_tbc_zn_dstnc','bldng_cnt_in_50m'

### Date 변수 처리

In [3]:
def dt_of_fr_transform(df) :
    year_list = []
    month_list =  []
    day_list = []
    weekday_list = []
    hour_list = []
    
    season_list = []
    
    for i in range(len(df)) : 
        date_0=train.dt_of_fr[i].split(' ')[0] # '2017-10-20'
        time_0=train.dt_of_fr[i].split(' ')[1] # '05:54:00'
        
        year = int(date_0.split('-')[0]) # 2017
        month = int(date_0.split('-')[1]) # 10
        day = int(date_0.split('-')[2]) # 20
        weekday = datetime.date(year,month,day).weekday() # 0 : 월~ 6 : 일
        
        hour = int(time_0.split(':')[0]) # 05
        
        
        if month in [3,4,5] :
            season = 0
        elif month in [6,7,8] :
            season = 1
        elif month in [9,10,11] :
            season = 2 
        else :
            season =3
            
        year_list.append(year)
        month_list.append(month)
        day_list.append(day)
        weekday_list.append(weekday)
        season_list.append(season)
        
        hour_list.append(hour)
        
    df['year'] = year_list
    df['month'] = month_list
    df['day'] = day_list
    df['weekday'] = weekday_list
    df['season'] = season_list
    df['hour'] = hour_list
    # 다른 분들이 dt_of_fr 쓰실지 몰라서 일단은 drop 보류.
    # df = df.drop(['dt_of_fr'],axis=1)
    return df

In [4]:
# 함수 적용
train = dt_of_fr_transform(train)
test = dt_of_fr_transform(test)
validation = dt_of_fr_transform(validation)

### fr_yn 변수 처리

In [5]:
train['fr_yn'] = pd.get_dummies(train['fr_yn'])['Y']
validation['fr_yn'] = pd.get_dummies(validation['fr_yn'])['Y']

### weather 변수 처리

##### 'tmprtr','prcpttn','wnd_spd','wnd_drctn','hmdt'

In [6]:
# 만약 이 중 변수를 제거하고 싶으면 여기 리스트에서 이름을 삭제하면 된다.
# pipeline 안에 넣고 싶었지만 변수를 중간에 제거하는 것은 파이프라인에 안 들어간다. 도저히... 안된다고 한다...
weather_var_1 = ['prcpttn']
weather_var_2 = ['tmprtr','wnd_spd','wnd_drctn','hmdt'] 
weather_var = weather_var_1 + weather_var_2

weather_imputer = ColumnTransformer([
    ('prcpttn_imputer',SimpleImputer(strategy='constant',fill_value=-1),weather_var_1),
    ('otehrs_imputer',IterativeImputer(),weather_var_2)
     ])

weather_scale_PCA = Pipeline([
    ('Scaler',StandardScaler()),
    ('PCA',PCA())
])

weather_pipe = Pipeline([
    ('weather_imputer',weather_imputer),
    ('weather_scale_PCA',weather_scale_PCA)
    
])

In [7]:
weather_pipe.fit_transform(train)

array([[ 0.83696482, -0.06593299,  0.27633869, -1.05562882, -0.48734151],
       [ 1.27023595, -0.08216013, -0.70304549,  0.42325286,  0.22044418],
       [-1.04874847, -0.52521494, -0.44697804,  1.40230662,  0.76804871],
       ...,
       [-1.28843737,  0.21226355, -0.011177  ,  0.03247435, -0.63159873],
       [ 0.11110034,  1.62059609,  0.87102948, -1.51675953,  0.06328016],
       [ 0.78164759,  0.23277003, -0.15921767, -0.97506574, -0.14958609]])

In [8]:
weather_pipe.fit_transform(validation)

array([[ 0.94063744, -0.4778311 , -0.15723154,  0.51032937, -0.22702357],
       [ 1.98512404, -0.40265732,  0.06949097, -0.13068358, -0.51807421],
       [-0.55312867, -0.24893418,  1.63349464, -0.3584252 , -0.2100535 ],
       ...,
       [ 1.57226778,  0.64638803, -0.72067563, -0.34062055, -0.40175184],
       [-2.23494975, -0.18298893,  1.20507098,  0.56197559, -0.24398495],
       [-0.91382092,  0.26806252,  0.48812327, -0.50571881, -0.69183157]])

In [9]:
weather_pipe.fit_transform(test)

array([[-2.43852422e+00,  2.73587519e-01, -4.29375211e-01,
         6.60203901e-01, -2.84419342e-03],
       [ 4.43239039e-01, -3.67840424e-01, -2.65739699e-01,
         7.84402750e-01, -3.33679053e-01],
       [ 1.24490978e+00,  4.43525516e+00,  2.77708068e+00,
         3.99014900e-01,  2.06565294e-01],
       ...,
       [ 4.05699582e-02,  1.39168587e+00, -4.41421700e-01,
        -1.08893974e+00, -3.34892652e-01],
       [ 6.59174478e-01, -5.13874661e-01, -6.38967952e-01,
         7.30574041e-01,  7.44641823e-01],
       [ 2.03829082e+00, -2.57184386e-01, -3.26852698e-01,
        -5.11989832e-01,  1.48018455e-03]])

## land 변수 처리

#####  'rgnl_ar_nm','rgnl_ar_nm2','jmk','lnd_us_sttn_nm','rd_sd_nm'

### 'rgnl_ar_nm','rgnl_ar_nm2','jmk','lnd_us_sttn_nm','rd_sd_nm' 공통

In [10]:
def land_transform_step_1(df) :
    # rgnl_ar_nm, rgnl_ar_nm, jmk,lnd_us_sttn_nm,rd_sd_nm 행의 NA 값들을 모두 'blank'라는 새로운 범주로 지정해줌
    # (참고로 rgnl_ar_nm 이 NA인 경우 모두 rgnl_ar_nm2 값 또한 NA)
    values = {'rgnl_ar_nm': 'blank', 'rgnl_ar_nm2': 'blank','jmk':'blank','lnd_us_sttn_nm':'blank','rd_sd_nm':'blank'}
    df = df.fillna(value=values)
    
    # rgnl_ar_nm2 처리1 : rgnl_ar_nm2가 지정되지 않은 경우 그 행의 rgnl_ar_nm1의 값을 따르게 함.
    ix = df[ df['rgnl_ar_nm2']=='지정되지않음' ].index
    df.loc[ix,'rgnl_ar_nm2'] = df.loc[ix,'rgnl_ar_nm']
    return df

In [11]:
train = land_transform_step_1(train)
test = land_transform_step_1(test)
validation = land_transform_step_1(validation)

In [12]:
# train, test, validation set마다 포함하고 있는 범주가 조금씩 달라서 한번에 묶어서 기준 세움

# rgnl_ar_nm, rgnl_ar_nm1 변수의 경우 rgnl_ar_nm_uniques 가지고 fitting하도록 함.
rgnl_ar_nm_uniques = train.rgnl_ar_nm.append(train.rgnl_ar_nm2).append(test.rgnl_ar_nm).append(test.rgnl_ar_nm2).append(validation.rgnl_ar_nm).append(validation.rgnl_ar_nm2).unique()
rgnl_ar_nm_uniques = rgnl_ar_nm_uniques.reshape(-1,1)

# 'jmk','lnd_us_sttn_nm','rd_sd_nm' 또한 마찬가지
jmk_uniques = train.jmk.append(test.jmk).append(validation.jmk).unique()
jmk_uniques = jmk_uniques.reshape(-1,1)

lnd_us_sttn_nm_uniques = train.lnd_us_sttn_nm.append(test.lnd_us_sttn_nm).append(validation.lnd_us_sttn_nm).unique()
lnd_us_sttn_nm_uniques = lnd_us_sttn_nm_uniques.reshape(-1,1)

rd_sd_nm_uniques = train.rd_sd_nm.append(test.rd_sd_nm).append(validation.rd_sd_nm).unique()
rd_sd_nm_uniques = rd_sd_nm_uniques.reshape(-1,1)



print(rgnl_ar_nm_uniques)
print(jmk_uniques)
print(lnd_us_sttn_nm_uniques)
print(rd_sd_nm_uniques)

[['자연녹지지역']
 ['계획관리지역']
 ['제2종일반주거지역']
 ['제1종일반주거지역']
 ['자연환경보전지역']
 ['보전관리지역']
 ['일반상업지역']
 ['생산관리지역']
 ['준주거지역']
 ['생산녹지지역']
 ['농림지역']
 ['blank']
 ['준공업지역']
 ['일반공업지역']
 ['근린상업지역']
 ['제1종전용주거지역']
 ['개발제한구역']
 ['제3종일반주거지역']
 ['제2종전용주거지역']
 ['보전녹지지역']
 ['유통상업지역']
 ['중심상업지역']
 ['관리지역']
 ['용도미지정']]
[['대']
 ['답']
 ['임']
 ['종']
 ['장']
 ['전']
 ['잡']
 ['목']
 ['도']
 ['천']
 ['양']
 ['구']
 ['과']
 ['체']
 ['창']
 ['학']
 ['원']
 ['공']
 ['유']
 ['수']
 ['주']
 ['철']
 ['묘']
 ['제']
 ['차']
 ['사']]
[['단독']
 ['연립']
 ['자연림']
 ['주거기타']
 ['아파트']
 ['상업용']
 ['주거나지']
 ['공업용']
 ['답']
 ['업무용']
 ['주상용']
 ['주상나지']
 ['전기타']
 ['위험시설']
 ['전']
 ['조림']
 ['blank']
 ['답기타']
 ['과수원']
 ['토지임야']
 ['공업기타']
 ['골프장 회원제']
 ['주상기타']
 ['다세대']
 ['도로등']
 ['상업기타']
 ['임야기타']
 ['유원지']
 ['운동장등']
 ['공원등']
 ['하천등']
 ['유해.혐오시설']
 ['공업나지']
 ['기타']
 ['상업나지']
 ['주차장등']
 ['목장용지']
 ['여객자동차터미널']
 ['고속도로휴게소']
 ['스키장']
 ['특수기타']
 ['콘도미니엄']
 ['공원묘지']
 ['골프장 대중제']
 ['발전소']
 ['경마장']]
[['세로한면(불)']
 ['세로한면(가)']
 ['중로한면']
 ['소로한면']
 ['맹지']
 ['광대로한면']
 ['세로각

In [15]:
def land_transform_step_2(df):
    #################################################################################
    # rgnl_ar_nm, rgnl_ar_nm2 처리 
    
    
    # train, test, validation set마다 포함하고 있는 범주가 조금씩 달라서 한번에 묶어서 기준 세움
    oh_enc_rgnl_ar_nm = OneHotEncoder()
    oh_enc_rgnl_ar_nm.fit(rgnl_ar_nm_uniques)
    # category 이름 list
    cat_list_rgnl = oh_enc_rgnl_ar_nm.__dict__['categories_']
    
    # encoder 들어갈 수 있게 모양 잡아줌.
    ar1 = np.array(df.rgnl_ar_nm).reshape(-1,1)
    ar2 = np.array(df.rgnl_ar_nm).reshape(-1,1)
    
    # onehot encoding
    ar1_onehot = oh_enc_rgnl_ar_nm.transform(ar1)
    ar2_onehot = oh_enc_rgnl_ar_nm.transform(ar2)
    
    # 더해주고 2로 나누고 본 데이터에 붙이고 rgnl_ar_nm, rgnl_ar_nm2 drop
    onehot = (ar1_onehot/2+ar2_onehot/2).toarray()
    df = df.drop(['rgnl_ar_nm','rgnl_ar_nm2'],axis=1)
    df = pd.concat([df,pd.DataFrame(onehot,columns=cat_list_rgnl)],axis=1)
    
    return df
    
    
    

In [16]:
train = land_transform_step_2(train)
test= land_transform_step_2(test)
validation = land_transform_step_2(validation)

### 'jmk','lnd_us_sttn_nm','rd_sd_nm'

In [18]:
land_left = ['jmk','lnd_us_sttn_nm','rd_sd_nm']

land_ohe = ColumnTransformer([
    ('land_left_ohe',OneHotEncoder(),land_left)
     ])

In [19]:
land_ohe.fit_transform(train).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])