In [1]:
# setting
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import datetime

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer,IterativeImputer                          

from sklearn.compose import ColumnTransformer,make_column_transformer

from sklearn.decomposition import PCA



In [2]:
work_dir = '/Users/ieunpyo/PycharmProjects/Kaggle/gimhae_fire/'

train = pd.read_csv((work_dir + 'PJT002_train.csv'),encoding='utf-8' )
validation = pd.read_csv((work_dir + 'PJT002_validation .csv'),encoding='utf-8' )
test = pd.read_csv((work_dir + '/' + 'PJT002_test.csv'),encoding='utf-8')

  interactivity=interactivity, compiler=compiler, result=result)


# 1단계

### Date 변수 처리

In [3]:
def dt_of_fr_transform(df) :
    year_list = []
    month_list =  []
    day_list = []
    weekday_list = []
    hour_list = []
    
    season_list = []
    
    for i in range(len(df)) : 
        date_0=train.dt_of_fr[i].split(' ')[0] # '2017-10-20'
        time_0=train.dt_of_fr[i].split(' ')[1] # '05:54:00'
        
        year = int(date_0.split('-')[0]) # 2017
        month = int(date_0.split('-')[1]) # 10
        day = int(date_0.split('-')[2]) # 20
        weekday = datetime.date(year,month,day).weekday() # 0 : 월~ 6 : 일
        
        hour = int(time_0.split(':')[0]) # 05
        
        
        if month in [3,4,5] :
            season = 0
        elif month in [6,7,8] :
            season = 1
        elif month in [9,10,11] :
            season = 2 
        else :
            season =3
            
        year_list.append(year)
        month_list.append(month)
        day_list.append(day)
        weekday_list.append(weekday)
        season_list.append(season)
        
        hour_list.append(hour)
        
    df['year'] = year_list
    df['month'] = month_list
    df['day'] = day_list
    df['weekday'] = weekday_list
    df['season'] = season_list
    df['hour'] = hour_list
    # 다른 분들이 dt_of_fr 쓰실지 몰라서 일단은 drop 보류.
    # df = df.drop(['dt_of_fr'],axis=1)
    return df

In [4]:
# 함수 적용
train = dt_of_fr_transform(train)
test = dt_of_fr_transform(test)
validation = dt_of_fr_transform(validation)

### fr_yn 변수 처리

In [5]:
train['fr_yn'] = pd.get_dummies(train['fr_yn'])['Y']
validation['fr_yn'] = pd.get_dummies(validation['fr_yn'])['Y']

### 'rgnl_ar_nm','rgnl_ar_nm2','jmk','lnd_us_sttn_nm','rd_sd_nm'

In [6]:
def land_transform_step_1(df) :
    # rgnl_ar_nm, rgnl_ar_nm, jmk,lnd_us_sttn_nm,rd_sd_nm 행의 NA 값들을 모두 'blank'라는 새로운 범주로 지정해줌
    # (참고로 rgnl_ar_nm 이 NA인 경우 모두 rgnl_ar_nm2 값 또한 NA)
    values = {'rgnl_ar_nm': 'blank', 'rgnl_ar_nm2': 'blank','jmk':'blank','lnd_us_sttn_nm':'blank','rd_sd_nm':'blank'}
    df = df.fillna(value=values)
    
    # rgnl_ar_nm2 처리1 : rgnl_ar_nm2가 지정되지 않은 경우 그 행의 rgnl_ar_nm1의 값을 따르게 함.
    ix = df[ df['rgnl_ar_nm2']=='지정되지않음' ].index
    df.loc[ix,'rgnl_ar_nm2'] = df.loc[ix,'rgnl_ar_nm']
    return df

In [7]:
train = land_transform_step_1(train)
test = land_transform_step_1(test)
validation = land_transform_step_1(validation)

In [8]:
# train, test, validation set마다 포함하고 있는 범주가 조금씩 달라서 한번에 묶어서 기준 세움

# rgnl_ar_nm, rgnl_ar_nm2 변수의 경우 rgnl_ar_nm_uniques 가지고 fitting하도록 함.
rgnl_ar_nm_uniques = train.rgnl_ar_nm.append(train.rgnl_ar_nm2).append(test.rgnl_ar_nm).append(test.rgnl_ar_nm2).append(validation.rgnl_ar_nm).append(validation.rgnl_ar_nm2).unique()
rgnl_ar_nm_uniques = rgnl_ar_nm_uniques.reshape(-1,1)

# 'jmk','lnd_us_sttn_nm','rd_sd_nm' 또한 마찬가지
jmk_uniques = train.jmk.append(test.jmk).append(validation.jmk).unique()
jmk_uniques = jmk_uniques.reshape(-1,1)

lnd_us_sttn_nm_uniques = train.lnd_us_sttn_nm.append(test.lnd_us_sttn_nm).append(validation.lnd_us_sttn_nm).unique()
lnd_us_sttn_nm_uniques = lnd_us_sttn_nm_uniques.reshape(-1,1)

rd_sd_nm_uniques = train.rd_sd_nm.append(test.rd_sd_nm).append(validation.rd_sd_nm).unique()
rd_sd_nm_uniques = rd_sd_nm_uniques.reshape(-1,1)

'''
print(rgnl_ar_nm_uniques)
print(jmk_uniques)
print(lnd_us_sttn_nm_uniques)
print(rd_sd_nm_uniques)
'''

'\nprint(rgnl_ar_nm_uniques)\nprint(jmk_uniques)\nprint(lnd_us_sttn_nm_uniques)\nprint(rd_sd_nm_uniques)\n'

In [9]:
def land_transform_step_2(df):
    #################################################################################
    # rgnl_ar_nm, rgnl_ar_nm2 처리 
    # train, test, validation set마다 포함하고 있는 범주가 조금씩 달라서 한번에 묶어서 기준 세움
    
    oh_enc_rgnl_ar_nm = OneHotEncoder()
    oh_enc_rgnl_ar_nm.fit(rgnl_ar_nm_uniques)
    
    # category 이름 list
    cat_list_rgnl = oh_enc_rgnl_ar_nm.__dict__['categories_']
    
    # encoder 들어갈 수 있게 모양 잡아줌.
    ar1 = np.array(df.rgnl_ar_nm).reshape(-1,1)
    ar2 = np.array(df.rgnl_ar_nm2).reshape(-1,1)
    
    # onehot encoding
    ar1_onehot = oh_enc_rgnl_ar_nm.transform(ar1)
    ar2_onehot = oh_enc_rgnl_ar_nm.transform(ar2)
    
    # 더해주고 2로 나누고 본 데이터에 붙이고 rgnl_ar_nm, rgnl_ar_nm2 drop
    onehot = (ar1_onehot/2+ar2_onehot/2).toarray()
    df = df.drop(['rgnl_ar_nm','rgnl_ar_nm2'],axis=1)
    df = pd.concat([df,pd.DataFrame(onehot,columns=cat_list_rgnl)],axis=1)
    
    #################################################################################
    # jmk, lnd_us_sttn_nm, rd_sd_nm 처리
    # 위와 같은 방법
    
    # jmk
    oh_enc_jmk = OneHotEncoder()
    oh_enc_jmk.fit(jmk_uniques)
    
    cat_list_jmk = oh_enc_jmk.__dict__['categories_']
    jmk_arr = np.array(df.jmk).reshape(-1,1)
    jmk_onehot = oh_enc_jmk.transform(jmk_arr)
    df = df.drop(['jmk'],axis=1)
    df = pd.concat([df,pd.DataFrame(jmk_onehot,columns=cat_list_jmk)],axis=1)
    
    #lnd_us_sttn_nm
    oh_enc_lnd_us_sttn_nm = OneHotEncoder()
    oh_enc_lnd_us_sttn_nm.fit(lnd_us_sttn_nm_uniques)

    cat_list_lnd_us_sttn_nm = oh_enc_lnd_us_sttn_nm.__dict__['categories_']
    lnd_us_sttn_nm_arr = np.array(df.lnd_us_sttn_nm).reshape(-1,1)
    lnd_us_sttn_nm_onehot = oh_enc_lnd_us_sttn_nm.transform(lnd_us_sttn_nm_arr)
    df = df.drop(['lnd_us_sttn_nm'],axis=1)
    df = pd.concat([df,pd.DataFrame(lnd_us_sttn_nm_onehot,columns=cat_list_lnd_us_sttn_nm)],axis=1)
    
    #rd_sd_nm
    oh_enc_rd_sd_nm = OneHotEncoder()
    oh_enc_rd_sd_nm.fit(rd_sd_nm_uniques)
    
    cat_list_rd_sd_nm = oh_enc_rd_sd_nm.__dict__['categories_']
    rd_sd_nm_arr = np.array(df.rd_sd_nm).reshape(-1,1)
    rd_sd_nm_onehot = oh_enc_rd_sd_nm.transform(rd_sd_nm_arr)
    df = df.drop(['rd_sd_nm'],axis=1)
    df = pd.concat([df,pd.DataFrame(rd_sd_nm_onehot,columns=cat_list_rd_sd_nm)],axis=1)

    return df
    
    
    

In [None]:
train = land_transform_step_2(train)
test= land_transform_step_2(test)
validation = land_transform_step_2(validation)

In [None]:
train.head()

In [116]:
land_ohe = ColumnTransformer([
    ('oh_enc_jmk',oh_enc_jmk,['jmk']),
    ('oh_lnd_us_sttn_nm',oh_enc_lnd_us_sttn_nm,['lnd_us_sttn_nm']),
    ('oh_rd_sd_nm',oh_enc_rd_sd_nm,['rd_sd_nm'])
     ])


In [92]:
oh_enc_jmk.categories_

[array(['공', '과', '구', '답', '대', '도', '목', '묘', '사', '수', '양', '원', '유',
        '임', '잡', '장', '전', '제', '종', '주', '차', '창', '천', '철', '체', '학'],
       dtype=object)]

In [93]:
oh_enc_lnd_us_sttn_nm.categories_

[array(['blank', '경마장', '고속도로휴게소', '골프장 대중제', '골프장 회원제', '공업기타', '공업나지',
        '공업용', '공원등', '공원묘지', '과수원', '기타', '다세대', '단독', '답', '답기타', '도로등',
        '목장용지', '발전소', '상업기타', '상업나지', '상업용', '스키장', '아파트', '업무용',
        '여객자동차터미널', '연립', '운동장등', '위험시설', '유원지', '유해.혐오시설', '임야기타', '자연림',
        '전', '전기타', '조림', '주거기타', '주거나지', '주상기타', '주상나지', '주상용', '주차장등',
        '콘도미니엄', '토지임야', '특수기타', '하천등'], dtype=object)]

In [94]:
oh_enc_rd_sd_nm.categories_

[array(['blank', '광대로한면', '광대세각', '광대소각', '맹지', '세로각지(가)', '세로각지(불)',
        '세로한면(가)', '세로한면(불)', '소로각지', '소로한면', '중로각지', '중로한면', '지정되지않음'],
       dtype=object)]

In [25]:
train.keys().tolist()

['dt_of_fr',
 'fr_yn',
 'bldng_us',
 'bldng_archtctr',
 'bldng_cnt',
 'bldng_ar',
 'ttl_ar',
 'lnd_ar',
 'dt_of_athrztn',
 'ttl_grnd_flr',
 'ttl_dwn_flr',
 'bldng_us_clssfctn',
 'tmprtr',
 'prcpttn',
 'wnd_spd',
 'wnd_drctn',
 'hmdt',
 'gas_engry_us_201401',
 'ele_engry_us_201401',
 'gas_engry_us_201402',
 'ele_engry_us_201402',
 'gas_engry_us_201403',
 'ele_engry_us_201403',
 'gas_engry_us_201404',
 'ele_engry_us_201404',
 'gas_engry_us_201405',
 'ele_engry_us_201405',
 'gas_engry_us_201406',
 'ele_engry_us_201406',
 'gas_engry_us_201407',
 'ele_engry_us_201407',
 'gas_engry_us_201408',
 'ele_engry_us_201408',
 'gas_engry_us_201409',
 'ele_engry_us_201409',
 'gas_engry_us_201410',
 'ele_engry_us_201410',
 'gas_engry_us_201411',
 'ele_engry_us_201411',
 'gas_engry_us_201412',
 'ele_engry_us_201412',
 'gas_engry_us_201501',
 'ele_engry_us_201501',
 'gas_engry_us_201502',
 'ele_engry_us_201502',
 'gas_engry_us_201503',
 'ele_engry_us_201503',
 'gas_engry_us_201504',
 'ele_engry_us_201504

In [42]:
'''
train_X = np.array(train_2.drop('fr_yn',axis=1))
train_Y = np.array(train_2.fr_yn)

validation_X = np.array(validation_2.drop('fr_yn',axis=1))
validation_Y = np.array(validation_2.fr_yn)

test_X = np.array(test_2)
'''

In [36]:
keep_var = ['tmprtr', 'prcpttn', 'wnd_spd', 'wnd_drctn', 'hmdt','gas_engry_us_201401',
 'ele_engry_us_201401','gas_engry_us_201402', 'ele_engry_us_201402', 'gas_engry_us_201403', 'ele_engry_us_201403', 'gas_engry_us_201404',
 'ele_engry_us_201404',
 'gas_engry_us_201405',
 'ele_engry_us_201405',
 'gas_engry_us_201406',
 'ele_engry_us_201406',
 'gas_engry_us_201407',
 'ele_engry_us_201407',
 'gas_engry_us_201408',
 'ele_engry_us_201408',
 'gas_engry_us_201409',
 'ele_engry_us_201409',
 'gas_engry_us_201410',
 'ele_engry_us_201410',
 'gas_engry_us_201411',
 'ele_engry_us_201411',
 'gas_engry_us_201412',
 'ele_engry_us_201412',
 'gas_engry_us_201501',
 'ele_engry_us_201501',
 'gas_engry_us_201502',
 'ele_engry_us_201502',
 'gas_engry_us_201503',
 'ele_engry_us_201503',
 'gas_engry_us_201504',
 'ele_engry_us_201504',
 'gas_engry_us_201505',
 'ele_engry_us_201505',
 'gas_engry_us_201506',
 'ele_engry_us_201506',
 'gas_engry_us_201507',
 'ele_engry_us_201507',
 'gas_engry_us_201508',
 'ele_engry_us_201508',
 'gas_engry_us_201509',
 'ele_engry_us_201509',
 'gas_engry_us_201510',
 'ele_engry_us_201510',
 'gas_engry_us_201511',
 'ele_engry_us_201511',
 'gas_engry_us_201512',
 'ele_engry_us_201512',
 'gas_engry_us_201601',
 'ele_engry_us_201601',
 'gas_engry_us_201602',
 'ele_engry_us_201602',
 'gas_engry_us_201603',
 'ele_engry_us_201603',
 'gas_engry_us_201604',
 'ele_engry_us_201604',
 'gas_engry_us_201605',
 'ele_engry_us_201605',
 'gas_engry_us_201606',
 'ele_engry_us_201606',
 'gas_engry_us_201607',
 'ele_engry_us_201607',
 'gas_engry_us_201608',
 'ele_engry_us_201608',
 'gas_engry_us_201609',
 'ele_engry_us_201609',
 'gas_engry_us_201610',
 'ele_engry_us_201610',
 'gas_engry_us_201611',
 'ele_engry_us_201611',
 'gas_engry_us_201612',
 'ele_engry_us_201612',
 'gas_engry_us_201701',
 'ele_engry_us_201701',
 'gas_engry_us_201702',
 'ele_engry_us_201702',
 'gas_engry_us_201703',
 'ele_engry_us_201703',
 'gas_engry_us_201704',
 'ele_engry_us_201704',
 'gas_engry_us_201705',
 'ele_engry_us_201705',
 'gas_engry_us_201706',
 'ele_engry_us_201706',
 'gas_engry_us_201707',
 'ele_engry_us_201707',
 'gas_engry_us_201708',
 'ele_engry_us_201708',
 'gas_engry_us_201709',
 'ele_engry_us_201709',
 'gas_engry_us_201710',
 'ele_engry_us_201710',
 'gas_engry_us_201711',
 'ele_engry_us_201711',
 'gas_engry_us_201712',
 'ele_engry_us_201712',
 'gas_engry_us_201801',
 'ele_engry_us_201801',
 'gas_engry_us_201802',
 'ele_engry_us_201802',
 'gas_engry_us_201803',
 'ele_engry_us_201803',
 'gas_engry_us_201804',
 'ele_engry_us_201804',
 'gas_engry_us_201805',
 'ele_engry_us_201805',
 'gas_engry_us_201806',
 'ele_engry_us_201806',
 'gas_engry_us_201807',
 'ele_engry_us_201807',
 'gas_engry_us_201808',
 'ele_engry_us_201808',
 'gas_engry_us_201809',
 'ele_engry_us_201809',
 'gas_engry_us_201810',
 'ele_engry_us_201810',
 'gas_engry_us_201811',
 'ele_engry_us_201811',
 'gas_engry_us_201812',
 'ele_engry_us_201812',
 'lw_13101010',
 'lw_13101110',
 'lw_13101210',
 'lw_13101211',
 'lw_13101310',
 'lw_13101410',
 'lw_13111010',
 'lw_13111110',
 'lw_13121010',
 'lw_13121011',
 'lw_13131010',
 'lw_13131110',
 'lw_13141010',
 'lw_13141011',
 'jmk','lnd_us_sttn_nm', 'rd_sd_nm' ]



train_2 = train[(keep_var+['fr_yn'])]
validation_2 = train[(keep_var+['fr_yn'])]
test_2 = test[keep_var]


In [37]:
train_2.head()

Unnamed: 0,tmprtr,prcpttn,wnd_spd,wnd_drctn,hmdt,gas_engry_us_201401,ele_engry_us_201401,gas_engry_us_201402,ele_engry_us_201402,gas_engry_us_201403,...,lw_13121010,lw_13121011,lw_13131010,lw_13131110,lw_13141010,lw_13141011,jmk,lnd_us_sttn_nm,rd_sd_nm,fr_yn
0,10.7,,0.9,200.0,96.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,대,단독,세로한면(불),1
1,19.5,,0.5,20.0,74.0,,,,,,...,,,,,,,대,단독,세로한면(가),0
2,15.5,,2.0,90.0,21.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,답,연립,중로한면,1
3,20.6,,0.4,0.0,91.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,대,단독,세로한면(불),0
4,12.7,,0.6,360.0,89.0,,,,,,...,,,,,,,임,자연림,세로한면(불),0


In [38]:
validation_2.head()

Unnamed: 0,tmprtr,prcpttn,wnd_spd,wnd_drctn,hmdt,gas_engry_us_201401,ele_engry_us_201401,gas_engry_us_201402,ele_engry_us_201402,gas_engry_us_201403,...,lw_13121010,lw_13121011,lw_13131010,lw_13131110,lw_13141010,lw_13141011,jmk,lnd_us_sttn_nm,rd_sd_nm,fr_yn
0,10.7,,0.9,200.0,96.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,대,단독,세로한면(불),1
1,19.5,,0.5,20.0,74.0,,,,,,...,,,,,,,대,단독,세로한면(가),0
2,15.5,,2.0,90.0,21.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,답,연립,중로한면,1
3,20.6,,0.4,0.0,91.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,대,단독,세로한면(불),0
4,12.7,,0.6,360.0,89.0,,,,,,...,,,,,,,임,자연림,세로한면(불),0


In [39]:
test_2.head()

Unnamed: 0,tmprtr,prcpttn,wnd_spd,wnd_drctn,hmdt,gas_engry_us_201401,ele_engry_us_201401,gas_engry_us_201402,ele_engry_us_201402,gas_engry_us_201403,...,lw_13111110,lw_13121010,lw_13121011,lw_13131010,lw_13131110,lw_13141010,lw_13141011,jmk,lnd_us_sttn_nm,rd_sd_nm
0,6.6,,4.5,290.0,20.0,,,,,,...,,,,,,,,제,blank,blank
1,13.8,,2.2,110.0,62.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,장,공업용,소로한면
2,12.9,6.5,2.2,320.0,97.0,,,,,,...,,,,,,,,장,공업용,세로한면(가)
3,12.4,,0.6,140.0,63.0,,,,,,...,,,,,,,,장,공업용,세로한면(가)
4,22.0,,4.6,290.0,28.0,,,,,,...,,,,,,,,대,전기타,세로한면(가)


# 2단계

## ColumnTransformer 위한 변수 정의

In [47]:
weather_var_1 = ['prcpttn']
weather_var_2 = ['tmprtr','wnd_spd','wnd_drctn','hmdt'] 
weather_var = weather_var_1 + weather_var_2

eg_var = train.loc[:,'gas_engry_us_201401':'ele_engry_us_201812'].keys().tolist()
lw_var = ['lw_13101010','lw_13101110','lw_13101210','lw_13101211','lw_13101310','lw_13101410','lw_13111010','lw_13111110','lw_13121010',
           'lw_13121011','lw_13131010','lw_13131110','lw_13141010','lw_13141011'] 
egl_var = eg_var + lw_var # 모든 관심변수들


land_left = ['jmk','lnd_us_sttn_nm','rd_sd_nm']




## Weather Pipeline

##### 'tmprtr','prcpttn','wnd_spd','wnd_drctn','hmdt'

In [7]:
# 만약 이 중 변수를 제거하고 싶으면 여기 리스트에서 이름을 삭제하면 된다.
# pipeline 안에 넣고 싶었지만 변수를 중간에 제거하는 것은 파이프라인에 안 들어간다. 도저히... 안된다고 한다...
weather_var_1 = ['prcpttn']
weather_var_2 = ['tmprtr','wnd_spd','wnd_drctn','hmdt'] 
weather_var = weather_var_1 + weather_var_2

weather_imputer = ColumnTransformer([
    ('prcpttn_imputer',SimpleImputer(strategy='constant',fill_value=-1),weather_var_1),
    ('otehrs_imputer',IterativeImputer(),weather_var_2)
     ])

weather_scale_PCA = Pipeline([
    ('Scaler',StandardScaler()),
    ('PCA',PCA())
])

weather_pipe = Pipeline([
    ('weather_imputer',weather_imputer),
    ('weather_scale_PCA',weather_scale_PCA)
    
])

## Gas, Ele, Lw Pipeline


###  불이 난 달의 Gas와 Ele 사용량을 열으로 추가하는 변환기를 정의합니다. 



In [109]:
# Train

class TrFmGEAdder(BaseEstimator, TransformerMixin):
  def __init__(self, index = eg_var):
    self.index = ['dt_of_fr'] + index
  
  def fit(self, X,y = None):
    return(self)
  
  def transform(self, X, y = None):
    index = self.index
    X = np.append(np.array(train['dt_of_fr']).reshape(-1,1), X, axis = 1) # X를 받아서 앞에 dt_of_fr열을 추가합니다.
    X = pd.DataFrame(X) # 차후 열 이름을 기준으로 사용해야 하기 때문에 Pandas로 변환합니다. 
    X.columns = train[['dt_of_fr']+eg_var].keys() # 열 이름을 추가합니다. eg_var 대상으로 진행할 것이기 때문에 이렇게 씁니다.

    train_rst = X.assign(
        fr_month_gas  = 'gas'+X['dt_of_fr'].str.slice_replace(start=7, stop=20, repl='').str.replace(pat='-', repl='', regex=False)
        ).assign(
            fr_month_ele  = 'ele'+X['dt_of_fr'].str.slice_replace(start=7, stop=20, repl='').str.replace(pat='-', repl='', regex=False)
            ) # gas/ele + 불이 난 달을 값으로 갖는 열을 생성합니다. 이걸 생성하지 않고 하는 것도 할 수 있을거 같은데 지금은 뇌가 멈췄으니 일단 만듭니다.
    
    tk = train_rst.keys()[np.where(pd.Series(train_rst.keys()) == index[0])[0][0]:np.where(pd.Series(train_rst.keys()) == index[-1])[0][0]+1]
    keys = pd.Series(tk).str.split('_', expand = True).iloc[:,[0,3]]
    keys[1] = keys[0]+keys[3]
    keys[1][0] = 'dt_of_fr' # 변경된 열 이름의 리스트입니다. 가장 앞의 값이 누락되어서 추가해주었습니다.
    
    coln = keys[1].ravel().tolist()+['fr_month_gas','fr_month_ele'] #열 이름을 바꿔줄 준비. 앞선 fr_month_gas/ ele열을 index로 사용하기 위함입니다
    train_rst.columns = coln #열 이름 변경합니다.

    fm_gas_index = train_rst.fr_month_gas.map(lambda x: np.where(x == train_rst.keys())[0][0]) # 불이 난 월의 gas의 열 index
    fm_ele_index = train_rst.fr_month_ele.map(lambda x: np.where(x == train_rst.keys())[0][0]) # 불이 난 월의 ele의 열 index

    train_eg = train_rst.assign(
        gas_fm = np.array(train_rst)[np.arange(train.shape[0]),fm_gas_index.values] # 최종적으로 두 index를 기준으로 값을 대입합니다.
        ).assign(
            ele_fm = np.array(train_rst)[np.arange(train.shape[0]),fm_ele_index.values]
            )[['gas_fm','ele_fm']] # 이 두 열만 남기거나
            #.drop(['fr_month_gas','fr_month_ele','dt_of_fr'], axis = 1) # 이 두 열을 더하거나. 일단은 두 열만 남겼습니다. 
    
    return np.array(train_eg)

In [110]:
#  Val. 

class VFmGEAdder(BaseEstimator, TransformerMixin):
  def __init__(self, index = eg_var):
    self.index = ['dt_of_fr'] + index
  
  def fit(self, X,y = None):
    return(self)
  
  def transform(self, X, y = None):
    index = self.index
    X = np.append(np.array(validation['dt_of_fr']).reshape(-1,1), X, axis = 1) # X를 받아서 앞에 dt_of_fr열을 추가합니다.
    X = pd.DataFrame(X) # 차후 열 이름을 기준으로 사용해야 하기 때문에 Pandas로 변환합니다. 
    X.columns = validation[['dt_of_fr']+eg_var].keys() # 열 이름을 추가합니다. eg_var 대상으로 진행할 것이기 때문에 이렇게 씁니다.

    train_rst = X.assign(
        fr_month_gas  = 'gas'+X['dt_of_fr'].str.slice_replace(start=7, stop=20, repl='').str.replace(pat='-', repl='', regex=False)
        ).assign(
            fr_month_ele  = 'ele'+X['dt_of_fr'].str.slice_replace(start=7, stop=20, repl='').str.replace(pat='-', repl='', regex=False)
            ) # gas/ele + 불이 난 달을 값으로 갖는 열을 생성합니다. 이걸 생성하지 않고 하는 것도 할 수 있을거 같은데 지금은 뇌가 멈췄으니 일단 만듭니다.
    
    tk = train_rst.keys()[np.where(pd.Series(train_rst.keys()) == index[0])[0][0]:np.where(pd.Series(train_rst.keys()) == index[-1])[0][0]+1]
    keys = pd.Series(tk).str.split('_', expand = True).iloc[:,[0,3]]
    keys[1] = keys[0]+keys[3]
    keys[1][0] = 'dt_of_fr' # 변경된 열 이름의 리스트입니다. 가장 앞의 값이 누락되어서 추가해주었습니다.
    
    coln = keys[1].ravel().tolist()+['fr_month_gas','fr_month_ele'] #열 이름을 바꿔줄 준비. 앞선 fr_month_gas/ ele열을 index로 사용하기 위함입니다
    train_rst.columns = coln #열 이름 변경합니다.

    fm_gas_index = train_rst.fr_month_gas.map(lambda x: np.where(x == train_rst.keys())[0][0]) # 불이 난 월의 gas의 열 index
    fm_ele_index = train_rst.fr_month_ele.map(lambda x: np.where(x == train_rst.keys())[0][0]) # 불이 난 월의 ele의 열 index

    train_eg = train_rst.assign(
        gas_fm = np.array(train_rst)[np.arange(validation.shape[0]),fm_gas_index.values] # 최종적으로 두 index를 기준으로 값을 대입합니다.
        ).assign(
            ele_fm = np.array(train_rst)[np.arange(validation.shape[0]),fm_ele_index.values]
            )[['gas_fm','ele_fm']] # 이 두 열만 남기거나
            #.drop(['fr_month_gas','fr_month_ele','dt_of_fr'], axis = 1) # 이 두 열을 더하거나. 일단은 두 열만 남겼습니다. 
    
    return np.array(train_eg)

In [111]:
# Test 

class TeFmGEAdder(BaseEstimator, TransformerMixin):
  def __init__(self, index = eg_var):
    self.index = ['dt_of_fr'] + index
  
  def fit(self, X,y = None):
    return(self)
  
  def transform(self, X, y = None):
    index = self.index
    X = np.append(np.array(test['dt_of_fr']).reshape(-1,1), X, axis = 1) # X를 받아서 앞에 dt_of_fr열을 추가합니다.
    X = pd.DataFrame(X) # 차후 열 이름을 기준으로 사용해야 하기 때문에 Pandas로 변환합니다. 
    X.columns = test[['dt_of_fr']+eg_var].keys() # 열 이름을 추가합니다. eg_var 대상으로 진행할 것이기 때문에 이렇게 씁니다.

    train_rst = X.assign(
        fr_month_gas  = 'gas'+X['dt_of_fr'].str.slice_replace(start=7, stop=20, repl='').str.replace(pat='-', repl='', regex=False)
        ).assign(
            fr_month_ele  = 'ele'+X['dt_of_fr'].str.slice_replace(start=7, stop=20, repl='').str.replace(pat='-', repl='', regex=False)
            ) # gas/ele + 불이 난 달을 값으로 갖는 열을 생성합니다. 이걸 생성하지 않고 하는 것도 할 수 있을거 같은데 지금은 뇌가 멈췄으니 일단 만듭니다.
    
    tk = train_rst.keys()[np.where(pd.Series(train_rst.keys()) == index[0])[0][0]:np.where(pd.Series(train_rst.keys()) == index[-1])[0][0]+1]
    keys = pd.Series(tk).str.split('_', expand = True).iloc[:,[0,3]]
    keys[1] = keys[0]+keys[3]
    keys[1][0] = 'dt_of_fr' # 변경된 열 이름의 리스트입니다. 가장 앞의 값이 누락되어서 추가해주었습니다.
    
    coln = keys[1].ravel().tolist()+['fr_month_gas','fr_month_ele'] #열 이름을 바꿔줄 준비. 앞선 fr_month_gas/ ele열을 index로 사용하기 위함입니다
    train_rst.columns = coln #열 이름 변경합니다.

    fm_gas_index = train_rst.fr_month_gas.map(lambda x: np.where(x == train_rst.keys())[0][0]) # 불이 난 월의 gas의 열 index
    fm_ele_index = train_rst.fr_month_ele.map(lambda x: np.where(x == train_rst.keys())[0][0]) # 불이 난 월의 ele의 열 index

    train_eg = train_rst.assign(
        gas_fm = np.array(train_rst)[np.arange(test.shape[0]),fm_gas_index.values] # 최종적으로 두 index를 기준으로 값을 대입합니다.
        ).assign(
            ele_fm = np.array(train_rst)[np.arange(test.shape[0]),fm_ele_index.values]
            )[['gas_fm','ele_fm']] # 이 두 열만 남기거나
            #.drop(['fr_month_gas','fr_month_ele','dt_of_fr'], axis = 1) # 이 두 열을 더하거나. 일단은 두 열만 남겼습니다. 
    
    return np.array(train_eg)

In [112]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names=attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

# 왜인지는 모르겠는데 full_pipeline을 만들 때 ColumnTransformer는 자꾸 에러가 나서 FeatureUnion을 쓰려고 DataFramseSelector를 정의해 주었습니다.

In [113]:
eg_imputer = Pipeline([
                       ('selector', DataFrameSelector(eg_var)),
                       ('zero_imputer', SimpleImputer(strategy='constant',fill_value=0)),
                       ('scaler', StandardScaler()),
                       ('fire month gas ele adder', TrFmGEAdder())
])

lw_imputer = Pipeline([
                      ('selector', DataFrameSelector(lw_var)),
                      ('zero_imputer', SimpleImputer(strategy='constant', fill_value=0)),
                      ('scaler', StandardScaler())
])

egl_pipe = FeatureUnion(transformer_list = [
                                            ('eg_imputer', eg_imputer),
                                            ('lw_imputer', lw_imputer)
                     ])

In [114]:
full_pipe = FeatureUnion(transformer_list = [
    ('weather',weather_pipe ),
    ('land',land_ohe),
    ('eg_imputer', eg_imputer),
    ('lw_impute', lw_imputer)
    
])

In [117]:
full_pipe.fit_transform(train_2)

TypeError: no supported conversion for types: (dtype('float64'), dtype('float64'), dtype('O'), dtype('float64'))