In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

import lightgbm as lgb

import gc
import os

from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

gc.enable()

import tensorflow as tf
# Build a TF1 session whose GPU options have allow_growth switched on.
# NOTE(review): `sess` is never handed to Keras in the visible cells (no set_session call) —
# confirm that this option actually applies to the session Keras trains with.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, BatchNormalization, Flatten, Input, Concatenate, Embedding, Reshape
from keras.layers import LeakyReLU
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from keras.layers.advanced_activations import PReLU

from numpy.random import seed
from tensorflow import set_random_seed

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from scipy import sparse

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the train / validation / test splits; paths are relative to the notebook's working directory.
# NOTE(review): the captured output shows pandas emitted a warning while reading — presumably a
# mixed-dtype column; consider passing explicit dtype= once the schema is confirmed.
df_train = pd.read_csv('./data/PJT002_train.csv')
df_valid = pd.read_csv('./data/PJT002_validation.csv')
df_test = pd.read_csv('./data/PJT002_test.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Fix typos in the dt_of_athrztn column of the training set.

# `.iloc` takes integer positions. The original list held strings, which only worked through an
# implicit cast in old pandas and raises in current releases.
athrztn_error_idx = [26351, 21218, 30954, 12642, 38448, 53193, 32635, 19165, 19172, 46872]
athrztn_error_fix = ['19990624', '19900712', '19820930', '19800808', '19780421', '19971215', '20020227', '18900101', '18900101', '19940000']
athrztn_error_value = df_train['dt_of_athrztn'].iloc[athrztn_error_idx].values
print(athrztn_error_value)
# Value-based replacement: every row holding one of the malformed values is corrected, not only
# the positions listed above. The result is assigned back because replace(inplace=True) on a
# column selection is not guaranteed to write through to df_train.
# NOTE(review): only df_train is corrected here; df_valid / df_test are left as loaded.
df_train['dt_of_athrztn'] = df_train['dt_of_athrztn'].replace(athrztn_error_value, athrztn_error_fix)
df_train['dt_of_athrztn'].iloc[athrztn_error_idx]

[9990624.0 9900712.0 '9820930.0' 9800808.0 9780421.0 971215.0 '20022 27'
 1890010.0 1890010.0 1994000.0]


26351    19990624
21218    19900712
30954    19820930
12642    19800808
38448    19780421
53193    19971215
32635    20020227
19165    18900101
19172    18900101
46872    19940000
Name: dt_of_athrztn, dtype: object

In [4]:
# The test split gets an all-NaN `fr_yn` column so the three frames share one schema.
df_test['fr_yn'] = np.nan
# sort=True makes explicit the column sorting that pandas applied implicitly (with a FutureWarning)
# when this notebook was run; ignore_index=True replaces the separate in-place reset_index call
# and yields a fresh 0..n-1 index.
df_all = pd.concat([df_train, df_valid, df_test], axis=0, sort=True, ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [5]:
print(df_train.shape)
print(df_valid.shape)
print(df_test.shape)

(59199, 180)
(6898, 180)
(2957, 180)


In [6]:
np.unique(df_train['fr_yn'])

array(['N', 'Y'], dtype=object)

## 3.1. column 분류

In [7]:
columns = df_all.columns
len(columns)

180

In [8]:
# Date/time columns
time_columns = ['dt_of_fr', 'dt_of_athrztn']

# Binary (Y/N) columns; `fr_yn` is the prediction target
binary_columns = [
    'fr_yn', 'mlt_us_yn', 'fr_fghtng_fclt_spcl_css_5_yn', 'fr_fghtng_fclt_spcl_css_6_yn',
    'dngrs_thng_yn', 'slf_fr_brgd_yn', 'blk_dngrs_thng_mnfctr_yn', 'cltrl_hrtg_yn',
]

# Categorical columns
category_columns = [
    'bldng_us', 'bldng_archtctr', 'bldng_us_clssfctn', 'jmk',
    'rgnl_ar_nm', 'rgnl_ar_nm2', 'lnd_us_sttn_nm', 'rd_sd_nm',
    'emd_nm', 'trgt_crtr', 'us_yn',
]

# Integer-valued columns
integer_columns = [
    'bldng_cnt', 'ttl_grnd_flr', 'ttl_dwn_flr', 'wnd_drctn', 'hmdt', 'hm_cnt',
    'fr_sttn_dstnc', 'bldng_ar_prc', 'fr_wthr_fclt_dstnc', 'fr_mn_cnt', 'cctv_dstnc',
    'fr_wthr_fclt_in_100m', 'cctv_in_100m', 'tbc_rtl_str_dstnc', 'sft_emrgnc_bll_dstnc',
    'ahsm_dstnc', 'no_tbc_zn_dstnc', 'bldng_cnt_in_50m',
]

# Floating-point columns
floats_columns = ['bldng_ar', 'ttl_ar', 'lnd_ar', 'tmprtr', 'prcpttn', 'wnd_spd']

# Monthly gas / electricity usage columns for 2014-01 .. 2018-12, ordered
# gas_YYYYMM, ele_YYYYMM within each month (120 names in total).
gas_ele_columns = [
    '{}_engry_us_{}{:02d}'.format(source, year, month)
    for year in range(2014, 2019)
    for month in range(1, 13)
    for source in ('gas', 'ele')
]

# `lw_*` indicator columns
lw_columns = [
    'lw_13101010', 'lw_13101110', 'lw_13101210', 'lw_13101211', 'lw_13101310',
    'lw_13101410', 'lw_13111010', 'lw_13111110', 'lw_13121010', 'lw_13121011',
    'lw_13131010', 'lw_13131110', 'lw_13141010', 'lw_13141011',
]

In [9]:
useless_columns = ['id']

In [10]:
len(time_columns + binary_columns + category_columns + integer_columns + floats_columns + gas_ele_columns + lw_columns + useless_columns)

180

In [11]:
df_all.drop(useless_columns, axis=1, inplace=True)

## 3.2. 데이터 타입 정정 & 결측값 대체

### 3.2.1 binary

In [12]:
# Encode the Y/N flag columns (target included) as 1/0; values absent from the mapping become NaN.
binary_y = dict(N=0, Y=1)

df_all[binary_columns] = df_all[binary_columns].apply(lambda flags: flags.map(binary_y))

In [13]:
df_all[binary_columns]

Unnamed: 0,fr_yn,mlt_us_yn,fr_fghtng_fclt_spcl_css_5_yn,fr_fghtng_fclt_spcl_css_6_yn,dngrs_thng_yn,slf_fr_brgd_yn,blk_dngrs_thng_mnfctr_yn,cltrl_hrtg_yn
0,1.0,0,,,,,,
1,0.0,0,,,,,,
2,1.0,0,,,,,,
3,0.0,0,,,,,,
4,0.0,0,,,,,,
...,...,...,...,...,...,...,...,...
69049,,0,,,,,,
69050,,0,,,,,,
69051,,0,,,,,,
69052,,0,,,,,,


In [14]:
df_all[binary_columns].dtypes

fr_yn                           float64
mlt_us_yn                         int64
fr_fghtng_fclt_spcl_css_5_yn    float64
fr_fghtng_fclt_spcl_css_6_yn    float64
dngrs_thng_yn                   float64
slf_fr_brgd_yn                  float64
blk_dngrs_thng_mnfctr_yn        float64
cltrl_hrtg_yn                   float64
dtype: object

In [15]:
# Replace missing values
# Every NaN in every column of df_all becomes -1; this includes the all-NaN `fr_yn` of the test rows.

df_all.fillna(-1, inplace=True)

In [16]:
df_all[binary_columns] = df_all[binary_columns].astype(np.int64)

In [17]:
df_all[binary_columns].dtypes

fr_yn                           int64
mlt_us_yn                       int64
fr_fghtng_fclt_spcl_css_5_yn    int64
fr_fghtng_fclt_spcl_css_6_yn    int64
dngrs_thng_yn                   int64
slf_fr_brgd_yn                  int64
blk_dngrs_thng_mnfctr_yn        int64
cltrl_hrtg_yn                   int64
dtype: object

### 3.2.2. category

In [18]:
df_all[category_columns].dtypes

bldng_us             object
bldng_archtctr       object
bldng_us_clssfctn    object
jmk                  object
rgnl_ar_nm           object
rgnl_ar_nm2          object
lnd_us_sttn_nm       object
rd_sd_nm             object
emd_nm               object
trgt_crtr            object
us_yn                object
dtype: object

In [19]:
"""
# 범주형 변수 label encoding

features = []

for col in category_columns:
    df_all[col], _ = df_all[col].factorize(na_sentinel=-1)
"""

'\n# 범주형 변수 label encoding\n\nfeatures = []\n\nfor col in category_columns:\n    df_all[col], _ = df_all[col].factorize(na_sentinel=-1)\n'

In [20]:
df_all[category_columns] = df_all[category_columns].astype('category')

In [21]:
df_all[category_columns].dtypes

bldng_us             category
bldng_archtctr       category
bldng_us_clssfctn    category
jmk                  category
rgnl_ar_nm           category
rgnl_ar_nm2          category
lnd_us_sttn_nm       category
rd_sd_nm             category
emd_nm               category
trgt_crtr            category
us_yn                category
dtype: object

### 3.2.3 integer

In [22]:
df_all[integer_columns].dtypes

bldng_cnt                 int64
ttl_grnd_flr            float64
ttl_dwn_flr             float64
wnd_drctn               float64
hmdt                    float64
hm_cnt                  float64
fr_sttn_dstnc             int64
bldng_ar_prc            float64
fr_wthr_fclt_dstnc        int64
fr_mn_cnt               float64
cctv_dstnc                int64
fr_wthr_fclt_in_100m      int64
cctv_in_100m              int64
tbc_rtl_str_dstnc         int64
sft_emrgnc_bll_dstnc      int64
ahsm_dstnc                int64
no_tbc_zn_dstnc           int64
bldng_cnt_in_50m          int64
dtype: object

In [23]:
df_all[integer_columns] = df_all[integer_columns].astype(np.int64)

In [24]:
df_all[integer_columns].dtypes

bldng_cnt               int64
ttl_grnd_flr            int64
ttl_dwn_flr             int64
wnd_drctn               int64
hmdt                    int64
hm_cnt                  int64
fr_sttn_dstnc           int64
bldng_ar_prc            int64
fr_wthr_fclt_dstnc      int64
fr_mn_cnt               int64
cctv_dstnc              int64
fr_wthr_fclt_in_100m    int64
cctv_in_100m            int64
tbc_rtl_str_dstnc       int64
sft_emrgnc_bll_dstnc    int64
ahsm_dstnc              int64
no_tbc_zn_dstnc         int64
bldng_cnt_in_50m        int64
dtype: object

### 3.2.4 floats

In [25]:
df_all[floats_columns].dtypes

bldng_ar    float64
ttl_ar      float64
lnd_ar      float64
tmprtr      float64
prcpttn     float64
wnd_spd     float64
dtype: object

In [26]:
df_all[time_columns].dtypes

dt_of_fr         object
dt_of_athrztn    object
dtype: object

# 4. 피처 엔지니어링

## 4.1. 날짜 데이터 처리

In [27]:
# Keep the first four characters of each dt_of_athrztn value as an int
# (presumably the year of a YYYYMMDD value). The -1 fill value passes through as -1.
dt_of_athrztn_list = [int(str(raw_date)[:4]) for raw_date in df_all['dt_of_athrztn']]

In [28]:
# Parse the fire timestamp once; the original re-parsed the whole column for each component.
dt_of_fr_parsed = pd.to_datetime(df_all['dt_of_fr'])
fe_datetime_names = ['dt_of_fr_year', 'dt_of_fr_month', 'dt_of_fr_day', 'dt_of_fr_hour']
fe_datetime = pd.DataFrame(
    {
        'dt_of_fr_year': dt_of_fr_parsed.dt.year,
        'dt_of_fr_month': dt_of_fr_parsed.dt.month,
        'dt_of_fr_day': dt_of_fr_parsed.dt.day,
        'dt_of_fr_hour': dt_of_fr_parsed.dt.hour,
    },
    columns=fe_datetime_names,  # pin the column order explicitly
)
fe_datetime.head()

Unnamed: 0,dt_of_fr_year,dt_of_fr_month,dt_of_fr_day,dt_of_fr_hour
0,2017,10,20,5
1,2018,9,30,8
2,2016,10,30,14
3,2016,6,14,5
4,2018,4,22,5


In [29]:
# Fire year minus authorization year; rows whose authorization year is negative
# (the -1 fill value) get -1 instead.
from_athrztn_to_fire_list = [
    fire_year - athrztn_year if athrztn_year >= 0 else -1
    for fire_year, athrztn_year in zip(fe_datetime['dt_of_fr_year'], dt_of_athrztn_list)
]

In [30]:
fe_datetime['from_athrztn_to_fr'] = from_athrztn_to_fire_list
fe_datetime.head()

Unnamed: 0,dt_of_fr_year,dt_of_fr_month,dt_of_fr_day,dt_of_fr_hour,from_athrztn_to_fr
0,2017,10,20,5,40
1,2018,9,30,8,-1
2,2016,10,30,14,16
3,2016,6,14,5,80
4,2018,4,22,5,-1


In [31]:
df_all.drop(['dt_of_fr', 'dt_of_athrztn'], axis=1, inplace=True)

In [32]:
df_all = pd.concat([df_all, fe_datetime], axis=1)

## 4.2. 범위가 큰 데이터의 log 변환

In [33]:
def log_scale(data, cols):
    """Natural-log transform the given columns of `data` in place.

    Per value: >= 1 becomes ln(value) rounded to 3 decimals, values in [0, 1)
    become 0, and everything else (negative numbers, NaN) becomes -1.

    Parameters
    ----------
    data : DataFrame whose columns are overwritten.
    cols : iterable of column names to transform.

    Returns
    -------
    The same `data` object, after modification.
    """
    def _scaled(num):
        if num >= 1:
            return round(np.log(num), 3)
        if num >= 0:
            return 0
        return -1

    for col in cols:
        data[col] = [_scaled(num) for num in data[col]]
    return data

In [34]:
log_columns = ['bldng_ar', 'ttl_ar', 'lnd_ar', 'hm_cnt', 'fr_sttn_dstnc', 'bldng_ar_prc', 'fr_wthr_fclt_dstnc', 'tbc_rtl_str_dstnc',
               'ahsm_dstnc', 'no_tbc_zn_dstnc']

In [35]:
df_all[log_columns]

Unnamed: 0,bldng_ar,ttl_ar,lnd_ar,hm_cnt,fr_sttn_dstnc,bldng_ar_prc,fr_wthr_fclt_dstnc,tbc_rtl_str_dstnc,ahsm_dstnc,no_tbc_zn_dstnc
0,69.42,69.42,0.0,17360,4547,-1,133,1891,11322,88
1,46.29,46.29,0.0,1791,6388,122581,489,4533,3369,7727
2,583.80,2516.76,1446.0,17285,3340,618105,143,277,12451,72
3,48.92,48.92,0.0,7327,179,719542,1585,438,407,508
4,0.00,0.00,0.0,17278,4822,-1,603,1702,12487,707
...,...,...,...,...,...,...,...,...,...,...
69049,122.45,329.55,288.0,79708,1150,1119709,56,321,16240,183
69050,52.89,52.89,0.0,42093,1472,2495746,370,903,3266,590
69051,166.05,466.82,277.3,84290,885,1146599,68,275,16485,105
69052,250.00,240.00,827.8,40703,1437,-1,97,159,20055,806


In [36]:
df_all = log_scale(df_all, log_columns)

## 4.3. gas_ele 데이터 처리

In [37]:
df_ele_gas = df_all[gas_ele_columns]
df_ele_gas.head()

Unnamed: 0,gas_engry_us_201401,ele_engry_us_201401,gas_engry_us_201402,ele_engry_us_201402,gas_engry_us_201403,ele_engry_us_201403,gas_engry_us_201404,ele_engry_us_201404,gas_engry_us_201405,ele_engry_us_201405,...,gas_engry_us_201808,ele_engry_us_201808,gas_engry_us_201809,ele_engry_us_201809,gas_engry_us_201810,ele_engry_us_201810,gas_engry_us_201811,ele_engry_us_201811,gas_engry_us_201812,ele_engry_us_201812
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


### 4.3.1 gas_ele 총합

In [38]:
# Row-wise total over the gas/electricity columns (-1 fill values are part of the sum).
# assign() returns a new frame, so there is no SettingWithCopyWarning, and summing only
# `gas_ele_columns` keeps a re-run from folding an earlier gas_ele_sum into the total.
df_ele_gas = df_ele_gas.assign(gas_ele_sum=df_ele_gas[gas_ele_columns].sum(axis=1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [39]:
# Log-transform the row totals: an exact 0 stays 0, negative totals become -1,
# anything else becomes ln(total) rounded to 5 decimals.
ele_gas_log_list = [
    0 if total == 0.0 else (-1 if total < 0 else round(np.log(total), 5))
    for total in df_ele_gas['gas_ele_sum']
]

In [40]:
df_all['gas_ele_sum'] = ele_gas_log_list

### 4.3.2 gas_ele 0 총합

In [41]:
# Per row, count how many of the gas/electricity columns are exactly 0.
df_all['gas_ele_zero_sum'] = (df_all[gas_ele_columns] == 0).sum(axis=1)

In [42]:
# Recode a zero-count of 120 (every gas/electricity column is 0) to 121.
# Assigned back because replace(inplace=True) on a column selection is not guaranteed
# to write through to df_all.
# NOTE(review): the notebook does not say why 120 is recoded to 121 — confirm the intent.
df_all['gas_ele_zero_sum'] = df_all['gas_ele_zero_sum'].replace(120, 121)

## 4.4. lw 데이터 처리

In [43]:
df_lw = df_all[lw_columns]
df_lw.head()

Unnamed: 0,lw_13101010,lw_13101110,lw_13101210,lw_13101211,lw_13101310,lw_13101410,lw_13111010,lw_13111110,lw_13121010,lw_13121011,lw_13131010,lw_13131110,lw_13141010,lw_13141011
0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [44]:
# Row-wise total over the lw_* columns (-1 fill values are part of the sum).
# assign() returns a new frame, so there is no SettingWithCopyWarning, and summing only
# `lw_columns` keeps a re-run from folding an earlier lw_sum into the total.
df_lw = df_lw.assign(lw_sum=df_lw[lw_columns].sum(axis=1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [45]:
# Collapse every negative row total to -1; non-negative totals are kept unchanged.
lw_list = [-1 if total < 0 else total for total in df_lw['lw_sum']]

In [46]:
df_all['lw_sum'] = lw_list

In [47]:
df_all.drop(lw_columns, axis=1, inplace=True)

## 4.5. 파생 변수 (결측값의 총합, 이진 변수의 합, Target Encoding)

### 4.5.1 결측값의 총합

In [48]:
# Per row, count how many of the listed columns equal -1 and store the count as float.
# NOTE(review): -1 is the fillna placeholder, but log_scale also writes -1 for negative inputs and
# from_athrztn_to_fr can presumably be -1 as a real difference — confirm this count means "missing".
df_all['missing'] = (df_all[binary_columns + category_columns + integer_columns + floats_columns + ['lw_sum', 'gas_ele_sum', 'from_athrztn_to_fr']] == -1).sum(axis=1).astype(float)

# 5. 데이터 저장

In [49]:
df_all.columns

Index(['ahsm_dstnc', 'bldng_ar', 'bldng_ar_prc', 'bldng_archtctr', 'bldng_cnt',
       'bldng_cnt_in_50m', 'bldng_us', 'bldng_us_clssfctn',
       'blk_dngrs_thng_mnfctr_yn', 'cctv_dstnc',
       ...
       'wnd_spd', 'dt_of_fr_year', 'dt_of_fr_month', 'dt_of_fr_day',
       'dt_of_fr_hour', 'from_athrztn_to_fr', 'gas_ele_sum',
       'gas_ele_zero_sum', 'lw_sum', 'missing'],
      dtype='object', length=172)

In [50]:
features = ['ahsm_dstnc', 'bldng_ar', 'bldng_ar_prc', 'bldng_archtctr', 'bldng_cnt',
           'bldng_cnt_in_50m', 'bldng_us', 'bldng_us_clssfctn',
           'blk_dngrs_thng_mnfctr_yn', 'cctv_dstnc', 'cctv_in_100m',
           'cltrl_hrtg_yn', 'dngrs_thng_yn', 'emd_nm',
           'fr_fghtng_fclt_spcl_css_5_yn', 'fr_fghtng_fclt_spcl_css_6_yn',
           'fr_mn_cnt', 'fr_sttn_dstnc', 'fr_wthr_fclt_dstnc',
           'fr_wthr_fclt_in_100m', 'fr_yn', 'hm_cnt', 'hmdt', 'jmk', 'lnd_ar',
           'lnd_us_sttn_nm', 'mlt_us_yn', 'no_tbc_zn_dstnc', 'prcpttn', 'rd_sd_nm',
           'rgnl_ar_nm', 'rgnl_ar_nm2', 'sft_emrgnc_bll_dstnc', 'slf_fr_brgd_yn',
           'tbc_rtl_str_dstnc', 'tmprtr', 'trgt_crtr', 'ttl_ar',
           'ttl_grnd_flr', 'ttl_dwn_flr', 'us_yn', 'wnd_drctn', 'wnd_spd', 'dt_of_fr_year',
           'dt_of_fr_month', 'dt_of_fr_day', 'dt_of_fr_hour', 'from_athrztn_to_fr',
           'gas_ele_zero_sum', 'lw_sum', 'missing', 'gas_ele_sum']

In [51]:
df_all = df_all[features]

In [52]:
# Label-encode the categorical columns as integer codes.
# The `na_sentinel=-1` argument is dropped: -1 is already the default and the keyword no longer
# exists in pandas 2.x. assign() builds a new frame, so writing the codes does not trigger
# SettingWithCopyWarning on the column-selected df_all.
encoded_categories = {col: df_all[col].factorize()[0] for col in category_columns}
df_all = df_all.assign(**encoded_categories)

In [53]:
# Cumulative row boundaries of the three splits inside df_all (train, then valid, then test).
train_len = len(df_train)
valid_len = train_len + len(df_valid)
test_len = valid_len + len(df_test)

# .copy() gives each split its own data; without it the splits are slices of df_all and later
# column assignments on them raise SettingWithCopyWarning.
df_train = df_all.iloc[:train_len].copy()
df_valid = df_all.iloc[train_len:valid_len].copy()
df_test = df_all.iloc[valid_len:test_len].copy()

# LightGBM

In [54]:
def plot_feature_importance(model, X_train, figsize=(12, 6)):
    """Show a horizontal bar chart of the model's feature importances.

    Importances are rescaled so the largest equals 100 and drawn in ascending order.

    Parameters
    ----------
    model : fitted estimator exposing `feature_importances_`.
    X_train : frame whose `.columns` label the bars (assumed to be in the same
        order as the importances — confirm against the fitting call).
    figsize : matplotlib figure size.
    """
    sns.set_style('darkgrid')

    # Scale relative to the strongest feature
    raw_importance = model.feature_importances_
    relative_importance = 100.0 * (raw_importance / raw_importance.max())
    ascending = np.argsort(relative_importance)
    bar_centers = np.arange(ascending.shape[0]) + .5

    plt.figure(figsize=figsize)
    plt.barh(bar_centers, relative_importance[ascending], align='center')
    plt.yticks(bar_centers, X_train.columns[ascending])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()

def f1_score_optimizer(model, X_test, y_test):
    """Search for the decision threshold that maximises F1 for `model` on (X_test, y_test).

    The first column of `model.predict_proba(X_test)` is compared against 41 evenly
    spaced thresholds in [0.35, 0.75]: a row is labelled 0 when that probability is
    >= the threshold and 1 otherwise.

    Returns
    -------
    (best_threshold, best_f1); on ties the lowest threshold wins. The pair is also printed.
    """
    # The probabilities do not depend on the threshold, so predict once instead of 41 times.
    prob_first_class = model.predict_proba(X_test)[:, 0]

    scores = []
    thds = []
    for thd in tqdm(np.linspace(0.35, 0.75, 41)):
        y_pred = [0 if prob >= thd else 1 for prob in prob_first_class]
        scores.append(f1_score(y_test, y_pred))
        thds.append(thd)

    best_idx = scores.index(max(scores))
    print(thds[best_idx], scores[best_idx])
    print()
    return thds[best_idx], scores[best_idx]

def f1_score_optimizer_2(y_valid, y_prob):
    """Search for the F1-maximising threshold given precomputed probabilities.

    `y_prob` is compared against 41 evenly spaced thresholds in [0.35, 0.75]: an entry
    is labelled 0 when it is >= the threshold and 1 otherwise.

    Returns
    -------
    (best_threshold, best_f1); on ties the lowest threshold wins. The pair is also printed.
    """
    candidates = []
    for thd in tqdm(np.linspace(0.35, 0.75, 41)):
        labels = [int(not prob >= thd) for prob in y_prob]
        candidates.append((thd, f1_score(y_valid, labels)))

    # max() keeps the first of equally good candidates, i.e. the lowest threshold
    best_thd, best_score = max(candidates, key=lambda pair: pair[1])
    print(best_thd, best_score)
    print()
    return best_thd, best_score

def fit_lgb(X_fit, y_fit, X_val, y_val):
    """Train one LightGBM classifier for a CV fold and score the test matrix.

    The model is fitted on (X_fit, y_fit) with (X_val, y_val) as the early-stopping
    evaluation set, appended to the module-level `LB_MODELS` list, and then used to
    predict the module-level `X_test`.

    Returns
    -------
    The first column of `predict_proba(X_test)`.

    NOTE(review): `X_test` and `LB_MODELS` are read from the notebook's global scope.
    NOTE(review): the captured training log reports only binary_logloss, so the
    'f1_score' eval_metric string appears to have no effect — confirm against the
    LightGBM metric list.
    """
    lgb_params = dict(
        learning_rate=0.02,
        num_leaves=31,
        max_bin=1023,
        min_child_samples=1000,
        reg_alpha=0.1,
        reg_lambda=0.2,
        feature_fraction=1.0,
        bagging_freq=1,
        bagging_fraction=0.85,
        objective='binary',
        n_jobs=-1,
        n_estimators=500,
        tree_learner='serial',
    )
    classifier = lgb.LGBMClassifier(**lgb_params)

    classifier = classifier.fit(
        X_fit,
        y_fit,
        eval_set=(X_val, y_val),
        verbose=20,
        eval_metric='f1_score',
        early_stopping_rounds=25,
    )
    LB_MODELS.append(classifier)

    return classifier.predict_proba(X_test)[:, 0]

In [55]:
lg_thds, lg_scores = [], []
lb_f1_eval_list = []
LB_MODELS = []

# Number of CV folds; also the divisor used to average the per-fold test predictions.
N_SPLITS = 5

category_columns = ['bldng_us', 'bldng_archtctr', 'bldng_us_clssfctn', 'jmk', 'rgnl_ar_nm', 'rgnl_ar_nm2', 'lnd_us_sttn_nm', 'rd_sd_nm',
                    'emd_nm', 'trgt_crtr', 'us_yn']

# astype() with a mapping returns new frames, so casting does not write into slices of df_all
# (the original column assignment raised SettingWithCopyWarning here).
category_dtypes = {col: 'category' for col in category_columns}
df_train = df_train.astype(category_dtypes)
df_valid = df_valid.astype(category_dtypes)
df_test = df_test.astype(category_dtypes)

X_train = df_train.drop(['fr_yn'], axis=1)
y_train = df_train['fr_yn']
X_valid = df_valid.drop(['fr_yn'], axis=1)
y_valid = df_valid['fr_yn']

#X_train = pd.concat([df_train, df_valid], axis=0).drop(['fr_yn'], axis=1)
#y_train = pd.concat([df_train, df_valid], axis=0)['fr_yn']

X_test = df_test.drop(['fr_yn'], axis=1)
y_test = df_test['fr_yn']

X = X_train
y = y_train

# Accumulator for the fold-averaged test prediction returned by fit_lgb
lgb_cv_result = np.zeros(df_test.shape[0])

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=40)

print('\nModel Fitting...')
for fold_, (train_indexes, valid_indexes) in enumerate(skf.split(X, y)):
    print('Fold:', fold_ )
    # skf.split yields positions, so index the labels with .iloc; plain y[...] is label-based
    # and only matched because the training rows happen to carry a 0..n-1 index.
    X_fit, y_fit = X.iloc[train_indexes, :], y.iloc[train_indexes]
    X_val, y_val = X.iloc[valid_indexes, :], y.iloc[valid_indexes]

    print('LigthGBM')
    lgb_cv_result += fit_lgb(X_fit, y_fit, X_val, y_val)

    del X_fit, X_val, y_fit, y_val
    gc.collect()

lgb_cv_result /= N_SPLITS

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]



Model Fitting...
Fold: 0
LigthGBM
Training until validation scores don't improve for 25 rounds.
[20]	valid_0's binary_logloss: 0.316198
[40]	valid_0's binary_logloss: 0.282428
[60]	valid_0's binary_logloss: 0.263648
[80]	valid_0's binary_logloss: 0.252656
[100]	valid_0's binary_logloss: 0.245893
[120]	valid_0's binary_logloss: 0.241616
[140]	valid_0's binary_logloss: 0.238828
[160]	valid_0's binary_logloss: 0.236969
[180]	valid_0's binary_logloss: 0.235797
[200]	valid_0's binary_logloss: 0.234932
[220]	valid_0's binary_logloss: 0.234318
[240]	valid_0's binary_logloss: 0.233896
[260]	valid_0's binary_logloss: 0.233584
[280]	valid_0's binary_logloss: 0.233431
[300]	valid_0's binary_logloss: 0.233359
[320]	valid_0's binary_logloss: 0.233192
[340]	valid_0's binary_logloss: 0.233094
[360]	valid_0's binary_logloss: 0.233073
[380]	valid_0's binary_logloss: 0.233103
Early stopping, best iteration is:
[356]	valid_0's binary_logloss: 0.233034
Fold: 1
LigthGBM
Training until validation scores do

# NN

In [56]:
def nn_model(num_features=None):
    """Build and compile the embedding + dense network for binary classification.

    Each categorical feature gets its own 1-wide integer input that passes through
    Embedding -> Reshape -> Dropout(0.25). A further dense input goes through Dense(51).
    All branches are concatenated and fed to two Dense/PReLU/BatchNorm/Dropout blocks
    (512 then 64 units) and a sigmoid output.

    Parameters
    ----------
    num_features : int, optional
        Width of the dense (non-embedding) input. Defaults to `X_train.shape[1]`, read
        from the notebook's global scope as the original did.

    Returns
    -------
    A compiled Keras `Model` (binary cross-entropy, Adam, accuracy metric). Its inputs are
    ordered as the eleven categorical inputs below followed by the dense input.
    """
    # (feature, vocabulary size, embedding width). The vocabulary sizes are hard-coded and
    # must exceed the largest integer code of the corresponding column.
    embedding_specs = [
        ('bldng_us', 34, 16),
        ('bldng_archtctr', 18, 8),
        ('bldng_us_clssfctn', 7, 4),
        ('jmk', 25, 12),
        ('rgnl_ar_nm', 23, 10),
        ('rgnl_ar_nm2', 19, 8),
        ('lnd_us_sttn_nm', 45, 24),
        ('rd_sd_nm', 13, 6),
        ('emd_nm', 371, 150),
        ('trgt_crtr', 17, 8),
        ('us_yn', 3, 2),
    ]

    inputs = []
    embeddings = []
    for _feature, vocab_size, embed_dim in embedding_specs:
        cat_input = Input(shape=(1,))
        embedding = Embedding(vocab_size, embed_dim, input_length=1)(cat_input)
        embedding = Reshape(target_shape=(embed_dim,))(embedding)
        embedding = Dropout(0.25)(embedding)
        inputs.append(cat_input)
        embeddings.append(embedding)

    if num_features is None:
        num_features = X_train.shape[1]
    input_numeric = Input(shape=(num_features, ))
    embedding_numeric = Dense(51)(input_numeric)
    embeddings.append(embedding_numeric)
    inputs.append(input_numeric)

    print('flatten')
    flatten = Concatenate()(embeddings)

    # Keras 2 keyword names (kernel_initializer / inputs / outputs) replace the Keras 1
    # spellings (init / input / output), which only worked through a legacy shim.
    fc1 = Dense(512, kernel_initializer='he_normal')(flatten)
    fc1 = PReLU()(fc1)
    fc1 = BatchNormalization()(fc1)
    fc1 = Dropout(0.75)(fc1)

    fc1 = Dense(64, kernel_initializer='he_normal')(fc1)
    fc1 = PReLU()(fc1)
    fc1 = BatchNormalization()(fc1)
    fc1 = Dropout(0.5)(fc1)

    outputs = Dense(1, kernel_initializer='he_normal', activation='sigmoid')(fc1)
    print('model')
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return model

In [57]:
# Separate the target from the features (rebinds df_all instead of dropping in place on a
# column-selected frame).
y = df_all['fr_yn']
df_all = df_all.drop('fr_yn', axis=1)

cat_fea = ['bldng_us', 'bldng_archtctr', 'bldng_us_clssfctn', 'jmk', 'rgnl_ar_nm', 'rgnl_ar_nm2', 'lnd_us_sttn_nm', 'rd_sd_nm',
                        'emd_nm', 'trgt_crtr', 'us_yn']
num_fea = [col for col in df_all.columns if col not in cat_fea]

# Positional train / valid / test split, using the boundaries computed earlier
train_cat = df_all.iloc[:train_len][cat_fea]
train_num = df_all.iloc[:train_len][num_fea]
y_train = y.iloc[:train_len]

valid_cat = df_all.iloc[train_len:valid_len][cat_fea]
valid_num = df_all.iloc[train_len:valid_len][num_fea]
y_valid = y.iloc[train_len:valid_len]

test_cat = df_all.iloc[valid_len:test_len][cat_fea]
test_num = df_all.iloc[valid_len:test_len][num_fea]
y_test = y.iloc[valid_len:]

# Largest integer code per categorical feature
max_cat_values = [np.max(df_all[col]) for col in cat_fea]

train_list = [train_num, train_cat]
valid_list = [valid_num, valid_cat]
test_list = [test_num, test_cat]

# Dense [numeric | categorical-code] matrices. The frames are already dense, so they are stacked
# with numpy directly instead of going through a scipy.sparse round trip.
X_train = np.hstack([frame.values for frame in train_list])
X_valid = np.hstack([frame.values for frame in valid_list])
X_test = np.hstack([frame.values for frame in test_list])

all_data = np.vstack([X_train, X_valid, X_test])

# NOTE(review): the scaler is fitted on train + valid + test together, so validation/test
# statistics influence the training inputs — confirm this is intended.
scaler = StandardScaler()
scaler.fit(all_data)

X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

# DataFrame.as_matrix() no longer exists in pandas >= 1.0; .values works on old and new versions.
X_train_cat = train_cat.values
X_valid_cat = valid_cat.values
X_test_cat = test_cat.values


def _to_model_inputs(cat_matrix, scaled_matrix):
    """Return one (n, 1) array per categorical column followed by the scaled matrix."""
    columns = [cat_matrix[:, i].reshape(-1, 1) for i in range(cat_matrix.shape[1])]
    columns.append(scaled_matrix)
    return columns


x_valid_cat = _to_model_inputs(X_valid_cat, X_valid)
x_test_cat = _to_model_inputs(X_test_cat, X_test)


xtr = X_train
ytr = y_train

xvl = X_valid
yvl = y_valid

xts = X_test
yts = y_test


xtr_cat = X_train_cat
xvl_cat = X_valid_cat
xts_cat = X_test_cat

# Extract the categorical columns and combine them with the scaled matrix as model inputs
xtr_cat_list = _to_model_inputs(xtr_cat, xtr)
xvl_cat_list = _to_model_inputs(xvl_cat, xvl)
xts_cat_list = _to_model_inputs(xts_cat, xts)



In [58]:
# Build the neural-network model
model = nn_model()

# Train the model; stop when val_acc has not improved for 5 consecutive epochs
early_stopping = EarlyStopping(monitor='val_acc', patience=5, verbose=1, mode='auto')
model.fit(xtr_cat_list, ytr, epochs=30, batch_size=512, verbose=2, validation_data=[xvl_cat_list, yvl], callbacks=[early_stopping])

flatten




model




Train on 59199 samples, validate on 6898 samples
Epoch 1/30
 - 4s - loss: 0.6524 - acc: 0.7105 - val_loss: 0.5866 - val_acc: 0.7296
Epoch 2/30
 - 1s - loss: 0.4119 - acc: 0.8447 - val_loss: 0.5360 - val_acc: 0.7562
Epoch 3/30
 - 1s - loss: 0.3308 - acc: 0.8711 - val_loss: 0.4252 - val_acc: 0.8197
Epoch 4/30
 - 1s - loss: 0.3034 - acc: 0.8776 - val_loss: 0.4140 - val_acc: 0.8226
Epoch 5/30
 - 1s - loss: 0.2794 - acc: 0.8818 - val_loss: 0.4016 - val_acc: 0.8295
Epoch 6/30
 - 1s - loss: 0.2671 - acc: 0.8850 - val_loss: 0.3859 - val_acc: 0.8443
Epoch 7/30
 - 1s - loss: 0.2567 - acc: 0.8885 - val_loss: 0.3851 - val_acc: 0.8449
Epoch 8/30
 - 1s - loss: 0.2533 - acc: 0.8877 - val_loss: 0.3808 - val_acc: 0.8471
Epoch 9/30
 - 1s - loss: 0.2474 - acc: 0.8893 - val_loss: 0.3825 - val_acc: 0.8407
Epoch 10/30
 - 1s - loss: 0.2445 - acc: 0.8893 - val_loss: 0.3780 - val_acc: 0.8463
Epoch 11/30
 - 1s - loss: 0.2413 - acc: 0.8906 - val_loss: 0.3871 - val_acc: 0.8401
Epoch 12/30
 - 1s - loss: 0.2386 - a

<keras.callbacks.History at 0x1f281abf908>