In [0]:
# 라이브러리를 불러온다
import xgboost as xgb
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd


In [0]:
# XGBoost 모델 설정값 지정
eta = 0.1
max_depth = 6
subsample = 0.9
colsample_bytree = 0.85
min_child_weight = 55
num_boost_round = 500

params = {"objective": "reg:linear",
          "booster": "gbtree",
          "eta": eta,
          "max_depth": int(max_depth),
          "subsample": subsample,
          "colsample_bytree": colsample_bytree,
          "min_child_weight": min_child_weight,
          "silent": 1
          }



In [0]:
# 훈련 데이터, 테스트 데이터 불러와서 하나로 통합한다
train = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Porto Seguro/train.csv')
train_label = train['target']
train_id = train['id']
del train['target'], train['id']

test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Porto Seguro/test.csv')
test_id = test['id']
del test['id']

data = train.append(test)
data.reset_index(inplace=True)
train_rows = train.shape[0]

#

In [9]:
# 파생 변수를 생성한다
feature_results = []

for target_g in ['car', 'ind', 'reg']:
    # target_g는 예측 대상 (target_list)로 사용하고, 그 외 대분류는 학습 변수(features)로 사용한다
    features = [x for x in list(data) if target_g not in x]
    target_list = [x for x in list(data) if target_g in x]
    train_fea = np.array(data[features])
    for target in target_list:
        print(target)
        train_label = data[target]
        # 데이터를 5개로 분리하여, 모든 데이터에 대한 예측값을 계산한다
        kfold = KFold(n_splits=5, random_state=218, shuffle=True)
        kf = kfold.split(data)
        cv_train = np.zeros(shape=(data.shape[0], 1))
        for i, (train_fold, validate) in enumerate(kf):
            X_train, X_validate, label_train, label_validate = \
                train_fea[train_fold, :], train_fea[validate, :], train_label[train_fold], train_label[validate]
            dtrain = xgb.DMatrix(X_train, label_train)
            dvalid = xgb.DMatrix(X_validate, label_validate)
            watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
            # XGBoost 모델을 학습한다
            bst = xgb.train(params, dtrain, num_boost_round, evals=watchlist, verbose_eval=50, early_stopping_rounds=10)
            # 예측 결과물을 저장한다
            cv_train[validate, 0] += bst.predict(xgb.DMatrix(X_validate), ntree_limit=bst.best_ntree_limit)
        feature_results.append(cv_train)


ps_car_01_cat
[0]	train-rmse:7.44174	valid-rmse:7.4461
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 10 rounds.
[50]	train-rmse:2.30731	valid-rmse:2.31275
[100]	train-rmse:2.29693	valid-rmse:2.30478
[150]	train-rmse:2.29236	valid-rmse:2.30273
[200]	train-rmse:2.2884	valid-rmse:2.3015
[250]	train-rmse:2.28489	valid-rmse:2.30056
Stopping. Best iteration:
[251]	train-rmse:2.28482	valid-rmse:2.30053

[0]	train-rmse:7.44389	valid-rmse:7.44083
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 10 rounds.
[50]	train-rmse:2.30727	valid-rmse:2.31025
[100]	train-rmse:2.29723	valid-rmse:2.30266
[150]	train-rmse:2.29311	valid-rmse:2.30091
[200]	train-rmse:2.28891	valid-rmse:2.29953
[250]	train-rmse:2.28601	valid-rmse:2.29874
[300]	train-rmse:2.28295	valid-rmse:2.29814
Stopping. Best iteration:
[329]	train-rmse:2.28122	valid-rmse:

KeyboardInterrupt: ignored

In [11]:
# 예측 결과물을 훈련, 테스트 데이터로 분리한 후, pickle로 저장한다
feature_results = np.hstack(feature_results)
train_features = feature_results[:train_rows, :]
test_features = feature_results[train_rows:, :]

import pickle
pickle.dump([train_features, test_features], open('/content/drive/My Drive/Colab Notebooks/Porto Seguro/fea0.pk', 'wb'))

ValueError: ignored

In [12]:
# 인공 신경망 모델 keras 라이브러리 읽어오기
from keras.layers import Dense, Dropout, Embedding, Flatten, Input, merge
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.models import Model

# 시간 측정 및 압축파일을 읽어오기 위한 라이브러리
from time import time
import datetime
from itertools import combinations
import pickle

# 피쳐 엔지니어링을 위한 라이브러리
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

######################
### UTIL FUNCTIONS ###
######################

def proj_num_on_cat(train_df, test_df, target_column, group_column):
    # train_df : 훈련 데이터
    # test_df : 테스트 데이터
    # target_column : 통계기반 파생 변수를 생성한 타겟 변수
    # group_column : 피봇(pivot)을 수행할 변수
    train_df['row_id'] = range(train_df.shape[0])
    test_df['row_id'] = range(test_df.shape[0])
    train_df['train'] = 1
    test_df['train'] = 0

    # 훈련 데이터와 테스트 데이터를 통합한다
    all_df = train_df[['row_id', 'train', target_column, group_column]].append(test_df[['row_id','train', target_column, group_column]])
    
    # group_column 기반으로 피봇한 target_column의 값을 구한다 
    grouped = all_df[[target_column, group_column]].groupby(group_column)

    # 빈도(size), 평균(mean), 표준편차(std), 중간값(median), 최대값(max), 최소값(min)을 구한다
    the_size = pd.DataFrame(grouped.size()).reset_index()
    the_size.columns = [group_column, '%s_size' % target_column]
    the_mean = pd.DataFrame(grouped.mean()).reset_index()
    the_mean.columns = [group_column, '%s_mean' % target_column]
    the_std = pd.DataFrame(grouped.std()).reset_index().fillna(0)
    the_std.columns = [group_column, '%s_std' % target_column]
    the_median = pd.DataFrame(grouped.median()).reset_index()
    the_median.columns = [group_column, '%s_median' % target_column]
    the_max = pd.DataFrame(grouped.max()).reset_index()
    the_max.columns = [group_column, '%s_max' % target_column]
    the_min = pd.DataFrame(grouped.min()).reset_index()
    the_min.columns = [group_column, '%s_min' % target_column]

    # 통계 기반 파생 변수를 취합한다
    the_stats = pd.merge(the_size, the_mean)
    the_stats = pd.merge(the_stats, the_std)
    the_stats = pd.merge(the_stats, the_median)
    the_stats = pd.merge(the_stats, the_max)
    the_stats = pd.merge(the_stats, the_min)
    all_df = pd.merge(all_df, the_stats, how='left')

    # 훈련 데이터와 테스트 데이터로 분리하여 반환한다
    selected_train = all_df[all_df['train'] == 1]
    selected_test = all_df[all_df['train'] == 0]
    selected_train.sort_values('row_id', inplace=True)
    selected_test.sort_values('row_id', inplace=True)
    selected_train.drop([target_column, group_column, 'row_id', 'train'], axis=1, inplace=True)
    selected_test.drop([target_column, group_column, 'row_id', 'train'], axis=1, inplace=True)
    selected_train, selected_test = np.array(selected_train), np.array(selected_test)
    return selected_train, selected_test


def interaction_features(train, test, fea1, fea2, prefix):
    # train : 훈련 데이터
    # test : 테스트 데이터
    # fea1, fea2 : 상호 작용을 수행할 변수 이름
    # prefix : 파생 변수의 변수 이름

    # 두 변수간의 곱셈/나눗셈 상호 작용에 대한 파생 변수를 생성한다
    train['inter_{}*'.format(prefix)] = train[fea1] * train[fea2]
    train['inter_{}/'.format(prefix)] = train[fea1] / train[fea2]

    test['inter_{}*'.format(prefix)] = test[fea1] * test[fea2]
    test['inter_{}/'.format(prefix)] = test[fea1] / test[fea2]

    return train, test





Using TensorFlow backend.


In [0]:
###########################
### FEATURE ENGINEERING ###
###########################

# 범주형 변수와 이진 변수 이름을 추출한다
cat_fea = [x for x in list(train) if 'cat' in x]
bin_fea = [x for x in list(train) if 'bin' in x]

# 결측값 (-1)의 개수로 missing 파생 변수를 생성한다
train['missing'] = (train==-1).sum(axis=1).astype(float)
test['missing'] = (test==-1).sum(axis=1).astype(float)


In [0]:
# 6개 변수에 대하여 상호작용 변수를 생성한다
for e, (x, y) in enumerate(combinations(['ps_car_13', 'ps_ind_03', 'ps_reg_03', 'ps_ind_15', 'ps_reg_01', 'ps_ind_01'], 2)):
    train, test = interaction_features(train, test, x, y, e)

In [0]:
# 수치형 변수, 상호 작용 파생 변수, ind 변수 이름을 추출한다
num_features = [c for c in list(train) if ('cat' not in c and 'calc' not in c)]
num_features.append('missing')
inter_fea = [x for x in list(train) if 'inter' in x]
feature_names = list(train)
ind_features = [c for c in feature_names if 'ind' in c]

In [0]:
# ind 변수 그룹의 조합을 하나의 문자열 변수로 표현한다
count = 0
for c in ind_features:
    if count == 0:
        train['new_ind'] = train[c].astype(str)
        count += 1
    else:
        train['new_ind'] += '_' + train[c].astype(str)
ind_features = [c for c in feature_names if 'ind' in c]
count = 0
for c in ind_features:
    if count == 0:
        test['new_ind'] = test[c].astype(str)
        count += 1
    else:
        test['new_ind'] += '_' + test[c].astype(str)

# reg 변수 그룹의 조합을 하나의 문자열 변수로 표현한다
reg_features = [c for c in feature_names if 'reg' in c]
count = 0
for c in reg_features:
    if count == 0:
        train['new_reg'] = train[c].astype(str)
        count += 1
    else:
        train['new_reg'] += '_' + train[c].astype(str)
reg_features = [c for c in feature_names if 'reg' in c]
count = 0
for c in reg_features:
    if count == 0:
        test['new_reg'] = test[c].astype(str)
        count += 1
    else:
        test['new_reg'] += '_' + test[c].astype(str)

# car 변수 그룹의 조합을 하나의 문자열 변수로 표현한다
car_features = [c for c in feature_names if 'car' in c]
count = 0
for c in car_features:
    if count == 0:
        train['new_car'] = train[c].astype(str)
        count += 1
    else:
        train['new_car'] += '_' + train[c].astype(str)
car_features = [c for c in feature_names if 'car' in c]
count = 0
for c in car_features:
    if count == 0:
        test['new_car'] = test[c].astype(str)
        count += 1
    else:
        test['new_car'] += '_' + test[c].astype(str)


In [17]:
# 범주형 데이터와 수치형 데이터를 따로 관리한다
train_cat = train[cat_fea]
train_num = train[[x for x in list(train) if x in num_features]]
test_cat = test[cat_fea]
test_num = test[[x for x in list(train) if x in num_features]]

# 범주형 데이터에 LabelEncode()를 수행한다
max_cat_values = []
for c in cat_fea:
    le = LabelEncoder()
    x = le.fit_transform(pd.concat([train_cat, test_cat])[c])
    train_cat[c] = le.transform(train_cat[c])
    test_cat[c] = le.transform(test_cat[c])
    max_cat_values.append(np.max(x))

# 범주형 변수의 빈도값으로 새로운 파생 변수를 생성한다
cat_count_features = []
for c in cat_fea + ['new_ind','new_reg','new_car']:
    d = pd.concat([train[c],test[c]]).value_counts().to_dict()
    train['%s_count'%c] = train[c].apply(lambda x:d.get(x,0))
    test['%s_count'%c] = test[c].apply(lambda x:d.get(x,0))
    cat_count_features.append('%s_count'%c)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [38]:
train

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_01_cat,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03,...,inter_6*,inter_6/,inter_7*,inter_7/,inter_8*,inter_8/,inter_9*,inter_9/,inter_10*,inter_10/,inter_11*,inter_11/,inter_12*,inter_12/,inter_13*,inter_13/,inter_14*,inter_14/,new_ind,new_reg,new_car,ps_ind_02_cat_count,ps_ind_04_cat_count,ps_ind_05_cat_count,ps_car_01_cat_count,ps_car_02_cat_count,ps_car_03_cat_count,ps_car_04_cat_count,ps_car_05_cat_count,ps_car_06_cat_count,ps_car_07_cat_count,ps_car_08_cat_count,ps_car_09_cat_count,ps_car_10_cat_count,ps_car_11_cat_count,new_ind_count,new_reg_count,new_car_count,row_id,train
0,2,2,5,1,0,0,1,0,0,0,0,0,0,0,11,0,1,0,0.7,0.2,0.718070,10,1,-1,0,1,4,1,0,0,1,12,2,0.400000,0.883679,0.370810,3.605551,0.6,0.5,0.2,...,55,0.454545,3.5,7.142857,10,2.500000,7.898774,0.065279,0.502649,1.025815,1.436141,0.359035,7.7,15.714286,22,5.500000,1.4,0.350000,2_2_5_1_0_0_1_0_0_0_0_0_0_0_11_0_1_0,0.7_0.2_0.7180703307999999,10_1_-1_0_1_4_1_0_0_1_12_2_0.4_0.8836789178_0....,309747,620936,1319412,124587,1234979,1028142,1241334,431560,77845,1383070,249663,486510,1475460,18326,6,24,1,0,1
1,1,1,7,0,0,0,0,1,0,0,0,0,0,0,3,0,0,1,0.8,0.4,0.766078,11,1,-1,0,-1,11,1,1,2,1,19,3,0.316228,0.618817,0.388716,2.449490,0.3,0.1,0.3,...,21,2.333333,5.6,8.750000,7,7.000000,2.298233,0.255359,0.612862,0.957597,0.766078,0.766078,2.4,3.750000,3,3.000000,0.8,0.800000,1_1_7_0_0_0_0_1_0_0_0_0_0_0_3_0_0_1,0.8_0.4_0.7660776723,11_1_-1_0_-1_11_1_1_2_1_19_3_0.316227766_0.618...,1079327,866864,1319412,518725,1234979,1028142,1241334,666910,329890,1383070,1238365,883326,1475460,12535,36,38,11,1,1
2,5,4,9,1,0,0,0,1,0,0,0,0,0,0,12,1,0,0,0.0,0.0,-1.000000,7,1,-1,0,-1,14,1,1,2,1,60,1,0.316228,0.641586,0.347275,3.316625,0.5,0.7,0.1,...,108,0.750000,0.0,inf,45,1.800000,-12.000000,-0.083333,-0.000000,-inf,-5.000000,-0.200000,0.0,inf,60,2.400000,0.0,0.000000,5_4_9_1_0_0_0_1_0_0_0_0_0_0_12_1_0_0,0.0_0.0_-1.0,7_1_-1_0_-1_14_1_1_2_1_60_1_0.316227766_0.6415...,28259,620936,1319412,449617,1234979,1028142,1241334,666910,147714,1383070,1238365,883326,1475460,19943,24,13477,40,2,1
3,0,1,2,0,0,1,0,0,0,0,0,0,0,0,8,1,0,0,0.9,0.2,0.580948,7,1,0,0,1,11,1,1,3,1,104,1,0.374166,0.542949,0.294958,2.000000,0.6,0.9,0.1,...,16,0.250000,1.8,2.222222,0,inf,4.647580,0.072618,0.522853,0.645497,0.000000,inf,7.2,8.888889,0,inf,0.0,inf,0_1_2_0_0_1_0_0_0_0_0_0_0_0_8_1_0_0,0.9_0.2_0.5809475019,7_1_0_0_1_11_1_1_3_1_104_1_0.3741657387_0.5429...,1079327,866864,1319412,449617,1234979,183044,1241334,431560,329890,1383070,1238365,36798,1475460,212989,2784,222,1,3,1
4,0,2,0,1,0,1,0,0,0,0,0,0,0,0,9,1,0,0,0.7,0.6,0.840759,11,1,-1,0,-1,14,1,1,2,1,82,3,0.316070,0.565832,0.365103,2.000000,0.4,0.6,0.0,...,0,0.000000,0.0,0.000000,0,,7.566827,0.093418,0.588531,1.201084,0.000000,inf,6.3,12.857143,0,inf,0.0,inf,0_2_0_1_0_1_0_0_0_0_0_0_0_0_9_1_0_0,0.7_0.6_0.840758586,11_1_-1_0_-1_14_1_1_2_1_82_3_0.3160696126_0.56...,309747,620936,1319412,518725,1234979,1028142,1241334,666910,147714,1383070,1238365,883326,1475460,26161,258,34,13,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595207,3,1,10,0,0,0,0,0,1,0,0,0,0,0,13,1,0,0,0.5,0.3,0.692820,10,1,-1,0,1,1,1,1,0,1,31,3,0.374166,0.684631,0.385487,2.645751,0.4,0.5,0.3,...,130,0.769231,5.0,20.000000,30,3.333333,9.006664,0.053294,0.346410,1.385641,2.078461,0.230940,6.5,26.000000,39,4.333333,1.5,0.166667,3_1_10_0_0_0_0_0_1_0_0_0_0_0_13_1_0_0,0.5_0.3_0.692820323,10_1_-1_0_1_1_1_1_0_1_31_3_0.3741657387_0.6846...,1079327,866864,1319412,124587,1234979,1028142,1241334,431560,295574,1383070,1238365,486510,1475460,13143,117,17,3,595207,1
595208,5,1,3,0,0,0,0,0,1,0,0,0,0,0,6,1,0,0,0.9,0.7,1.382027,9,1,-1,0,-1,15,0,0,2,1,63,2,0.387298,0.972145,-1.000000,3.605551,0.2,0.2,0.0,...,18,0.500000,2.7,3.333333,15,0.600000,8.292165,0.230338,1.243825,1.535586,6.910137,0.276405,5.4,6.666667,30,1.200000,4.5,0.180000,5_1_3_0_0_0_0_0_1_0_0_0_0_0_6_1_0_0,0.9_0.7_1.3820274961,9_1_-1_0_-1_15_0_0_2_1_63_2_0.3872983346_0.972...,1079327,866864,1319412,50501,1234979,1028142,1241334,666910,54151,76138,249663,883326,1475460,2722,153,96,1,595208,1
595209,1,1,10,0,0,1,0,0,0,0,0,0,0,0,12,1,0,0,0.9,0.2,0.659071,7,1,-1,0,-1,1,1,1,2,1,31,3,0.397492,0.596373,0.398748,1.732051,0.4,0.0,0.3,...,120,0.833333,9.0,11.111111,10,10.000000,7.908856,0.054923,0.593164,0.732301,0.659071,0.659071,10.8,13.333333,12,12.000000,0.9,0.900000,1_1_10_0_0_1_0_0_0_0_0_0_0_0_12_1_0_0,0.9_0.2_0.6590713163,7_1_-1_0_-1_1_1_1_2_1_31_3_0.3974921383_0.5963...,1079327,866864,1319412,449617,1234979,1028142,1241334,666910,295574,1383070,1238365,883326,1475460,13143,382,223,3,595209,1
595210,5,2,3,1,0,0,0,1,0,0,0,0,0,0,12,1,0,0,0.9,0.4,0.698212,11,1,-1,0,-1,11,1,1,2,1,101,3,0.374166,0.764434,0.384968,3.162278,0.0,0.7,0.0,...,36,0.250000,2.7,3.333333,15,0.600000,8.378544,0.058184,0.628391,0.775791,3.491060,0.139642,10.8,13.333333,60,2.400000,4.5,0.180000,5_2_3_1_0_0_0_1_0_0_0_0_0_0_12_1_0_0,0.9_0.4_0.6982120022,11_1_-1_0_-1_11_1_1_2_1_101_3_0.3741657387_0.7...,309747,620936,1319412,518725,1234979,1028142,1241334,666910,329890,1383070,1238365,883326,1475460,18416,65,119,24,595210,1


In [44]:
(train['train']==1).sum()

595212

In [0]:
# XGBoost 기반 변수를 읽어온다
train_fea0, test_fea0 = pickle.load(open("../input/fea0.pk", "rb"))

In [0]:
# 수치형 변수의 결측값/이상값을 0으로 대체하고, 범주형 변수와 XGBoost 기반 변수를 통합한다
train_list = [train_num.replace([np.inf, -np.inf, np.nan], 0), train[cat_count_features], train_fea0]
test_list = [test_num.replace([np.inf, -np.inf, np.nan], 0), test[cat_count_features], test_fea0]


In [28]:
# 피봇 기반 기초 통계 파생 변수를 생성한다
for t in ['ps_car_13', 'ps_ind_03', 'ps_reg_03', 'ps_ind_15', 'ps_reg_01', 'ps_ind_01']:
    for g in ['ps_car_13', 'ps_ind_03', 'ps_reg_03', 'ps_ind_15', 'ps_reg_01', 'ps_ind_01', 'ps_ind_05_cat']:
        if t != g:
          # group_column 변수를 기반으로 target_column 값을 피봇한 후, 기초 통계 값을 파생 변수로 추가한다
            s_train, s_test = proj_num_on_cat(train, test, target_column=t, group_column=g)
            train_list.append(s_train)
            test_list.append(s_test)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [40]:
s_train.shape

(595212, 6)

In [34]:
train_list[0]

Unnamed: 0,ps_ind_01,ps_ind_03,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,missing,inter_0*,inter_0/,inter_1*,inter_1/,inter_2*,inter_2/,inter_3*,inter_3/,inter_4*,inter_4/,inter_5*,inter_5/,inter_6*,inter_6/,inter_7*,inter_7/,inter_8*,inter_8/,inter_9*,inter_9/,inter_10*,inter_10/,inter_11*,inter_11/,inter_12*,inter_12/,inter_13*,inter_13/,inter_14*,inter_14/
0,2,5,0,1,0,0,0,0,0,0,0,11,0,1,0,0.7,0.2,0.718070,2,0.400000,0.883679,0.370810,3.605551,1.0,4.418395,0.176736,0.634544,1.230630,9.720468,0.080334,0.618575,1.262398,1.767358,0.441839,3.590352,6.963106,55,0.454545,3.5,7.142857,10,2.500000,7.898774,0.065279,0.502649,1.025815,1.436141,0.359035,7.7,15.714286,22,5.500000,1.4,0.350000
1,1,7,0,0,1,0,0,0,0,0,0,3,0,0,1,0.8,0.4,0.766078,3,0.316228,0.618817,0.388716,2.449490,2.0,4.331716,0.088402,0.474062,0.807773,1.856450,0.206272,0.495053,0.773521,0.618817,0.618817,5.362544,9.137455,21,2.333333,5.6,8.750000,7,7.000000,2.298233,0.255359,0.612862,0.957597,0.766078,0.766078,2.4,3.750000,3,3.000000,0.8,0.800000
2,5,9,0,0,1,0,0,0,0,0,0,12,1,0,0,0.0,0.0,-1.000000,1,0.316228,0.641586,0.347275,3.316625,3.0,5.774271,0.071287,-0.641586,-0.641586,7.699029,0.053465,0.000000,0.000000,3.207929,0.128317,-9.000000,-9.000000,108,0.750000,0.0,0.000000,45,1.800000,-12.000000,-0.083333,-0.000000,0.000000,-5.000000,-0.200000,0.0,0.000000,60,2.400000,0.0,0.000000
3,0,2,1,0,0,0,0,0,0,0,0,8,1,0,0,0.9,0.2,0.580948,1,0.374166,0.542949,0.294958,2.000000,0.0,1.085898,0.271474,0.315425,0.934592,4.343590,0.067869,0.488654,0.603276,0.000000,0.000000,1.161895,3.442652,16,0.250000,1.8,2.222222,0,0.000000,4.647580,0.072618,0.522853,0.645497,0.000000,0.000000,7.2,8.888889,0,0.000000,0.0,0.000000
4,0,0,1,0,0,0,0,0,0,0,0,9,1,0,0,0.7,0.6,0.840759,3,0.316070,0.565832,0.365103,2.000000,2.0,0.000000,0.000000,0.475728,0.673001,5.092484,0.062870,0.396082,0.808331,0.000000,0.000000,0.000000,0.000000,0,0.000000,0.0,0.000000,0,0.000000,7.566827,0.093418,0.588531,1.201084,0.000000,0.000000,6.3,12.857143,0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595207,3,10,0,0,0,1,0,0,0,0,0,13,1,0,0,0.5,0.3,0.692820,3,0.374166,0.684631,0.385487,2.645751,1.0,6.846306,0.068463,0.474326,0.988179,8.900197,0.052664,0.342315,1.369261,2.053892,0.228210,6.928203,14.433757,130,0.769231,5.0,20.000000,30,3.333333,9.006664,0.053294,0.346410,1.385641,2.078461,0.230940,6.5,26.000000,39,4.333333,1.5,0.166667
595208,5,3,0,0,0,1,0,0,0,0,0,6,1,0,0,0.9,0.7,1.382027,2,0.387298,0.972145,-1.000000,3.605551,3.0,2.916434,0.324048,1.343531,0.703419,5.832867,0.162024,0.874930,1.080161,4.860723,0.194429,4.146082,2.170724,18,0.500000,2.7,3.333333,15,0.600000,8.292165,0.230338,1.243825,1.535586,6.910137,0.276405,5.4,6.666667,30,1.200000,4.5,0.180000
595209,1,10,1,0,0,0,0,0,0,0,0,12,1,0,0,0.9,0.2,0.659071,3,0.397492,0.596373,0.398748,1.732051,2.0,5.963733,0.059637,0.393053,0.904869,7.156480,0.049698,0.536736,0.662637,0.596373,0.596373,6.590713,15.172865,120,0.833333,9.0,11.111111,10,10.000000,7.908856,0.054923,0.593164,0.732301,0.659071,0.659071,10.8,13.333333,12,12.000000,0.9,0.900000
595210,5,3,0,0,1,0,0,0,0,0,0,12,1,0,0,0.9,0.4,0.698212,3,0.374166,0.764434,0.384968,3.162278,2.0,2.293302,0.254811,0.533737,1.094845,9.173209,0.063703,0.687991,0.849371,3.822171,0.152887,2.094636,4.296689,36,0.250000,2.7,3.333333,15,0.600000,8.378544,0.058184,0.628391,0.775791,3.491060,0.139642,10.8,13.333333,60,2.400000,4.5,0.180000


In [35]:
train_list[1]

Unnamed: 0,ps_ind_02_cat_count,ps_ind_04_cat_count,ps_ind_05_cat_count,ps_car_01_cat_count,ps_car_02_cat_count,ps_car_03_cat_count,ps_car_04_cat_count,ps_car_05_cat_count,ps_car_06_cat_count,ps_car_07_cat_count,ps_car_08_cat_count,ps_car_09_cat_count,ps_car_10_cat_count,ps_car_11_cat_count,new_ind_count,new_reg_count,new_car_count
0,309747,620936,1319412,124587,1234979,1028142,1241334,431560,77845,1383070,249663,486510,1475460,18326,6,24,1
1,1079327,866864,1319412,518725,1234979,1028142,1241334,666910,329890,1383070,1238365,883326,1475460,12535,36,38,11
2,28259,620936,1319412,449617,1234979,1028142,1241334,666910,147714,1383070,1238365,883326,1475460,19943,24,13477,40
3,1079327,866864,1319412,449617,1234979,183044,1241334,431560,329890,1383070,1238365,36798,1475460,212989,2784,222,1
4,309747,620936,1319412,518725,1234979,1028142,1241334,666910,147714,1383070,1238365,883326,1475460,26161,258,34,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595207,1079327,866864,1319412,124587,1234979,1028142,1241334,431560,295574,1383070,1238365,486510,1475460,13143,117,17,3
595208,1079327,866864,1319412,50501,1234979,1028142,1241334,666910,54151,76138,249663,883326,1475460,2722,153,96,1
595209,1079327,866864,1319412,449617,1234979,1028142,1241334,666910,295574,1383070,1238365,883326,1475460,13143,382,223,3
595210,309747,620936,1319412,518725,1234979,1028142,1241334,666910,329890,1383070,1238365,883326,1475460,18416,65,119,24


In [37]:
print(len(train_list))

38


In [0]:
# 데이터 전체를 메모리 효율성을 위하여 희소행렬로 변환한다
X = sparse.hstack(train_list).tocsr()
X_test = sparse.hstack(test_list).tocsr()
all_data = np.vstack([X.toarray(), X_test.toarray()])

# 인공신경망 학습을 위해 모든 변수값을 -1~1로 Scaling한다
scaler = StandardScaler()
scaler.fit(all_data)
X = scaler.transform(X.toarray())
X_test = scaler.transform(X_test.toarray())

In [0]:
######################
### MODEL TRAINING ###
######################

# 2계층 인공 신경망 모델을 정의한다
def nn_model():
    inputs = []
    flatten_layers = []

    # 범주형 변수에 대한 Embedding 계층을 정의한다. 모든 범주형 변수는 해당 변수의 최대값(num_c) 크기의 벡터 임베딩을 학습한다.
    for e, c in enumerate(cat_fea):
        input_c = Input(shape=(1, ), dtype='int32')
        num_c = max_cat_values[e]
        embed_c = Embedding(
            num_c,
            6,
            input_length=1
        )(input_c)
        embed_c = Dropout(0.25)(embed_c)
        flatten_c = Flatten()(embed_c)

        inputs.append(input_c)
        flatten_layers.append(flatten_c)

    # 수치형 변수에 대한 입력 계층을 정의한다
    input_num = Input(shape=(X.shape[1],), dtype='float32')
    flatten_layers.append(input_num)
    inputs.append(input_num)

    # 범주형 변수와 수치형 변수를 통합하여 2계층 Fully Connected Layer를 정의한다
    flatten = merge(flatten_layers, mode='concat')

    # 1계층은 512 차원을 가지며, PReLU Activation 함수와 BatchNormalization, Dropout 함수를 통과한다
    fc1 = Dense(512, init='he_normal')(flatten)
    fc1 = PReLU()(fc1)
    fc1 = BatchNormalization()(fc1)
    fc1 = Dropout(0.75)(fc1)

    # 2계층은 64 차원을 가진다
    fc1 = Dense(64, init='he_normal')(fc1)
    fc1 = PReLU()(fc1)
    fc1 = BatchNormalization()(fc1)
    fc1 = Dropout(0.5)(fc1)

    outputs = Dense(1, init='he_normal', activation='sigmoid')(fc1)

    # 모델 학습을 수행하는 optimizer와 학습 기준이 되는 loss 함수를 정의한다
    model = Model(input = inputs, output = outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return (model)



In [0]:
# 5-Fold 교차 검증을 수행한다
NFOLDS = 5
kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=218)

# 모델 학습을 5번의 랜덤 시드로 수행한 후, 평균값을 최종 결과로 얻는다
num_seeds = 5
begintime = time()

# 내부 교차 검증 및 테스트 데이터에 대한 예측값을 저장하기 위한 준비를 한다
cv_train = np.zeros(len(train_label))
cv_pred = np.zeros(len(test_id))

X_cat = train_cat.as_matrix()
X_test_cat = test_cat.as_matrix()

x_test_cat = []
for i in range(X_test_cat.shape[1]):
    x_test_cat.append(X_test_cat[:, i].reshape(-1, 1))
x_test_cat.append(X_test)



In [0]:
# 랜덤 시드 개수만큼 모델 학습을 수행한다
for s in range(num_seeds):
    np.random.seed(s)
    for (inTr, inTe) in kfold.split(X, train_label):
        xtr = X[inTr]
        ytr = train_label[inTr]
        xte = X[inTe]
        yte = train_label[inTe]

        xtr_cat = X_cat[inTr]
        xte_cat = X_cat[inTe]

        # 범주형 데이터를 추출하여, 수치형 데이터와 통합한다
        xtr_cat_list, xte_cat_list = [], []
        for i in range(xtr_cat.shape[1]):
            xtr_cat_list.append(xtr_cat[:, i].reshape(-1, 1))
            xte_cat_list.append(xte_cat[:, i].reshape(-1, 1))
        xtr_cat_list.append(xtr)
        xte_cat_list.append(xte)

        # 인공 신경망 모델을 정의한다
        model = nn_model()
        # 모델을 학습한다
        model.fit(xtr_cat_list, ytr, epochs=20, batch_size=512, verbose=2, validation_data=[xte_cat_list, yte])
        
        # 예측값의 순위를 구하는 함수 get_rank()를 정의한다. Gini 평가 함수는 예측값 간의 순위를 기준으로 평가하기 때문에 최종 평가 점수에 영향을 미치지 않는다.
        def get_rank(x):
            return pd.Series(x).rank(pct=True).values
        
        # 내부 교차 검증 데이터에 대한 예측값을 저장한다
        cv_train[inTe] += get_rank(model.predict(x=xte_cat_list, batch_size=512, verbose=0)[:, 0])
        print(Gini(train_label[inTe], cv_train[inTe]))
        
        # 테스트 데이터에 대한 예측값을 저장한다
        cv_pred += get_rank(model.predict(x=x_test_cat, batch_size=512, verbose=0)[:, 0])

    print(Gini(train_label, cv_train / (1. * (s + 1))))
    print(str(datetime.timedelta(seconds=time() - begintime)))
    


In [0]:
# 테스트 데이터에 대한 최종 예측값을 파일로 저장한다
pd.DataFrame({'id': test_id, 'target': get_rank(cv_pred * 1./ (NFOLDS * num_seeds))}).to_csv('../model/keras5_pred.csv', index=False)