# DNN Based Models

#### Import modules

In [1]:
import pandas as pd
import numpy as np
import os
import random
import pickle
from tqdm import tqdm
from IPython.display import Image
import seaborn as sns
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
%matplotlib inline
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow import keras
import keras.backend as K

print(tf.__version__)

2.3.1


#### Set random seeds to make your results reproducible

In [2]:
# 매번 모델링을 할 때마다 동일한 결과를 얻으려면 아래 코드를 실행해야 함.

def reset_seeds(seed, reset_graph_with_backend=None):
    if reset_graph_with_backend is not None:
        K = reset_graph_with_backend
        K.clear_session()
        tf.compat.v1.reset_default_graph()
        print("KERAS AND TENSORFLOW GRAPHS RESET")  # optional

    np.random.seed(seed)
    random.seed(seed)
    tf.compat.v1.set_random_seed(seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = ''  # for GPU
    print("RANDOM SEEDS RESET")  # optional
   
reset_seeds(1)

RANDOM SEEDS RESET


### Load Data

In [3]:
df = pd.read_pickle(os.path.abspath("../dat")+'/feature_4.pkl')
test = pd.read_csv(os.path.abspath("../dat")+'/test.csv', encoding='cp949')
df

Unnamed: 0,month,item_cnt_month_lag_1,item_cnt_week6_lag_1,week2_avg_item_cnt_lag_2,item_cnt_month_lag_4,week2_avg_item_cnt_lag_6,week4_avg_item_cnt_lag_4,month_avg_item_cnt_lag_4,week4_avg_item_cnt_lag_3,item_cnt_month_lag_5,...,category_국내문학,week1_avg_item_cnt_lag_5,item_cnt_month_lag_6,week6_cat_avg_item_cnt_lag_1,category_자기계발,week3_cat_avg_item_cnt_lag_1,week5_cat_avg_item_cnt_lag_1,category_만화/라이트노벨,item_cnt_month,item_cnt_month.1
0,7,0.0,0.0,0.393939,0.0,0.0,0.000000,0.00000,0.0,0.0,...,0,0.0,0.0,0.0,0,0.0,0.0,0,1.0,1.0
1,7,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,...,0,0.0,0.0,0.0,0,0.0,0.0,0,7.0,7.0
2,7,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,...,0,0.0,0.0,0.0,0,0.0,0.0,0,1.0,1.0
3,7,0.0,0.0,0.000000,1.0,0.0,0.384352,2.16449,0.0,0.0,...,0,0.0,0.0,0.0,0,0.0,0.0,0,1.0,1.0
4,7,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,...,0,0.0,0.0,0.0,0,0.0,0.0,0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118342,12,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,...,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
118343,12,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,...,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
118344,12,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,...,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
118345,12,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,...,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0


### Split Data

아래와 같이 데이터를 분할:
- Test Data: 12월 데이터
- Validation Data: 11월 데이터
- Train Data: 5~10월 데이터

In [4]:
X_train = df[df.month < 12].drop(['item_cnt_month'], axis=1)
y_train = df[df.month < 12]['item_cnt_month'] # 11월까지 데이터를 하나로 묶에서 검증데이터로 만듦
X_test = df[df.month == 12].drop(['item_cnt_month'], axis=1)

In [5]:
# Numerical features 숫자형
X_train = X_train.drop(['month'], axis=1)
X_test = X_test.drop(['month'], axis=1)

In [6]:
X_train.shape, X_test.shape

((104866, 30), (13481, 30))

### Train & Evaluate the Model

Scale Data

In [7]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Random seed 변경을 통한 다수의 DNN 모델 생성

In [8]:
# 예측값을 저장할 폴더 생성
folder = '../dat/Ensemble2'
if not os.path.isdir(folder):
    os.mkdir(folder)

In [9]:
for i in tqdm(range(20)):    
    SEED = np.random.randint(1, 10000)              
    reset_seeds(SEED)
    
    # Define the NN architecture
    input = keras.Input(shape=(X_train.shape[1],))
    x = keras.layers.Dense(64, activation='elu')(input)
    x = keras.layers.Dropout(0.2)(x)
    x1 = keras.layers.Dense(64)(x)
    x = keras.layers.Add()([x1,x])
    x = keras.layers.Dense(32, activation='elu')(x)
    x = keras.layers.Dropout(0.2)(x)
    x1 = keras.layers.Dense(32)(x)
    x = keras.layers.Add()([x1,x])
    x = keras.layers.Dense(16, activation='elu')(x)
    x = keras.layers.Dropout(0.2)(x)
    x1 = keras.layers.Dense(16)(x)
    x = keras.layers.Add()([x1,x])
    output = keras.layers.Dense(1, activation='relu')(x)
    model = keras.Model(input,output)    
    
    # Choose the optimizer and the cost function
    model.compile(loss='mse', optimizer='adam', metrics=[tf.keras.metrics.RootMeanSquaredError()])
    
    # Train the model
    callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=200)]
    hist = model.fit([X_train, y_train], y_train, validation_split = 0.3,  batch_size=2048, epochs=200, 
                 callbacks=callbacks, shuffle=False, verbose=0)
    
    # Make submissions
    submission = pd.DataFrame({
        "item_id": test.item_id, 
        "item_cnt_month": model.predict(X_test).clip(0, 20).flatten()
    })
    t = pd.Timestamp.now()
    fname = f"{folder}/loop_submission_{t.month:02}{t.day:02}_{SEED:05}.csv"
    submission.to_csv(fname, index=False)    

  0%|                                                                                           | 0/20 [00:00<?, ?it/s]

RANDOM SEEDS RESET


  5%|████▏                                                                              | 1/20 [00:53<16:56, 53.52s/it]

RANDOM SEEDS RESET


 10%|████████▎                                                                          | 2/20 [01:46<15:59, 53.30s/it]

RANDOM SEEDS RESET


 15%|████████████▍                                                                      | 3/20 [02:37<14:53, 52.54s/it]

RANDOM SEEDS RESET


 20%|████████████████▌                                                                  | 4/20 [03:26<13:45, 51.61s/it]

RANDOM SEEDS RESET


 25%|████████████████████▊                                                              | 5/20 [04:15<12:42, 50.85s/it]

RANDOM SEEDS RESET


 30%|████████████████████████▉                                                          | 6/20 [05:06<11:53, 50.95s/it]

RANDOM SEEDS RESET


 35%|█████████████████████████████                                                      | 7/20 [05:57<10:59, 50.75s/it]

RANDOM SEEDS RESET


 40%|█████████████████████████████████▏                                                 | 8/20 [06:48<10:12, 51.03s/it]

RANDOM SEEDS RESET


 45%|█████████████████████████████████████▎                                             | 9/20 [07:40<09:22, 51.13s/it]

RANDOM SEEDS RESET


 50%|█████████████████████████████████████████                                         | 10/20 [08:31<08:31, 51.17s/it]

RANDOM SEEDS RESET


 55%|█████████████████████████████████████████████                                     | 11/20 [09:22<07:40, 51.16s/it]

RANDOM SEEDS RESET


 60%|█████████████████████████████████████████████████▏                                | 12/20 [10:14<06:51, 51.42s/it]

RANDOM SEEDS RESET


 65%|█████████████████████████████████████████████████████▎                            | 13/20 [11:06<06:01, 51.64s/it]

RANDOM SEEDS RESET


 70%|█████████████████████████████████████████████████████████▍                        | 14/20 [11:59<05:11, 51.96s/it]

RANDOM SEEDS RESET


 75%|█████████████████████████████████████████████████████████████▌                    | 15/20 [12:49<04:16, 51.39s/it]

RANDOM SEEDS RESET


 80%|█████████████████████████████████████████████████████████████████▌                | 16/20 [13:40<03:25, 51.26s/it]

RANDOM SEEDS RESET


 85%|█████████████████████████████████████████████████████████████████████▋            | 17/20 [14:31<02:33, 51.19s/it]

RANDOM SEEDS RESET


 90%|█████████████████████████████████████████████████████████████████████████▊        | 18/20 [15:21<01:41, 50.91s/it]

RANDOM SEEDS RESET


 95%|█████████████████████████████████████████████████████████████████████████████▉    | 19/20 [16:12<00:50, 50.93s/it]

RANDOM SEEDS RESET


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [17:03<00:00, 51.16s/it]


생성된 다수의 DNN 모형을 power mean 앙상블

In [10]:
nf = 0
for f in os.listdir(folder):
    ext = os.path.splitext(f)[-1]
    if ext == '.csv': 
        s = pd.read_csv(folder+"/"+f)
    else: 
        continue
    if len(s.columns) !=2:
        continue
    if nf == 0: 
        slist = s
    else: 
        slist = pd.merge(slist, s, on="item_id")
    nf += 1

p = 20 # 이 값에 따라 성능이 달라짐 (p=0: 기하평균, p=1: 산술평균)    
if nf >= 2:
    if p == 0: 
        pred = 1
        for j in range(nf): pred = pred * slist.iloc[:,j+1]
        pred = pred**(1/nf)
    else:
        pred = 0
        for j in range(nf): pred = pred + slist.iloc[:,j+1]**p
        pred = pred / nf
        pred = pred**(1/p)
    submission = pd.DataFrame({'item_id': slist.item_id, 'item_cnt_month': pred})
   
    submission.to_csv('submit_2.csv', index=False)

# End