# 데모

## 라이브러리 import 및 설정

In [5]:
# Enable the autoreload extension so edited modules are re-imported before each cell runs.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [1]:
from matplotlib import pyplot as plt
from matplotlib import rcParams
import numpy as np
from pathlib import Path
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import seaborn as sns
from tensorflow import keras
from tensorflow.keras import Input, Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import plot_model, to_categorical
import warnings

  import pandas.util.testing as tm


In [7]:
# Global plotting / display configuration for the notebook.
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
# Use fully-qualified option keys: the bare 'max_columns' pattern matches more
# than one registered option in recent pandas releases and raises OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)
# Suppress all warnings for the rest of the session.
warnings.simplefilter('ignore')

## 학습데이터 로드

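원본 학습데이터 `train.csv`를 로드한 뒤, 이 노트북에서 2차 다항 피처를 생성하여 `polyfeature.csv` 피처파일로 저장하고 사용합니다. 피처 탐색(EDA)과 `feature.csv` 생성 과정은 [03-pandas-eda.ipynb](https://github.com/kaggler-tv/dku-kaggle-class/blob/master/notebook/03-pandas-eda.ipynb)를 참고하세요.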

In [22]:
# Raw data, generated feature files and submissions all live under DATA/.
data_dir = feature_dir = sub_dir = Path('DATA')
# Separate folders for validation-set and test-set prediction files.
val_dir, tst_dir = Path('val'), Path('tst')

# Competition input files.
trn_file = data_dir.joinpath('train.csv')
tst_file = data_dir.joinpath('test.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

# Modelling constants shared by the cells below.
target_col = 'class'
n_class = 3
n_fold = 5
seed = 42

In [23]:
# Every artefact of this run is named "<algorithm>_<feature set>".
algo_name, feature_name = 'nncv', 'polyfeature'
model_name = '_'.join([algo_name, feature_name])

# Feature file, validation/test prediction files and the submission file.
feature_file = feature_dir / (feature_name + '.csv')
p_val_file = val_dir / (model_name + '.val.csv')
p_tst_file = tst_dir / (model_name + '.tst.csv')
sub_file = sub_dir / (model_name + '.csv')

## Polynomial Feature 생성

In [24]:
# Load the raw training table; the first CSV column becomes the index.
df = pd.read_csv(trn_file, index_col=0)
print(df.shape)
df.head()

(320000, 19)


Unnamed: 0_level_0,u,g,r,i,z,redshift,dered_u,dered_g,dered_r,dered_i,dered_z,nObserve,nDetect,airmass_u,airmass_g,airmass_r,airmass_i,airmass_z,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,23.263956,20.336773,19.0095,17.672439,16.939607,-8.1e-05,23.12426,20.25779,18.95512,17.63211,16.90894,18,18,1.189764,1.190681,1.188979,1.189355,1.190206,0
1,15.052147,14.061969,13.452418,13.26845,13.168941,0.004506,14.96642,14.00452,13.41139,13.23625,13.13468,1,1,1.253254,1.257836,1.248762,1.250997,1.255533,1
2,16.786385,15.825426,15.536318,15.393535,15.350032,0.000472,16.60765,15.68659,15.44004,15.32173,15.29608,2,2,1.022499,1.024105,1.020983,1.02173,1.023291,0
3,25.660638,21.188727,20.221158,19.894949,19.634649,6e-06,25.35365,20.99465,20.08727,19.79465,19.55518,4,3,1.205399,1.206058,1.204874,1.20512,1.205712,0
4,24.453429,20.69917,19.042368,18.324152,17.982649,-3.3e-05,23.7714,20.43384,18.86299,18.19028,17.87592,13,12,1.193946,1.194285,1.193738,1.193826,1.194099,0


In [25]:
# Standardise the raw columns, then expand them to all degree-2 polynomial
# terms (bias, linear, squares and pairwise products).
# NOTE: the scaler is fitted on the full training table before the CV split.
raw_features = df.drop(target_col, axis=1)
scaler = StandardScaler()
poly = PolynomialFeatures(2)
X = poly.fit_transform(scaler.fit_transform(raw_features))

# Name the generated columns from the label-free input columns only: the
# target column is not part of X, and get_feature_names_out() validates that
# the number of names equals the number of fitted input features.
if hasattr(poly, 'get_feature_names_out'):
    feature_names = poly.get_feature_names_out(raw_features.columns).tolist()
else:
    # Older scikit-learn releases only provide get_feature_names().
    feature_names = poly.get_feature_names(raw_features.columns.tolist())
feature_names

['1',
 'u',
 'g',
 'r',
 'i',
 'z',
 'redshift',
 'dered_u',
 'dered_g',
 'dered_r',
 'dered_i',
 'dered_z',
 'nObserve',
 'nDetect',
 'airmass_u',
 'airmass_g',
 'airmass_r',
 'airmass_i',
 'airmass_z',
 'u^2',
 'u g',
 'u r',
 'u i',
 'u z',
 'u redshift',
 'u dered_u',
 'u dered_g',
 'u dered_r',
 'u dered_i',
 'u dered_z',
 'u nObserve',
 'u nDetect',
 'u airmass_u',
 'u airmass_g',
 'u airmass_r',
 'u airmass_i',
 'u airmass_z',
 'g^2',
 'g r',
 'g i',
 'g z',
 'g redshift',
 'g dered_u',
 'g dered_g',
 'g dered_r',
 'g dered_i',
 'g dered_z',
 'g nObserve',
 'g nDetect',
 'g airmass_u',
 'g airmass_g',
 'g airmass_r',
 'g airmass_i',
 'g airmass_z',
 'r^2',
 'r i',
 'r z',
 'r redshift',
 'r dered_u',
 'r dered_g',
 'r dered_r',
 'r dered_i',
 'r dered_z',
 'r nObserve',
 'r nDetect',
 'r airmass_u',
 'r airmass_g',
 'r airmass_r',
 'r airmass_i',
 'r airmass_z',
 'i^2',
 'i z',
 'i redshift',
 'i dered_u',
 'i dered_g',
 'i dered_r',
 'i dered_i',
 'i dered_z',
 'i nObserve',
 '

In [26]:
# Wrap the expanded matrix in a frame aligned with the original index, append
# the label column and persist everything to the feature file.
df_poly = pd.DataFrame(data=X, columns=feature_names, index=df.index)
df_poly[target_col] = df[target_col]
df_poly.to_csv(feature_file)
# Preview last: a bare expression is only rendered when it ends the cell.
df_poly.head()

In [27]:
# Release both in-memory frames; the saved feature file is read back next.
del df_poly
del df

In [28]:
# Read the saved polynomial feature file back in, using its first column as the index.
df = pd.read_csv(feature_file, index_col=0)
# Report (rows, columns) and preview the first rows.
print(df.shape)
df.head()

(320000, 191)


Unnamed: 0_level_0,1,u,g,r,i,z,redshift,dered_u,dered_g,dered_r,...,airmass_g airmass_r,airmass_g airmass_i,airmass_g airmass_z,airmass_r^2,airmass_r airmass_i,airmass_r airmass_z,airmass_i^2,airmass_i airmass_z,airmass_z^2,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,1.753507,0.062999,0.868712,0.009479,0.001127,-0.101465,1.817122,0.114171,0.909484,...,0.014469,0.014454,0.014404,0.01457,0.014554,0.014504,0.014539,0.014489,0.014439,0
1,1.0,-2.46732,-0.138694,-2.921312,-0.074309,-0.064856,-0.093494,-2.423838,-0.237402,-2.854036,...,0.441995,0.450625,0.466759,0.411921,0.419964,0.435001,0.428164,0.443494,0.459373,1
2,1.0,-1.575931,-0.082011,-1.500057,-0.033878,-0.026689,-0.100504,-1.570623,-0.142833,-1.476829,...,1.73467,1.718216,1.684203,1.805405,1.78828,1.75288,1.771317,1.736252,1.701882,0
3,1.0,2.985389,0.090383,1.695083,0.051763,0.048288,-0.101314,2.9761,0.155599,1.678076,...,0.064843,0.064301,0.063158,0.067213,0.066652,0.065466,0.066094,0.064919,0.063765,0
4,1.0,2.36489,0.074647,0.891128,0.021878,0.019379,-0.101382,2.153547,0.124069,0.846939,...,0.024393,0.023957,0.02307,0.026305,0.025835,0.024878,0.025374,0.024434,0.023529,0


In [29]:
# Number of labelled training rows at the top of the feature file; any rows
# after them are treated as test rows.
n_trn = 320000

# Build the arrays from a label-free copy instead of dropping the column in
# place, so this cell can be re-run without failing on a missing target column.
y = df[target_col].values[:n_trn]
feature_frame = df.drop(columns=target_col)
trn = feature_frame.iloc[:n_trn].values
tst = feature_frame.iloc[n_trn:].values
# NOTE: this rebinds feature_name from the feature-set label (a str) to the
# list of feature column names; get_model() uses its length as the input width.
feature_name = feature_frame.columns.tolist()
print(y.shape, trn.shape, tst.shape)

(320000,) (320000, 190) (0, 190)


## Stratified K-Fold Cross Validation

In [30]:
# Shuffled, class-stratified folds; the fixed seed keeps the split repeatable.
cv = StratifiedKFold(
    n_splits=n_fold,
    shuffle=True,
    random_state=seed,
)

## Keras Model

### Sequential API

In [31]:
def get_model():
    """Build and compile the classifier with the Keras Sequential API.

    The network is a 64-unit ReLU hidden layer followed by a softmax layer
    with ``n_class`` outputs, compiled with categorical cross-entropy loss
    and the Adam optimizer. No input shape is declared here.

    Note: a later cell defines ``get_model`` again with the Functional API;
    once that cell has run, that definition replaces this one.

    Returns:
        The compiled ``Sequential`` model.
    """
    stack = [
        Dense(units=64, activation='relu'),
        Dense(units=n_class, activation='softmax'),
    ]
    model = Sequential()
    for layer in stack:
        model.add(layer)
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    return model

### Functional API

In [32]:
def get_model():
    """Build and compile the classifier with the Keras Functional API.

    The input width is ``len(feature_name)`` and the output width is
    ``n_class``; both are read from module-level names when the function is
    called. The network is a 64-unit ReLU hidden layer followed by a softmax
    layer, compiled with categorical cross-entropy loss and the Adam
    optimizer.

    Returns:
        The compiled ``Model``.
    """
    n_features = len(feature_name)
    features_in = Input(shape=(n_features,))
    hidden = Dense(64, activation='relu')(features_in)
    class_probs = Dense(n_class, activation='softmax')(hidden)

    net = Model(inputs=features_in, outputs=class_probs)
    net.compile(optimizer='adam', loss='categorical_crossentropy')
    return net

## 신경망 모델 학습

In [36]:
# p_val collects out-of-fold predictions for the training rows; p_tst
# accumulates the fold-averaged predictions for the test rows.
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))

# One-hot encode the labels once instead of on every fold; num_classes pins
# the width to n_class regardless of which labels a slice contains.
y_onehot = to_categorical(y, num_classes=n_class)

for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    # Reset Keras' global state so models from earlier folds do not pile up.
    keras.backend.clear_session()
    clf = get_model()

    # Stop when val_loss has not improved by 0.001 for 5 epochs and restore
    # the best weights seen.
    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=5,
                       verbose=1, mode='min', baseline=None, restore_best_weights=True)

    # Halve the learning rate after 3 epochs without val_loss improvement.
    rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                            patience=3, min_lr=1e-6, mode='min', verbose=1)

    clf.fit(trn[i_trn],
            y_onehot[i_trn],
            validation_data=(trn[i_val], y_onehot[i_val]),
            # Demo setting; around 10 epochs is the usual choice. With only
            # 2 epochs neither callback's patience can be reached.
            epochs=2,
            callbacks=[es, rlr])
    p_val[i_val, :] = clf.predict(trn[i_val])
    # Average the test predictions over folds; skipped when no test rows
    # were loaded so predict() is never called on an empty array.
    if tst.shape[0] > 0:
        p_tst += clf.predict(tst) / n_fold

training model for CV #1
Train on 256000 samples, validate on 64000 samples
Epoch 1/2
Epoch 2/2
training model for CV #2
Train on 256000 samples, validate on 64000 samples
Epoch 1/2
Epoch 2/2
training model for CV #3
Train on 256000 samples, validate on 64000 samples
Epoch 1/2
Epoch 2/2
training model for CV #4
Train on 256000 samples, validate on 64000 samples
Epoch 1/2
Epoch 2/2
training model for CV #5
Train on 256000 samples, validate on 64000 samples
Epoch 1/2
Epoch 2/2


In [37]:
# Out-of-fold accuracy: the predicted class is the highest-probability column.
oof_pred = np.argmax(p_val, axis=1)
print(f'{accuracy_score(y, oof_pred) * 100:.4f}%')

90.7678%


In [38]:
# Show the shapes of the validation and test prediction arrays.
val_shape, tst_shape = p_val.shape, p_tst.shape
print(val_shape, tst_shape)

(320000, 3) (0, 3)


In [39]:
# np.savetxt(p_val_file, p_val, fmt='%.6f', delimiter=',')
# np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')

## 신경망 모델 시각화

In [40]:
# Print the layer-by-layer summary of clf, the model built in the last CV fold.
clf.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 190)]             0         
_________________________________________________________________
dense_14 (Dense)             (None, 64)                12224     
_________________________________________________________________
dense_15 (Dense)             (None, 3)                 195       
Total params: 12,419
Trainable params: 12,419
Non-trainable params: 0
_________________________________________________________________


## 제출 파일 생성

In [None]:
# Load the sample submission file, using its first column as the index.
sub = pd.read_csv(sample_file, index_col=0)
# Report (rows, columns) and preview the first rows.
print(sub.shape)
sub.head()

In [None]:
# Fail early with an explicit message when test predictions are missing or
# misaligned, instead of a length-mismatch error on column assignment.
if p_tst.shape[0] != sub.shape[0]:
    raise ValueError(
        f'p_tst has {p_tst.shape[0]} rows but the submission template has '
        f'{sub.shape[0]}; generate test-set predictions before building '
        'the submission.'
    )
# The predicted class is the column with the highest averaged probability.
sub[target_col] = np.argmax(p_tst, axis=1)
sub.head()

In [None]:
# Count how many rows were assigned to each predicted class.
sub[target_col].value_counts()

In [None]:
# Write the submission frame (index included) to sub_file.
sub.to_csv(sub_file)