In [1]:
import numpy as np
import pandas as pd
import copy
import pickle
import joblib
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from xgboost.sklearn import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Label conversion helpers (string label <-> integer class id)
def label_to_numeric(x):
    """Translate a leak-type label string into its integer class id.

    The mapping is 'normal' -> 0, 'out' -> 1, 'in' -> 2, 'noise' -> 3,
    'other' -> 4. A value that matches none of these labels yields None.
    """
    ordered_labels = ('normal', 'out', 'in', 'noise', 'other')
    # The position in the tuple is the class id; compare in the same order
    # as the class ids so the first match wins.
    for class_id, label in enumerate(ordered_labels):
        if x == label:
            return class_id
    return None

def numeric_to_label(x):
    """Translate an integer class id back into its leak-type label string.

    The mapping is 0 -> 'normal', 1 -> 'out', 2 -> 'in', 3 -> 'noise',
    4 -> 'other'. A value equal to none of these ids yields None.
    """
    ordered_labels = ('normal', 'out', 'in', 'noise', 'other')
    # Equality (not hashing) is used on purpose so the comparison semantics
    # stay those of a plain `x == id` chain.
    for class_id, label in enumerate(ordered_labels):
        if x == class_id:
            return label
    return None

def pred_ensemble_1(pred_1, pred_2, proba_1, proba_2):
    """Combine two models' label predictions sample by sample.

    For every index i the following rules are tried in order:
      1. Both labels are equal: keep the label; the confidence is the
         largest probability found in either model's row.
      2. Only the first model predicts a leak class ('in' / 'out'): take the
         first model's label and its top probability.
      3. Only the second model predicts a leak class: take the second
         model's label and its top probability.
      4. Otherwise: the model whose top probability is strictly larger wins;
         on a tie the second model is used.

    Args:
        pred_1, pred_2: index-addressable sequences of labels.
        proba_1, proba_2: index-addressable sequences whose items are
            iterables of per-class probabilities.

    Returns:
        (labels, confidences): two lists with len(pred_1) entries each.
    """
    leak_labels = ('in', 'out')
    pred, proba = [], []

    for idx in range(len(pred_1)):
        first, second = pred_1[idx], pred_2[idx]

        # Rule 1: agreement.
        if first == second:
            pred.append(first)
            proba.append(max(max(proba_1[idx]), max(proba_2[idx])))
            continue

        first_is_leak = first in leak_labels
        second_is_leak = second in leak_labels

        # Rule 2: only the first model reports a leak.
        if first_is_leak and not second_is_leak:
            pred.append(first)
            proba.append(max(proba_1[idx]))
            continue

        # Rule 3: only the second model reports a leak.
        if second_is_leak and not first_is_leak:
            pred.append(second)
            proba.append(max(proba_2[idx]))
            continue

        # Rule 4: fall back to the more confident model (ties -> second).
        top_first, top_second = max(proba_1[idx]), max(proba_2[idx])
        if top_first > top_second:
            pred.append(first)
            proba.append(top_first)
        else:
            pred.append(second)
            proba.append(top_second)

    return pred, proba

def pred_ensemble_2(pred_1, pred_2, proba_1, proba_2):
    """Combine two label predictions using one scalar confidence per sample.

    Same decision rules as pred_ensemble_1, except that proba_1[i] and
    proba_2[i] are compared directly (no per-row max) and only the merged
    labels are returned:
      1. Equal labels: keep the label.
      2. Exactly one side predicts 'in' / 'out': that side wins.
      3. Otherwise the side with the strictly larger confidence wins;
         on a tie the second side is used.

    Returns:
        A list of len(pred_1) merged labels.
    """
    leak_labels = ('in', 'out')
    pred = []

    for idx in range(len(pred_1)):
        first, second = pred_1[idx], pred_2[idx]

        if first == second:
            pred.append(first)
            continue

        first_is_leak = first in leak_labels
        second_is_leak = second in leak_labels

        if first_is_leak and not second_is_leak:
            pred.append(first)
        elif second_is_leak and not first_is_leak:
            pred.append(second)
        else:
            # Either both or neither predict a leak: trust the higher score.
            pred.append(first if proba_1[idx] > proba_2[idx] else second)

    return pred

##############################################################################################################################
##############################################################################################################################

# 1. Data preprocessing
# Two differently normalised views of train/train.csv are prepared:
#   * column-wise min-max scaling (each feature scaled over all samples)
#   * row-wise min-max scaling (each sample scaled over its own features)
# For each view we build two SMOTE-balanced "all data" sets (seeds 100 and 10)
# plus a train / hold-out split whose training part is SMOTE-balanced too.

# (label, number of rows) moved into the hold-out set, in concatenation order.
# The hold-out set is kept as small as possible while still being usable for
# validation.
HOLDOUT_COUNTS = (('out', 100), ('in', 100), ('normal', 20), ('other', 100), ('noise', 100))


def _hold_out_split(frame, counts=HOLDOUT_COUNTS, seed=100):
    """Sample a fixed number of rows per 'leaktype' label out of `frame`.

    Returns (remaining, held_out). `frame` itself is not modified. Rows are
    drawn per label with `DataFrame.sample(random_state=seed)`, so the same
    rows are selected on every run.
    """
    parts = [frame[frame['leaktype'] == label].sample(n=n_rows, random_state=seed)
             for label, n_rows in counts]
    held_out = pd.concat(parts, axis=0)
    return frame.drop(held_out.index), held_out


def _smote(features, labels, seed):
    """Oversample with SMOTE using the given random seed."""
    # `fit_resample` replaces the deprecated alias `fit_sample`, which newer
    # imbalanced-learn releases no longer provide.
    return SMOTE(random_state=seed).fit_resample(features, labels)


# --- data loading: column-wise normalised dataset ---
data = pd.read_csv('train/train.csv')
feature_cols = data.columns[1:]   # every column except the first is a feature
label_col = data.columns[0]       # the first column holds the label

scaler = MinMaxScaler()
scaler.fit(data[feature_cols])
data[feature_cols] = scaler.transform(data[feature_cols])
# Persist the fitted scaler so inference can apply the same column scaling.
file_name = 'model/minmaxscaler.pkl'
joblib.dump(scaler, file_name)

# Train sets built from the whole dataset
X = data[feature_cols]
y = data[label_col]
X_resampled_all_1, y_resampled_all_1 = _smote(X, y, seed=100)
X_resampled_all_2, y_resampled_all_2 = _smote(X, y, seed=10)

# Hold-out ("test") set and SMOTE-balanced training set
data, data_test = _hold_out_split(data)
X_test = data_test[feature_cols]
y_test = data_test[label_col]

X_resampled, y_resampled = _smote(data[feature_cols], data[label_col], seed=100)
X_train = copy.deepcopy(X_resampled)
y_train = copy.deepcopy(y_resampled)

# --- data loading: row-wise normalised dataset ---
# Fitting the scaler on the transposed frame makes it scale each sample (row)
# independently; the result is transposed back to (samples, features).
data = pd.read_csv('train/train.csv')
scaler = MinMaxScaler()
scaler.fit(data[feature_cols].T)
data[feature_cols] = scaler.transform(data[feature_cols].T).T

# Train sets built from the whole dataset
X_tsc = data[feature_cols]
y_tsc = data[label_col]
X_resampled_tsc_all_1, y_resampled_tsc_all_1 = _smote(X_tsc, y_tsc, seed=100)
X_resampled_tsc_all_2, y_resampled_tsc_all_2 = _smote(X_tsc, y_tsc, seed=10)

# Hold-out ("test") set and SMOTE-balanced training set
data, data_test = _hold_out_split(data)
X_test_tsc = data_test[feature_cols]
y_test_tsc = data_test[label_col]

X_resampled_tsc, y_resampled_tsc = _smote(data[feature_cols], data[label_col], seed=100)
X_train_tsc = copy.deepcopy(X_resampled_tsc)
y_train_tsc = copy.deepcopy(y_resampled_tsc)

##############################################################################################################################
##############################################################################################################################

# 2. ML model (soft-voting ensemble of KNN, XGBoost and RandomForest)
KNN = KNeighborsClassifier(n_jobs=-1)
RF = RandomForestClassifier(n_jobs=-1, random_state=5023)
XGB = XGBClassifier(base_score=0.5, booster='gbtree',colsample_bylevel=1,colsample_bynode=1,colsample_bytree=1, gamma=0,
                    gpu_id=-1, importance_type='gain',interaction_constraints='',learning_rate=0.300000012,max_delta_step=0,
                    max_depth=6,min_child_weight=1,monotone_constraints='()',n_estimators=100,n_jobs=-1,num_parallel_tree=1,
                    objective='multi:softprob',random_state=5023, reg_alpha=0,reg_lambda=1, scale_pos_weight=None,
                    subsample=1, tree_method='auto',validate_parameters=1,verbosity=0)

# voting='soft' averages the members' predicted class probabilities.
votingC = VotingClassifier(estimators=[('knn', KNN), ('rf', RF), ('xgboost', XGB)], n_jobs=-1, voting='soft')
print("Voting Classifier 학습중...")
# Trained on the SMOTE-balanced, column-normalised full dataset (seed 100).
votingC.fit(X_resampled_all_1, y_resampled_all_1)

# NOTE: despite the .h5 extension this file is a pickle, not HDF5.
# The context manager guarantees the data is flushed and the handle closed
# before the file is read back for inference.
with open('model/votingC.h5', 'wb') as votingCPickle:
    pickle.dump(votingC, votingCPickle)
print("Voting Classifier 저장\n")

##############################################################################################################################
##############################################################################################################################

# 3. DL model (1D convolution)
# Input preparation for the deep-learning models


def _as_conv_input(frame):
    """Reshape the 2-D `.values` of `frame` to (rows, columns, 1)."""
    matrix = frame.values
    n_rows, n_cols = matrix.shape[0], matrix.shape[1]
    return np.reshape(matrix, (n_rows, n_cols, 1), order='C')


def _as_one_hot(labels):
    """Map 'leaktype' labels to class ids and one-hot encode them (5 classes)."""
    class_ids = pd.DataFrame(labels)['leaktype'].apply(label_to_numeric)
    return to_categorical(class_ids, num_classes = 5)


# Column normalisation, train / hold-out split
X_train_dl = _as_conv_input(X_train)
X_test_dl = _as_conv_input(X_test)
y_train_dl = _as_one_hot(y_train)
y_test_dl = _as_one_hot(y_test)

# Column normalisation, full dataset 1
X_train_dl_all_1 = _as_conv_input(X_resampled_all_1)
y_train_dl_all_1 = _as_one_hot(y_resampled_all_1)

# Column normalisation, full dataset 2
X_train_dl_all_2 = _as_conv_input(X_resampled_all_2)
y_train_dl_all_2 = _as_one_hot(y_resampled_all_2)

# Row normalisation, train / hold-out split
X_train_dl_tsc = _as_conv_input(X_train_tsc)
X_test_dl_tsc = _as_conv_input(X_test_tsc)
y_train_dl_tsc = _as_one_hot(y_train_tsc)
y_test_dl_tsc = _as_one_hot(y_test_tsc)

# Row normalisation, full dataset 1
X_train_dl_tsc_all_1 = _as_conv_input(X_resampled_tsc_all_1)
y_train_dl_tsc_all_1 = _as_one_hot(y_resampled_tsc_all_1)

# Row normalisation, full dataset 2
X_train_dl_tsc_all_2 = _as_conv_input(X_resampled_tsc_all_2)
y_train_dl_tsc_all_2 = _as_one_hot(y_resampled_tsc_all_2)

##############################################################################################################################
##############################################################################################################################

batch_size = 128
Epoch = 1000          # upper bound for the first training phase; early stopping ends it sooner
RETRAIN_EPOCHS = 100  # epochs of each retraining round on the full dataset
# Halve the learning rate after 20 epochs without val_accuracy improvement.
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=20, min_lr=0.00001)
# Stop the first training phase after 75 epochs without val_accuracy improvement.
earlystopper = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", mode='max', patience=75, verbose=1)


def build_conv1d_classifier(input_shape, n_classes=5, dropout=0.35, kernel_size=9):
    """Build and compile the Conv1D classifier shared by model_1/2/3.

    Architecture: Conv1D(64), then five [MaxPooling1D -> Dropout -> Conv1D]
    stages with 128, 256, 256, 128 and 64 filters, then
    GlobalMaxPooling1D -> Dropout -> Dense(32, relu) -> Dropout ->
    Dense(n_classes, softmax). All convolutions use causal padding, ReLU and
    stride 1. Compiled with Adam(learning_rate=0.001), categorical
    cross-entropy and the accuracy metric.

    Args:
        input_shape: (timesteps, channels) of one sample.
        n_classes: size of the softmax output layer.
        dropout: rate used by every Dropout layer.
        kernel_size: kernel width used by every Conv1D layer.

    Returns:
        The compiled tf.keras Sequential model.
    """
    layers = tf.keras.layers
    model = tf.keras.models.Sequential()
    model.add(layers.Conv1D(64, kernel_size, padding='causal', activation='relu', strides=1,
                            input_shape=input_shape))
    for filters in (128, 256, 256, 128, 64):
        model.add(layers.MaxPooling1D(padding='valid'))
        model.add(layers.Dropout(dropout))
        model.add(layers.Conv1D(filters, kernel_size, padding='causal', activation='relu', strides=1))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dropout(dropout))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dropout(dropout))
    model.add(layers.Dense(n_classes, activation='softmax'))
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='categorical_crossentropy', metrics=['accuracy'])
    return model


# model_1 (column-normalised data)
model_1 = build_conv1d_classifier(X_train_dl.shape[-2:])

model_1.fit(X_train_dl, y_train_dl, validation_data=(X_test_dl, y_test_dl), epochs=Epoch,
            batch_size=batch_size, callbacks=[earlystopper, reduce_lr])
loss_ = model_1.evaluate(X_test_dl, y_test_dl)
print("model_1 loss :", loss_)

# Retraining 1: continue on the full SMOTE-balanced dataset (seed 100)
model_1.fit(X_train_dl_all_1, y_train_dl_all_1, validation_data=(X_test_dl, y_test_dl),
            epochs=RETRAIN_EPOCHS, batch_size=batch_size)
print("model_1 재학습 1번 완료")

# Retraining 2: continue on the full SMOTE-balanced dataset (seed 10)
model_1.fit(X_train_dl_all_2, y_train_dl_all_2, validation_data=(X_test_dl, y_test_dl),
            epochs=RETRAIN_EPOCHS, batch_size=batch_size)
print("model_1 재학습 2번 완료")

model_1.save('model/model_1.h5') # model_1: 200 retraining epochs in total
print("model_1 저장\n")

# model_2 (row-normalised data)
model_2 = build_conv1d_classifier(X_train_dl_tsc.shape[-2:])

model_2.fit(X_train_dl_tsc, y_train_dl_tsc, validation_data=(X_test_dl_tsc, y_test_dl_tsc), epochs=Epoch,
            batch_size=batch_size, callbacks=[earlystopper, reduce_lr])
loss_ = model_2.evaluate(X_test_dl_tsc, y_test_dl_tsc)
print("model_2 loss :", loss_)

# Retraining 1
model_2.fit(X_train_dl_tsc_all_1, y_train_dl_tsc_all_1, validation_data=(X_test_dl_tsc, y_test_dl_tsc),
            epochs=RETRAIN_EPOCHS, batch_size=batch_size)
print("model_2 재학습 1번 완료")

# Retraining 2
model_2.fit(X_train_dl_tsc_all_2, y_train_dl_tsc_all_2, validation_data=(X_test_dl_tsc, y_test_dl_tsc),
            epochs=RETRAIN_EPOCHS, batch_size=batch_size)
print("model_2 재학습 2번 완료")

model_2.save('model/model_2.h5') # model_2: 200 retraining epochs in total
print("model_2 저장\n")

# model_3 (row-normalised data, class_weight applied in the first phase)
# Keys are the class ids produced by label_to_numeric.
d_class_weights = {0: 1, 1: 2, 2: 2, 3: 1.5, 4: 1.5}

model_3 = build_conv1d_classifier(X_train_dl_tsc.shape[-2:])

model_3.fit(X_train_dl_tsc, y_train_dl_tsc, validation_data=(X_test_dl_tsc, y_test_dl_tsc), epochs=Epoch,
            batch_size=batch_size, class_weight=d_class_weights, callbacks=[earlystopper, reduce_lr])
loss_ = model_3.evaluate(X_test_dl_tsc, y_test_dl_tsc)
print("model_3 loss :", loss_)

# Retraining 1 (no class_weight here)
model_3.fit(X_train_dl_tsc_all_1, y_train_dl_tsc_all_1, validation_data=(X_test_dl_tsc, y_test_dl_tsc),
            epochs=RETRAIN_EPOCHS, batch_size=batch_size)
print("model_3 재학습 1번 완료")

# model_3 was judged sufficiently trained after the first retraining round
# (class_weight presumably influenced results on the biased test set), so the
# second round is skipped.

model_3.save('model/model_3.h5') # model_3: 100 retraining epochs in total
print("model_3 저장\n")

##############################################################################################################################
##############################################################################################################################

# 4. Inference - predict the test set and build the submission file
# NOTE: votingC.h5 is a pickle (not HDF5). pickle.load can execute arbitrary
# code, so only load files written by this notebook.
with open('model/votingC.h5', 'rb') as votingCPickle:
    votingC = pickle.load(votingCPickle)
model_1 = tf.keras.models.load_model('model/model_1.h5')
model_2 = tf.keras.models.load_model('model/model_2.h5')
model_3 = tf.keras.models.load_model('model/model_3.h5')

# data loading
test = pd.read_csv('test/test.csv')
test_feature_cols = test.columns[1:]     # first column is the sample id
raw_features = test[test_feature_cols]   # `test` itself is left untouched

# Column-normalised test set: reuse the scaler fitted on the training data.
file_name = 'model/minmaxscaler.pkl'
scaler = joblib.load(file_name)
X_submission_1 = pd.DataFrame(scaler.transform(raw_features),
                              columns=test_feature_cols, index=test.index)

# Row-normalised test set.
# Bug fix: this is now computed from the RAW features, matching how the
# row-normalised training data was produced. Previously the column-scaled
# values had been written back into `test` first, so the row scaling was
# applied on top of the column scaling.
scaler = MinMaxScaler()
scaler.fit(raw_features.T)
X_submission_2 = pd.DataFrame(scaler.transform(raw_features.T).T,
                              columns=test_feature_cols, index=test.index)

# Prediction
# votingC (column-normalised); it is fed the DataFrame because it needs the
# column names, so it runs before the conversion to plain arrays.
y_pred_votingC = votingC.predict(X_submission_1)
y_proba_votingC = votingC.predict_proba(X_submission_1)

# Plain arrays for the DL models
X_submission_1 = X_submission_1.values
X_submission_2 = X_submission_2.values

# `Sequential.predict_classes` / `predict_proba` were removed in TF 2.6.
# For a softmax output, `predict` returns the class probabilities and the
# arg-max over the last axis is the predicted class id.

# model_1 (column-normalised)
X_submission_dl_1 = np.reshape(X_submission_1, (X_submission_1.shape[0],X_submission_1.shape[1],1), order='C')
y_proba_dl_1 = model_1.predict(X_submission_dl_1)
y_pred_dl_1 = pd.DataFrame(np.argmax(y_proba_dl_1, axis=-1))[0].apply(numeric_to_label)

# model_2 (row-normalised)
X_submission_dl_2 = np.reshape(X_submission_2, (X_submission_2.shape[0],X_submission_2.shape[1],1), order='C')
y_proba_dl_2 = model_2.predict(X_submission_dl_2)
y_pred_dl_2 = pd.DataFrame(np.argmax(y_proba_dl_2, axis=-1))[0].apply(numeric_to_label)

# model_3 (row-normalised)
X_submission_dl_3 = np.reshape(X_submission_2, (X_submission_2.shape[0],X_submission_2.shape[1],1), order='C')
y_proba_dl_3 = model_3.predict(X_submission_dl_3)
y_pred_dl_3 = pd.DataFrame(np.argmax(y_proba_dl_3, axis=-1))[0].apply(numeric_to_label)

# Two-level ensemble: (model_1 + model_2) and (model_3 + votingC) are merged
# pairwise, then the two intermediate results are merged into the final labels.
y_pred_ensemble_1, y_proba_ensemble_1 = pred_ensemble_1(y_pred_dl_1, y_pred_dl_2, y_proba_dl_1, y_proba_dl_2)
y_pred_ensemble_2, y_proba_ensemble_2 = pred_ensemble_1(y_pred_dl_3, y_pred_votingC, y_proba_dl_3, y_proba_votingC)
predictions = pred_ensemble_2(y_pred_ensemble_1, y_pred_ensemble_2, y_proba_ensemble_1, y_proba_ensemble_2)

# Write the predictions in the id order given by sample_submission.csv.
sample_submission = pd.read_csv('sample_submission.csv')
sorter = list(sample_submission['id'])
pred_df = pd.concat([test['id'], pd.DataFrame(predictions)],axis=1)
resdf = pred_df.set_index('id')
resdf.rename(columns={0:'leaktype'}, inplace=True)
result = resdf.loc[sorter].reset_index()
result.to_csv("submission/submission.csv", index = False)

2022-06-23 12:14:49.834520: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


Voting Classifier 학습중...
Voting Classifier 저장



2022-06-23 12:37:28.086562: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2022-06-23 12:37:28.087221: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-23 12:37:28.088034: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:10:00.0 name: NVIDIA GeForce RTX 2070 SUPER computeCapability: 7.5
coreClock: 1.785GHz coreCount: 40 deviceMemorySize: 7.79GiB deviceMemoryBandwidth: 417.29GiB/s
2022-06-23 12:37:28.088139: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2022-06-23 12:37:28.098617: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2022-06-23 12:37:28.098680: I tensorflow/stream_execut

Epoch 1/1000


2022-06-23 12:37:30.025798: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8
2022-06-23 12:37:30.501484: I tensorflow/stream_executor/cuda/cuda_dnn.cc:359] Loaded cuDNN version 8204
2022-06-23 12:37:30.983116: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2022-06-23 12:37:31.402755: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000


Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000
Epoch 74/1000
Epoch 75/1000
Epoch 76/1000
Epoch 77/1000
Epoch 78/1000
Epoch 79/1000
Epoch 80/1000
Epoch 81/1000
Epoch 82/1000
Epoch 83/1000
Epoch 84/1000
Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000
Epoch 89/1000
Epoch 90/1000
Epoch 91/1000
Epoch 92/1000
Epoch 93/1000
Epoch 94/1000
Epoch 95/1000
Epoch 96/1000
Epoch 97/1000
Epoch 98/1000
Epoch 99/1000
Epoch 100/1000
Epoch 101/1000
Epoch 102/1000
Epoch 103/1000
Epoch 104/1000
Epoch 105/1000
Epoch 106/1000
Epoch 107/1000
Epoch 108/1000
Epoch 109/1000
Epoch 110/1000
Epoch 111/1000
Epoch 112/1000
Epoch 113/1000
Epoch 114/1000
Epoch 115/1000
Epoch 116/1000
Epoch 117/1000
Epoch 118/1000
Epoch 119/1000
Epoch 120/1000
Epoch 121/1000
Epoch 122/1000
Epoch 123/1000
Epoch 124/1000
Epoch 125/1000
Epoch 12

Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
model_1 재학습 1번 완료
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Ep

Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
model_1 재학습 2번 완료
model_1 저장

Epoch 1/1000


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000


Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000
Epoch 74/1000
Epoch 75/1000
Epoch 76/1000
Epoch 77/1000
Epoch 78/1000
Epoch 79/1000
Epoch 80/1000
Epoch 81/1000
Epoch 82/1000
Epoch 83/1000
Epoch 84/1000
Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000
Epoch 89/1000
Epoch 90/1000
Epoch 91/1000
Epoch 92/1000
Epoch 93/1000
Epoch 94/1000
Epoch 95/1000
Epoch 96/1000
Epoch 97/1000
Epoch 98/1000
Epoch 99/1000
Epoch 100/1000
Epoch 101/1000
Epoch 102/1000
Epoch 103/1000
Epoch 104/1000
Epoch 105/1000
Epoch 106/1000
Epoch 107/1000
Epoch 108/1000
Epoch 109/1000
Epoch 110/1000
Epoch 111/1000
Epoch 112/1000
Epoch 113/1000
Epoch 114/1000
Epoch 115/1000
Epoch 116/1000
Epoch 117/1000
Epoch 118/1000
Epoch 119/1000
Epoch 120/1000
Epoch 121/1000
Epoch 122/1000
Epoch 123/1000
Epoch 124/1000
Epoch 125/1000
Epoch 126/1000
Epoch 1

Epoch 169/1000
Epoch 170/1000
Epoch 171/1000
Epoch 172/1000
Epoch 173/1000
Epoch 174/1000
Epoch 175/1000
Epoch 176/1000
Epoch 177/1000
Epoch 178/1000
Epoch 179/1000
Epoch 180/1000
Epoch 181/1000
Epoch 182/1000
Epoch 183/1000
Epoch 184/1000
Epoch 185/1000
Epoch 186/1000
Epoch 187/1000
Epoch 188/1000
Epoch 189/1000
Epoch 190/1000
Epoch 191/1000
Epoch 192/1000
Epoch 193/1000
Epoch 194/1000
Epoch 195/1000
Epoch 196/1000
Epoch 197/1000
Epoch 198/1000
Epoch 199/1000
Epoch 200/1000
Epoch 201/1000
Epoch 202/1000
Epoch 203/1000
Epoch 204/1000
Epoch 205/1000
Epoch 206/1000
Epoch 207/1000
Epoch 208/1000
Epoch 209/1000
Epoch 210/1000
Epoch 211/1000
Epoch 212/1000
Epoch 213/1000
Epoch 214/1000
Epoch 00214: early stopping
model_2 loss : [0.4729612171649933, 0.9095237851142883]
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
E

Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
model_2 재학습 1번 완료
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Ep

Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
model_2 재학습 2번 완료
model_2 저장

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1

Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000
Epoch 89/1000
Epoch 90/1000
Epoch 91/1000
Epoch 92/1000
Epoch 93/1000
Epoch 94/1000
Epoch 95/1000
Epoch 96/1000
Epoch 97/1000
Epoch 98/1000
Epoch 99/1000
Epoch 100/1000
Epoch 101/1000
Epoch 102/1000
Epoch 103/1000
Epoch 104/1000
Epoch 105/1000
Epoch 106/1000
Epoch 107/1000
Epoch 108/1000
Epoch 109/1000
Epoch 110/1000
Epoch 111/1000
Epoch 112/1000
Epoch 113/1000
Epoch 114/1000
Epoch 115/1000
Epoch 116/1000
Epoch 117/1000
Epoch 118/1000
Epoch 119/1000
Epoch 120/1000
Epoch 121/1000
Epoch 122/1000
Epoch 123/1000
Epoch 124/1000
Epoch 125/1000
Epoch 126/1000
Epoch 127/1000
Epoch 128/1000
Epoch 129/1000
Epoch 130/1000
Epoch 131/1000
Epoch 132/1000
Epoch 133/1000
Epoch 134/1000
Epoch 135/1000
Epoch 136/1000
Epoch 137/1000
Epoch 138/1000
Epoch 139/1000
Epoch 140/1000
Epoch 141/1000
Epoch 142/1000
Epoch 143/1000
Epoch 144/1000
Epoch 145/1000
Epoch 146/1000
Epoch 147/1000
Epoch 148/1000
Epoch 149/1000
Epoch 150/1000
Epoch 151/1000
Epoch 152/

Epoch 196/1000
Epoch 197/1000
Epoch 198/1000
Epoch 199/1000
Epoch 200/1000
Epoch 201/1000
Epoch 202/1000
Epoch 203/1000
Epoch 204/1000
Epoch 205/1000
Epoch 206/1000
Epoch 207/1000
Epoch 208/1000
Epoch 209/1000
Epoch 210/1000
Epoch 211/1000
Epoch 212/1000
Epoch 213/1000
Epoch 214/1000
Epoch 215/1000
Epoch 216/1000
Epoch 217/1000
Epoch 218/1000
Epoch 219/1000
Epoch 220/1000
Epoch 221/1000
Epoch 222/1000
Epoch 223/1000
Epoch 224/1000
Epoch 225/1000
Epoch 226/1000
Epoch 227/1000
Epoch 228/1000
Epoch 229/1000
Epoch 230/1000
Epoch 231/1000
Epoch 232/1000
Epoch 233/1000
Epoch 234/1000
Epoch 235/1000
Epoch 236/1000
Epoch 237/1000
Epoch 238/1000
Epoch 239/1000
Epoch 240/1000
Epoch 241/1000
Epoch 242/1000
Epoch 00242: early stopping
model_3 loss : [0.45719778537750244, 0.9023809432983398]
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/10

Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
model_3 재학습 1번 완료
model_3 저장





In [4]:
# data loading: read the leak-detection CSV and keep only the columns between
# the first 5 and the last 20.
leak_csv = '1.누수감지데이터-통합(leaks-all).csv'
test = pd.read_csv(leak_csv).pipe(lambda frame: frame[frame.columns[5:-20]])

In [5]:
# Column-normalised evaluation set: apply the MinMaxScaler stored in
# model/minmaxscaler.pkl. Note that `test` is overwritten in place.
file_name = 'model/minmaxscaler.pkl'
scaler = joblib.load(file_name)
feature_names = test.columns[1:]
test[feature_names] = scaler.transform(test[feature_names])
X_submission_1 = test[feature_names]

In [6]:
# data loading: read the CSV again so `test` holds unscaled values, keeping
# only the columns between the first 5 and the last 20.
leak_csv = '1.누수감지데이터-통합(leaks-all).csv'
test = pd.read_csv(leak_csv).pipe(lambda frame: frame[frame.columns[5:-20]])

In [7]:
# Row-normalised evaluation set: fitting on the transposed frame makes the
# scaler min-max scale every sample (row) over its own features.
row_feature_names = test.columns[1:]
scaler = MinMaxScaler()
scaler.fit(test[row_feature_names].T)
row_scaled = scaler.transform(test[row_feature_names].T).T
test[row_feature_names] = row_scaled
X_submission_2 = test[row_feature_names]

In [8]:
# Prediction
# votingC (column-normalised); it is fed the DataFrame because it needs the
# column names, so it runs before the conversion to plain arrays.
y_pred_votingC = votingC.predict(X_submission_1)
y_proba_votingC = votingC.predict_proba(X_submission_1)

# Plain arrays for the DL models
X_submission_1 = X_submission_1.values
X_submission_2 = X_submission_2.values

# `Sequential.predict_classes` / `predict_proba` were removed in TF 2.6.
# For a softmax output, `predict` returns the class probabilities and the
# arg-max over the last axis is the predicted class id.

# model_1 (column-normalised)
X_submission_dl_1 = np.reshape(X_submission_1, (X_submission_1.shape[0],X_submission_1.shape[1],1), order='C')
y_proba_dl_1 = model_1.predict(X_submission_dl_1)
y_pred_dl_1 = pd.DataFrame(np.argmax(y_proba_dl_1, axis=-1))[0].apply(numeric_to_label)

# model_2 (row-normalised)
X_submission_dl_2 = np.reshape(X_submission_2, (X_submission_2.shape[0],X_submission_2.shape[1],1), order='C')
y_proba_dl_2 = model_2.predict(X_submission_dl_2)
y_pred_dl_2 = pd.DataFrame(np.argmax(y_proba_dl_2, axis=-1))[0].apply(numeric_to_label)

# model_3 (row-normalised)
X_submission_dl_3 = np.reshape(X_submission_2, (X_submission_2.shape[0],X_submission_2.shape[1],1), order='C')
y_proba_dl_3 = model_3.predict(X_submission_dl_3)
y_pred_dl_3 = pd.DataFrame(np.argmax(y_proba_dl_3, axis=-1))[0].apply(numeric_to_label)

# Two-level ensemble: (model_1 + model_2) and (model_3 + votingC) are merged
# pairwise, then the two intermediate results are merged into the final labels.
y_pred_ensemble_1, y_proba_ensemble_1 = pred_ensemble_1(y_pred_dl_1, y_pred_dl_2, y_proba_dl_1, y_proba_dl_2)
y_pred_ensemble_2, y_proba_ensemble_2 = pred_ensemble_1(y_pred_dl_3, y_pred_votingC, y_proba_dl_3, y_proba_votingC)
predictions = pred_ensemble_2(y_pred_ensemble_1, y_pred_ensemble_2, y_proba_ensemble_1, y_proba_ensemble_2)



In [10]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, confusion_matrix, f1_score

In [11]:
# Score the ensemble against the first column of `test`, which is used here
# as the ground-truth label: accuracy (rounded to 2 decimals) and macro F1.
y_true = test[test.columns[0]]
acc_kn = round(accuracy_score(y_true, predictions), 2)
f1_kn = f1_score(y_true, predictions, average='macro')
acc_kn, f1_kn

(0.94, 0.93615042725311)