In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from keras.utils.np_utils import to_categorical

from keras.models import Model
from keras.layers import Reshape, Input
from keras.layers import Dense, Activation, Flatten, Convolution1D, Dropout,MaxPooling1D,GlobalAveragePooling1D, MaxPool1D
from keras.layers.normalization import BatchNormalization

import keras
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Read the training CSV and the question (test) CSV into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/hanwhasystemict2020fest/heartbeat.csv',
        '/kaggle/input/hanwhasystemict2020fest/heartbeat_question.csv',
    )
)

In [2]:
# Remove the 'Id' column from both the train and the test frame (in place).

for frame in (train_data, test_data):
    frame.drop(columns=['Id'], inplace=True)

In [3]:
# Convert the Category letters of the training data into integer class indices.
#
# One vectorised column assignment replaces the per-row chained indexing
# (`train_data['Category'][i] = ...`): chained assignment is slow, raises
# SettingWithCopy warnings and, with pandas Copy-on-Write enabled, writes to a
# temporary object instead of `train_data`.
CATEGORY_TO_INDEX = {'N': 0, 'S': 1, 'V': 2, 'F': 3, 'Q': 4}

# Values that are not one of the five letters are passed through unchanged, so
# re-running this cell on already converted labels is a no-op.
train_data['Category'] = train_data['Category'].map(
    lambda category: CATEGORY_TO_INDEX.get(category, category)
)

In [4]:
# Prepare the oversampling: shuffle the rows, separate features from labels and
# derive, relative to the largest class, how many times each class is repeated.

# Seeded shuffle: without a seed the row order differs on every run, which
# makes the later (seeded) train/test split non-reproducible as well.
con = train_data.sample(frac=1, axis=0, random_state=0)
val = con.values

# Pick label and features by column name rather than by position, so the result
# does not depend on 'Category' being the first column and the feature matrix
# is not forced to the dtype of the label column.
y = con['Category'].to_numpy().astype(int)
x = con.drop(columns='Category').to_numpy()

# Per-class feature blocks and their sizes for the five classes 0..4.
x_temp = [x[y == label] for label in range(5)]
count_temp = [len(x_i) for x_i in x_temp]

# floor(largest class size / class size); a class without samples gets 0
# repeats instead of triggering a division by zero.
class_sizes = np.array(count_temp)
counts = np.where(
    class_sizes > 0,
    max(count_temp) // np.maximum(class_sizes, 1),
    0,
).astype(int)

In [5]:
# Apply the oversampling: every class block is repeated `counts[label]` times
# and the repeats are stacked class by class (class 0 first).
#
# The pieces are collected first and concatenated once; growing `x_bal` with
# np.concatenate inside the loop re-copies the whole array on every repeat.

x_parts = []
y_parts = []
for label in range(5):
    # Guard against a non-positive repeat count (e.g. for an empty class).
    repeats = max(int(counts[label]), 0)
    # np.tile with reps (repeats, 1) stacks the block `repeats` times along axis 0.
    x_parts.append(np.tile(x_temp[label], (repeats, 1)))
    y_parts.append(np.full(repeats * count_temp[label], label, dtype=int))

x_bal = np.concatenate(x_parts, axis=0)
y_bal = np.concatenate(y_parts)

In [6]:
# Build the training and the hold-out (validation) set, ratio 7.5 / 2.5.
#
# The hold-out rows are taken from the ORIGINAL samples (`x`, `y`) and only the
# training part is oversampled afterwards. Splitting the already oversampled
# `x_bal` / `y_bal` places exact copies of the same row on both sides of the
# split, so the validation metrics would be measured on rows the model has
# been trained on.
# `stratify=y` keeps every class represented in both parts (it requires at
# least two samples per class).
x_fit, x_hold, y_fit, y_hold = train_test_split(
    x, y, test_size=0.25, random_state=0, stratify=y
)

# Oversample the training part only: each class is repeated
# floor(largest class size / class size) times.
fit_classes, fit_sizes = np.unique(y_fit, return_counts=True)
fit_repeats = fit_sizes.max() // fit_sizes

x_fit_bal = np.concatenate(
    [np.tile(x_fit[y_fit == cls], (rep, 1))
     for cls, rep in zip(fit_classes, fit_repeats)],
    axis=0,
)
y_fit_bal = np.concatenate(
    [np.full(rep * size, cls, dtype=int)
     for cls, size, rep in zip(fit_classes, fit_sizes, fit_repeats)]
)

# The repeats are stacked class by class; shuffle them with a fixed seed so the
# training rows are not ordered by label.
fit_order = np.random.RandomState(0).permutation(len(y_fit_bal))
train = pd.DataFrame(x_fit_bal[fit_order])
label = pd.DataFrame(y_fit_bal[fit_order])

X_train, y_train = train, label
X_test, y_test = pd.DataFrame(x_hold), pd.DataFrame(y_hold)

In [7]:
# CNN model definition

def network(X_train,y_train,X_test,y_test):
    """Build a 1-D CNN classifier, train it and return it with its best weights.

    Architecture: three Conv1D(64) + BatchNormalization + MaxPool1D stages,
    followed by Flatten, Dense(64), Dense(32) and a softmax output layer.

    Parameters
    ----------
    X_train, X_test : arrays of shape (samples, timesteps, 1)
        Training and validation inputs; the input layer is sized from
        ``X_train.shape[1]``.
    y_train, y_test : arrays of shape (samples, n_classes)
        One-hot encoded targets (the model is compiled with
        ``categorical_crossentropy``).

    Returns
    -------
    (model, history)
        The trained Keras model, reloaded with the weights of the epoch with
        the lowest validation loss, and the ``History`` object returned by
        ``model.fit``.

    Side effects
    ------------
    Writes the best weights to ``best_model.weights.h5`` in the working
    directory.
    """
    im_shape = (X_train.shape[1], 1)
    # Size the output layer from the one-hot targets instead of hard-coding it.
    n_classes = y_train.shape[1]

    inputs_cnn = Input(shape=im_shape, name='inputs_cnn')

    # `input_shape` is only meaningful for the first layer of a Sequential
    # model; here the shape is already fixed by the Input tensor.
    conv1_1 = Convolution1D(64, 6, activation='relu')(inputs_cnn)
    conv1_1 = BatchNormalization()(conv1_1)
    pool1 = MaxPool1D(pool_size=3, strides=2, padding="same")(conv1_1)

    conv2_1 = Convolution1D(64, 3, activation='relu')(pool1)
    conv2_1 = BatchNormalization()(conv2_1)
    pool2 = MaxPool1D(pool_size=2, strides=2, padding="same")(conv2_1)

    conv3_1 = Convolution1D(64, 3, activation='relu')(pool2)
    conv3_1 = BatchNormalization()(conv3_1)
    pool3 = MaxPool1D(pool_size=2, strides=2, padding="same")(conv3_1)

    flatten = Flatten()(pool3)

    dense_end1 = Dense(64, activation='relu')(flatten)
    dense_end2 = Dense(32, activation='relu')(dense_end1)
    main_output = Dense(n_classes, activation='softmax', name='main_output')(dense_end2)

    model = Model(inputs=inputs_cnn, outputs=main_output)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Only the weights are reloaded below, so only the weights are saved. The
    # '.weights.h5' suffix selects the HDF5 weight format; the previous
    # 'best_model.h6' is not a recognised extension in newer Keras versions.
    weights_path = 'best_model.weights.h5'
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=8),
        ModelCheckpoint(filepath=weights_path, monitor='val_loss',
                        save_best_only=True, save_weights_only=True),
    ]

    history = model.fit(X_train, y_train, epochs=100, callbacks=callbacks,
                        batch_size=128, validation_data=(X_test, y_test))

    # Training may continue for up to `patience` epochs past the best one;
    # restore the checkpointed best weights before returning.
    model.load_weights(weights_path)
    return (model, history)

In [8]:
# Evaluation function definition
def evaluate_model(history,X_test,y_test,model):
    """Report accuracy, plot the training curves and return the confusion matrix.

    Parameters
    ----------
    history : object with a ``history`` dict
        Must contain the keys 'accuracy', 'val_accuracy', 'loss', 'val_loss'.
    X_test : array-like
        Inputs passed to ``model.evaluate`` and ``model.predict``.
    y_test : array-like of shape (samples, n_classes)
        One-hot encoded true labels.
    model : object providing ``evaluate`` and ``predict``.

    Returns
    -------
    The confusion matrix (rows: true class, columns: predicted class) computed
    over the class indices ``0 .. n_classes - 1``.
    """
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("Accuracy: %.2f%%" % (scores[1]*100))

    # Accuracy curves.
    fig_acc, ax_acc = plt.subplots()
    ax_acc.plot(history.history['accuracy'])
    ax_acc.plot(history.history['val_accuracy'])
    ax_acc.set(xlabel='Epoch', ylabel='Accuracy', title='Model - Accuracy')
    ax_acc.legend(['Training', 'Validation'], loc='lower right')
    plt.show()

    # Loss curves. The legend is attached AFTER the lines are drawn; calling
    # it before plotting leaves it without any line to describe.
    fig_loss, ax_loss = plt.subplots()
    ax_loss.plot(history.history['loss'])
    ax_loss.plot(history.history['val_loss'])
    ax_loss.set(xlabel='Epoch', ylabel='Loss', title='Model- Loss')
    ax_loss.legend(['Training', 'Validation'], loc='upper right')
    plt.show()

    # Decode one-hot targets and predicted probabilities to class indices.
    y_onehot = np.asarray(y_test)
    y_true = np.argmax(y_onehot, axis=1)
    prediction = np.argmax(model.predict(X_test), axis=1)

    # Fix the label set so the matrix always has one row/column per class,
    # even when a class does not occur in this evaluation set.
    class_labels = list(range(y_onehot.shape[1]))
    cnf_matrix = confusion_matrix(y_true, prediction, labels=class_labels)
    return cnf_matrix

In [9]:
# One-hot encode the labels and reshape the features for the CNN.

# Fix the width to the 5 classes (0..4). Without `num_classes` the width is
# inferred from the largest label present, so a split that lacks class 4 would
# yield fewer columns than the 5-unit softmax output of the model.
y_train = to_categorical(y_train, num_classes=5)
y_test = to_categorical(y_test, num_classes=5)

# Cast explicitly to a numeric dtype: frames built from an object-dtype array
# convert to object arrays, which newer TensorFlow versions cannot turn into
# tensors.
X_train = np.asarray(X_train, dtype='float32')
X_test = np.asarray(X_test, dtype='float32')

# Add the trailing channel axis: (samples, timesteps) -> (samples, timesteps, 1).
X_train = X_train.reshape(len(X_train), X_train.shape[1], 1)
X_test = X_test.reshape(len(X_test), X_test.shape[1], 1)

In [10]:
# Check the shapes of the prepared label and feature arrays
tuple(arr.shape for arr in (y_train, y_test, X_train, X_test))

In [11]:
# Train the model; X_test / y_test serve as the validation data during fitting
model, history = network(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [12]:
# Evaluate the trained model on the hold-out data
evaluate_model(history=history, X_test=X_test, y_test=y_test, model=model)

In [13]:
# Predict the answers for the question set with the trained model.
solv = np.asarray(test_data)
# Derive the shape from the data instead of hard-coding (21892, 187, 1):
# (samples, timesteps) -> (samples, timesteps, 1).
solv = solv.reshape(len(solv), -1, 1)
proba = model.predict(solv)

# Convert the predictions back to the original answer format: take the most
# probable class per row in one vectorised call and map it to its letter.
# (Growing a Series with `Series.append` row by row is quadratic, and the
# method no longer exists in pandas 2.)
INDEX_TO_CATEGORY = {0: 'N', 1: 'S', 2: 'V', 3: 'F', 4: 'Q'}
tt = pd.DataFrame(
    {'Category': pd.Series(np.argmax(proba, axis=1)).map(INDEX_TO_CATEGORY)}
)

# Create the answer CSV. 'Id' was dropped from `test_data` earlier, so it is
# read again from the question file (only that column is loaded).
idindex = pd.read_csv(
    '/kaggle/input/hanwhasystemict2020fest/heartbeat_question.csv',
    usecols=['Id'],
)['Id']
ans = pd.DataFrame(idindex).join(tt)

ans.to_csv('mySolution.csv', index=False)