In [1]:
from konlpy.tag import Okt
okt = Okt()

# Read the tab-separated corpus into two parallel lists:
#   sentences[i] -> raw text taken from column 1
#   label[i]     -> integer class id parsed from column 2
# Column 0 is not used here (presumably a row id -- confirm against the data).
sentences = []
label = []

# `with` guarantees the handle is closed even if parsing raises part-way through.
# 'utf-8-sig' drops a leading BOM if the file has one.
with open('filename', 'r', encoding='utf-8-sig') as file:
    for idx, line in enumerate(file):
        if idx == 0:
            # The first line is skipped (presumably the header row).
            continue
        if not line.strip():
            # Ignore blank lines instead of failing with an IndexError below.
            continue
        line = line.split('\t')
        sentences.append(line[1])
        label.append(int(line[2].strip()))

In [2]:
# Tokenise every sentence with Okt; each entry of `sentences_pos` is whatever
# `okt.morphs` returns for the sentence at the same position in `sentences`.
sentences_pos = [okt.morphs(sentence) for sentence in sentences]

In [3]:
# Token count of the longest tokenised sentence.
max_len = max(len(tokens) for tokens in sentences_pos)

# Every distinct token that occurs anywhere in the corpus.
vocab = {token for tokens in sentences_pos for token in tokens}

In [4]:
# One more than the number of distinct tokens, because token ids below start
# at 1 and id 0 is therefore never assigned to a token.
vocab_size = len(vocab) + 1

# Sort so the token -> id assignment is deterministic across runs.
vocab = sorted(vocab)

# Map each token to its 1-based position in the sorted vocabulary.
vocab_index = {token: token_id for token_id, token in enumerate(vocab, start=1)}

In [5]:
# Replace every token by its integer id, keeping the sentence structure.
int_sentences = [
    [vocab_index[token] for token in tokens]
    for tokens in sentences_pos
]

In [6]:
from keras.preprocessing.sequence import pad_sequences

padding_sentences = []

# Bring every id sequence to length `max_len`; padding='post' appends the
# filler values after the real tokens rather than before them.
int_sentences = pad_sequences(
    int_sentences,
    maxlen=max_len,
    padding='post',
)

Using TensorFlow backend.


In [7]:
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split

# One-hot encode the integer class ids.
# `label` is a plain Python list when it comes out of the loading cell and an
# array once encoded. Encoding an already one-hot array would add another
# axis, so only encode while it is still the raw list. This keeps the cell
# safe to re-run without restarting the kernel.
if isinstance(label, list):
    label = to_categorical(label)

# 60 / 20 / 20 train / validation / test split: hold out 40 % first, then cut
# the held-out part in half. Fixed random_state keeps the split reproducible.
X_train, X_etc, y_train, y_etc = train_test_split(
    int_sentences, label, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(
    X_etc, y_etc, test_size=0.5, random_state=42)

In [8]:
from keras import layers
from keras.models import Sequential
from keras.models import Model
import keras

# Multi-branch 1-D CNN text classifier:
#   token ids -> embedding -> three parallel Conv1D branches (kernel sizes
#   3, 4, 5) -> global max pooling per branch -> concatenate -> dropout ->
#   dense -> 2-way softmax.
embedding_size = 50

# One integer token id per position, `max_len` positions per sample.
input_data = layers.Input(shape=(max_len, ))

# Use the named constant instead of repeating the literal, so changing
# `embedding_size` actually changes the model.
emd = layers.Embedding(vocab_size, embedding_size)(input_data)

# Parallel convolutions over windows of 3, 4 and 5 consecutive tokens.
cnn3 = layers.Conv1D(filters=100, kernel_size=3, strides=1, padding='valid', activation='relu')(emd)
cnn4 = layers.Conv1D(filters=100, kernel_size=4, strides=1, padding='valid', activation='relu')(emd)
cnn5 = layers.Conv1D(filters=100, kernel_size=5, strides=1, padding='valid', activation='relu')(emd)

# Keep only the strongest activation of each filter along the sequence axis.
pooling3 = layers.GlobalMaxPooling1D()(cnn3)
pooling4 = layers.GlobalMaxPooling1D()(cnn4)
pooling5 = layers.GlobalMaxPooling1D()(cnn5)

# Join the three pooled feature vectors into one.
concat = layers.Concatenate(axis=-1)([pooling3, pooling4, pooling5])

drop = layers.Dropout(0.5)(concat)

dense = layers.Dense(50, activation='relu')(drop)

# Two output units with softmax, matching the one-hot encoded targets.
output = layers.Dense(2, activation='softmax')(dense)

optimizer = keras.optimizers.Adadelta()

model = Model(inputs=input_data, outputs=output)
# categorical_crossentropy is the loss that pairs with a softmax output and
# one-hot targets. For exactly two classes it yields the same value as the
# previously used binary_crossentropy, but it stays correct if the number of
# classes ever changes.
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 82)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 82, 50)       2840600     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 80, 100)      15100       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 79, 100)      20100       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_3 (

In [9]:
from keras.callbacks import EarlyStopping

# Stop once validation loss has not improved for 3 consecutive epochs and
# roll the model back to the weights of the best epoch.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
    restore_best_weights=True,
    verbose=1,
)

# Train for at most 100 epochs; early stopping normally ends the run sooner.
model.fit(
    [X_train], y_train,
    validation_data=([X_val], y_val),
    epochs=100,
    batch_size=64,
    callbacks=[es],
)

Train on 30000 samples, validate on 10000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Restoring model weights from the end of the best epoch
Epoch 00008: early stopping


<keras.callbacks.History at 0x1fc77afb4e0>

In [10]:
# Score the model on the held-out test split. `evaluation` holds the loss at
# index 0 and the compiled accuracy metric at index 1.
evaluation = model.evaluate([X_test], y_test)
test_loss = evaluation[0]
test_accuracy = evaluation[1]

print('Accuracy: ' + str(test_accuracy))
print('Loss: ' + str(test_loss))

Accuracy: 0.825
Loss: 0.40077012593746186
