In [2]:
from konlpy.tag import Okt
okt = Okt()   # KoNLPy Okt analyser, reused by every tokenising cell below

# Training split: one entry per data row of ratings_train.txt.
sentence = []   # token list produced by okt.morphs for the second tab-separated field
label = []      # third tab-separated field, kept as a stripped string

# The context manager closes the handle as soon as reading ends, even if
# tokenisation raises part-way through. `file` stays bound afterwards, so a
# later `file.close()` is a harmless no-op on an already-closed file.
with open('./ratings_train.txt', 'r', encoding='utf-8-sig') as file:
    for idx, line in enumerate(file):
        if idx == 0:     # the first line holds the column names, not data
            continue
        line = line.split('\t')
        sentence.append(okt.morphs(line[1].strip()))
        label.append(line[2].strip())

In [3]:
# Sanity check: show the first tokenised training review next to its label.
print(sentence[0], label[0])

# Release the handle bound to `file` (closing an already-closed file is a no-op).
file.close()

['아', '더빙', '..', '진짜', '짜증나네요', '목소리'] 0


In [5]:
# Test split: one entry per data row of ratings_test.txt.
test_sentence = []   # token list produced by okt.morphs for the second tab-separated field
test_label = []      # third tab-separated field, kept as a stripped string

# The context manager closes the handle as soon as reading ends, even if
# tokenisation raises part-way through. `file` stays bound afterwards, so a
# later `file.close()` is a harmless no-op on an already-closed file.
with open('./ratings_test.txt', 'r', encoding='utf-8-sig') as file:
    for idx, line in enumerate(file):
        if idx == 0:     # the first line holds the column names, not data
            continue
        line = line.split('\t')
        test_sentence.append(okt.morphs(line[1].strip()))
        test_label.append(line[2].strip())

In [6]:
# Sanity check: show the first tokenised test review next to its label.
print(test_sentence[0], test_label[0])

# Release the handle bound to `file` (closing an already-closed file is a no-op).
file.close()

['굳', 'ㅋ'] 1


In [7]:
# Train and test reviews are pooled so that the padding length covers both:
# a test review longer than every training review would otherwise not fit
# the model's fixed input length.
all_sentence = sentence + test_sentence

# Token count of the longest review in either split.
max_len = max(map(len, all_sentence))

In [8]:
# Every distinct token that occurs in any pooled review.
vocab = {token for tokens in all_sentence for token in tokens}

# One extra slot on top of the distinct tokens (ids are assigned from 1,
# leaving 0 unused by the vocabulary).
vocab_size = len(vocab) + 1

# Fix an ordering so that token ids are assigned deterministically.
vocab = sorted(vocab)

In [10]:
# Map each vocabulary token to its 1-based position in the sorted vocabulary
# (vocab holds distinct tokens, so ids run 1..len(vocab) without gaps).
vocab_index = {token: token_id for token_id, token in enumerate(vocab, start=1)}

In [11]:
# Encode every training review as the list of its tokens' integer ids.
X_train = [[vocab_index[token] for token in tokens] for tokens in sentence]

In [13]:
# Encode every test review as the list of its tokens' integer ids.
# NOTE: despite its name, Y_train holds the encoded *test inputs*
# (it is built from test_sentence), not training targets.
Y_train = [[vocab_index[token] for token in tokens] for tokens in test_sentence]

In [15]:
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Bring every encoded review to the same length max_len, padding at the end
# ('post'); both splits share the same settings.
pad_options = dict(padding='post', maxlen=max_len)

X_train = pad_sequences(X_train, **pad_options)
Y_train = pad_sequences(Y_train, **pad_options)

In [17]:
# Hold out 10% of the training reviews as a validation set for early stopping.
# A fixed random_state makes the split (and therefore the reported validation
# numbers) reproducible across kernel restarts.
# NOTE: X_train is rebound to the reduced training part, so this cell is meant
# to run once per session.
X_train, X_cv, train_label, cv_label = train_test_split(X_train, label, test_size = 0.1, random_state = 42)

In [25]:
from keras.utils.np_utils import to_categorical

# One-hot encode the labels of all three splits.
# NOTE: each name is rebound to its encoded form, so this cell is meant to run
# once per session (a second run would encode the already-encoded arrays).
train_label, cv_label, test_label = map(
    to_categorical,
    (train_label, cv_label, test_label),
)

In [28]:
from keras.layers import Input, Dense, Conv1D, Embedding, GlobalMaxPooling1D
from keras.models import Model
from keras.callbacks import EarlyStopping

# Text CNN: embedding -> 1-D convolution -> global max pooling -> 2-way softmax.
input_sentence = Input(shape=(max_len,))   # one padded sequence of max_len token ids

# vocab_size rows of 100-dimensional embeddings, one row per possible token id.
emd = Embedding(vocab_size, 100)(input_sentence)

# 32 filters, each looking at a window of 3 consecutive token positions.
conv1d = Conv1D(32, 3, activation='relu', strides=1)(emd)

# Keep the maximum activation of each filter over all positions.
GMP = GlobalMaxPooling1D()(conv1d)

# Two output units, one per class of the one-hot encoded labels.
output = Dense(2, activation='softmax')(GMP)

model = Model(inputs=[input_sentence], outputs=output)

# categorical_crossentropy is the loss that matches a softmax over one-hot
# labels. With exactly two classes it evaluates to the same number as the
# previously used binary_crossentropy, but it keeps the loss/metric pairing
# correct and does not silently break if more classes are ever added.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once val_loss has not improved for 2 consecutive epochs and roll the
# model back to the weights of the best epoch.
es = EarlyStopping(monitor='val_loss',mode='min', verbose=1, patience=2, restore_best_weights=True)

model.summary()

model.fit(X_train, train_label, batch_size=256, epochs=30, validation_data=(X_cv, cv_label), callbacks=[es])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 95)                0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 95, 100)           12289600  
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 93, 32)            9632      
_________________________________________________________________
global_max_pooling1d_7 (Glob (None, 32)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 2)                 66        
Total params: 12,299,298
Trainable params: 12,299,298
Non-trainable params: 0
_________________________________________________________________
Train on 135000 samples, validate on 15000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Restoring model weights from the end of the best epoch

<keras.callbacks.History at 0x2903733dcc0>

In [29]:
# Score the trained model on the held-out test split
# (Y_train holds the padded test inputs). The result is indexed as
# [loss, accuracy], matching the compile() configuration.
evaluation = model.evaluate(Y_train, test_label)

# Report accuracy first, then loss.
for caption, position in (('Accuracy: ', 1), ('Loss: ', 0)):
    print(caption, evaluation[position], sep='')

Accuracy: 0.85504
Loss: 0.3446316457033157


In [31]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# Class probabilities for every test review (Y_train holds the padded test inputs).
y_pred = model.predict([Y_train])
Y_pred = y_pred.round()   # rounded probabilities; not used further in this cell

# Collapse probabilities / one-hot rows to class indices.
y_pred = y_pred.argmax(axis=1)
y_test = np.argmax(test_label, axis=1)

# Confusion matrix is computed and kept in `t`; it is not displayed here.
t = confusion_matrix(y_test, y_pred)

# Per-class precision / recall / F1 on the test split.
report = classification_report(y_test, y_pred, target_names=["0","1"])
print(report)

              precision    recall  f1-score   support

           0       0.84      0.87      0.86     24827
           1       0.87      0.84      0.85     25173

   micro avg       0.86      0.86      0.86     50000
   macro avg       0.86      0.86      0.86     50000
weighted avg       0.86      0.86      0.86     50000

