In [1]:
import numpy as np
import pandas as pd
import re


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, LSTM, Conv1D, MaxPooling1D, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from konlpy.tag import Okt
from nltk.corpus import stopwords

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [3]:
def get_text(text):
    text = re.sub(r"[^A-Za-zㄱ-ㅎㅏ-ㅣ가-힣 ]","", text)
    return text 

train['text'] = train.document.apply(lambda x : get_text(x))
test['text'] = test.document.apply(lambda x : get_text(x))

In [4]:
stopword = ['이','있','하','것','들','그','되','수','보','않','없','나','이나','을','를','은','는','가','에','에게','의','다','이다','하다']
X_trainbox = []
X_testbox = []
okt = Okt()
for sentence in train.text:
    token_sentence = okt.morphs(sentence, stem=True)
    except_stopword = [word for word in token_sentence if not word in stopword]
    X_trainbox.append(except_stopword)

for sentence in test.text:
    token_sentence = okt.morphs(sentence, stem=True)
    except_stopword = [word for word in token_sentence if not word in stopword]
    X_testbox.append(except_stopword)

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_trainbox)

words_index = tokenizer.word_index
words_count = tokenizer.word_counts

In [6]:
before_X_train = tokenizer.texts_to_sequences(X_trainbox)
before_X_train

[[254, 180, 236, 623, 936, 328, 937, 193, 152],
 [2028, 1, 2, 750, 938, 379],
 [147, 180, 1, 116, 24, 216, 1571, 15, 20],
 [3062, 397, 1269, 84, 380, 129, 194, 5, 195],
 [129, 2029, 1, 91, 1, 120, 277, 266, 936, 751],
 [22, 121, 9, 3063, 2030, 1, 171, 3064, 12, 624, 292],
 [502, 1, 42, 60, 278, 3065, 535, 101],
 [165, 3066, 2, 574, 267, 1, 360],
 [118, 939, 503, 122, 463, 1],
 [145, 268, 504, 293, 830, 1079, 145],
 [329, 2, 752, 1, 625],
 [255, 3, 4, 31, 3, 4, 1080, 3, 4],
 [3067, 153, 3068, 3069, 3070, 361, 190, 3071, 22, 77, 190],
 [44, 256, 505, 753, 362, 17],
 [1081, 2031, 9, 1, 3072, 257, 21, 7, 6],
 [940, 3073, 1082, 831, 20, 1270, 3074, 3075],
 [204, 363, 464, 3, 1083, 2032, 2033],
 [31, 52, 2034, 4, 308, 55, 754, 364, 1572, 11, 1271, 61],
 [1084, 32, 9, 137, 46, 941, 172, 130, 1085],
 [27, 2035, 4, 79, 364, 2035, 27, 4, 27, 13, 3076],
 [1272, 1, 205, 1, 755, 1],
 [108, 173, 88, 62, 536, 78, 4, 832, 1],
 [506, 191, 942, 92, 1273],
 [138, 943, 98, 1, 465, 3077, 440, 679],
 [365, 

In [7]:
threshold = 0
unique_word = []
rare_unique, rare_cnt, total_cnt = 0, 0, 0

for key, value in words_count.items():
    if value <= threshold:
        rare_unique += 1
        rare_cnt += value
        unique_word.append(key)
        total_cnt += value
    else:
        total_cnt += value


In [8]:
print('레어한 단어 수: {num}'.format(num = rare_cnt))
print('레어한 단어의 비율: {rate}%'.format(rate = round((rare_cnt/total_cnt)*100, 2)))

print('리뷰의 최대 길이 {max}'.format(max = max(len(review) for review in X_trainbox)))
max_len = max(len(review) for review in X_trainbox)
voca_size = len(words_index) - rare_cnt + 1

레어한 단어 수: 0
레어한 단어의 비율: 0.0%
리뷰의 최대 길이 20


In [9]:
tokenizer = Tokenizer(voca_size)
tokenizer.fit_on_texts(X_trainbox)

words_index = tokenizer.word_index
words_count = tokenizer.word_counts

In [10]:
X_train = tokenizer.texts_to_sequences(X_trainbox)
X_test = tokenizer.texts_to_sequences(X_testbox)

y_train = train.label

In [11]:
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

In [14]:
embedding_dim = 100
hidden_units = 128

model = Sequential()
model.add(Embedding(voca_size, embedding_dim)) # word 의 feature를 만들어줌
model.add(Dropout(0.5))
model.add(Conv1D(128, kernel_size=5, padding='valid', activation='relu', strides=1))
model.add(MaxPooling1D(pool_size=4))
model.add(Dropout(0.5))
model.add(LSTM(hidden_units, return_sequences=True))
model.add(LSTM(hidden_units))  
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         676600    
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 100)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
dropout_3 (Dropout)          (None, None, 128)         0         
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 128)         131584    
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)              

In [15]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=15)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, y_train, epochs=10, callbacks=[es, mc], batch_size=16, validation_split=0.2)

Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.81300, saving model to best_model.h5
Epoch 2/10

Epoch 00002: val_acc improved from 0.81300 to 0.82700, saving model to best_model.h5
Epoch 3/10

Epoch 00003: val_acc improved from 0.82700 to 0.84100, saving model to best_model.h5
Epoch 4/10

Epoch 00004: val_acc improved from 0.84100 to 0.84800, saving model to best_model.h5
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.84800
Epoch 6/10

Epoch 00006: val_acc improved from 0.84800 to 0.85100, saving model to best_model.h5
Epoch 7/10

Epoch 00007: val_acc improved from 0.85100 to 0.85200, saving model to best_model.h5
Epoch 8/10

Epoch 00008: val_acc improved from 0.85200 to 0.86000, saving model to best_model.h5
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.86000
Epoch 10/10

Epoch 00010: val_acc did not improve from 0.86000


In [16]:
loaded_model = load_model('best_model.h5')
pred = loaded_model.predict(X_test)

In [18]:
pred = np.around(pred).astype(int)

submission = pd.read_csv('../Data/sample_submission.csv')

In [19]:
submission['label'] = pred
submission

Unnamed: 0,id,label
0,1,0
1,2,1
2,3,0
3,4,1
4,5,1
...,...,...
4995,4996,0
4996,4997,0
4997,4998,1
4998,4999,0


In [20]:
submission.to_csv('../Data/submissionLSTM+CNN.csv', index = False)