In [5]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import re
import urllib.request
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense, Input, Flatten, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

In [6]:
train_data = pd.read_table('data/ratings_train.txt')
test_data = pd.read_table('data/ratings_test.txt')

In [7]:
train_data.drop_duplicates(subset=['document'], inplace=True) 

In [8]:
train_data = train_data.dropna(how = 'any')

In [9]:
train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")

In [10]:
train_data['document'].replace('', np.nan, inplace=True)

In [11]:
train_data = train_data.dropna(how = 'any')

In [12]:
test_data.drop_duplicates(subset = ['document'], inplace=True) # document 열에서 중복인 내용이 있다면 중복 제거
test_data['document'] = test_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","") # 정규 표현식 수행
test_data['document'].replace('', np.nan, inplace=True) # 공백은 Null 값으로 변경
test_data = test_data.dropna(how='any') # Null 값 제거

In [13]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

In [15]:
okt = Okt()

In [16]:
X_train = []
for sentence in train_data['document']:
    temp_X = []
    temp_X = okt.morphs(sentence, stem=True) # 토큰화
    temp_X = [word for word in temp_X if not word in stopwords] # 불용어 제거
    X_train.append(temp_X)

In [17]:
X_test = []
for sentence in test_data['document']:
    temp_X = []
    temp_X = okt.morphs(sentence, stem=True) # 토큰화
    temp_X = [word for word in temp_X if not word in stopwords] # 불용어 제거
    X_test.append(temp_X)

In [18]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [19]:
threshold = 3
total_cnt = len(tokenizer.word_index) # 단어의 수
rare_cnt = 0 # 등장 빈도수가 threshold보다 작은 단어의 개수를 카운트
total_freq = 0 # 훈련 데이터의 전체 단어 빈도수 총 합
rare_freq = 0 # 등장 빈도수가 threshold보다 작은 단어의 등장 빈도수의 총 합

# 단어와 빈도수의 쌍(pair)을 key와 value로 받는다.
for key, value in tokenizer.word_counts.items():
    total_freq = total_freq + value

    # 단어의 등장 빈도수가 threshold보다 작으면
    if(value < threshold):
        rare_cnt = rare_cnt + 1
        rare_freq = rare_freq + value

print('단어 집합(vocabulary)의 크기 :',total_cnt)
print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)


단어 집합(vocabulary)의 크기 : 43752
등장 빈도가 2번 이하인 희귀 단어의 수: 24337
단어 집합에서 희귀 단어의 비율: 55.62488571950996
전체 등장 빈도에서 희귀 단어 등장 빈도 비율: 1.8715872104872904


In [20]:
vocab_size = total_cnt - rare_cnt + 2
print('단어 집합의 크기 :',vocab_size)

단어 집합의 크기 : 19417


In [21]:
tokenizer = Tokenizer(vocab_size, oov_token = 'OOV') 
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [22]:
y_train = np.array(train_data['label'])
y_test = np.array(test_data['label'])

In [23]:
drop_train = [index for index, sentence in enumerate(X_train) if len(sentence) < 1]

In [24]:
X_train = np.delete(X_train, drop_train, axis=0)
y_train = np.delete(y_train, drop_train, axis=0)
print(len(X_train))
print(len(y_train))

145380
145380


  return array(a, dtype, copy=False, order=order)


In [25]:
max_len = 30

In [26]:
X_train = pad_sequences(X_train, maxlen = max_len)
X_test = pad_sequences(X_test, maxlen = max_len)

In [4]:
embedding_dim = 128
dropout_prob = (0.5, 0.8)
num_filters = 128

In [27]:
model_input = Input(shape = (max_len,))
z = Embedding(vocab_size, embedding_dim, input_length = max_len, name='embedding')(model_input)
z = Dropout(dropout_prob[0])(z)

In [29]:
conv_blocks = []

for sz in [3,4,5]:
    conv = Conv1D(filters = num_filters,
                kernel_size=sz,
                padding='valid',
                activation='relu',
                strides=1)(z)
    conv = GlobalMaxPooling1D()(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)

In [32]:
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
z = Dropout(dropout_prob[1])(z)
z = Dense(128, activation='relu')(z)
model_output = Dense(1, activation='sigmoid')(z)

model = Model(model_input, model_output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [38]:
es = EarlyStopping(monitor ='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('model/1d_cnn_naver_movie.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)
model.fit(X_train, y_train, batch_size=64, epochs=10, validation_data=(X_test, y_test), verbose=1, callbacks=[es,mc])

Epoch 1/10
Epoch 00001: val_acc improved from -inf to 0.83943, saving model to model/1d_cnn_naver_movie.h5
Epoch 2/10
Epoch 00002: val_acc improved from 0.83943 to 0.84313, saving model to model/1d_cnn_naver_movie.h5
Epoch 3/10
Epoch 00003: val_acc did not improve from 0.84313
Epoch 4/10
Epoch 00004: val_acc did not improve from 0.84313
Epoch 5/10
Epoch 00005: val_acc did not improve from 0.84313
Epoch 6/10
Epoch 00006: val_acc did not improve from 0.84313
Epoch 7/10
Epoch 00007: val_acc did not improve from 0.84313
Epoch 00007: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f96dcbe0eb0>

In [39]:
loaded_model = load_model('model/1d_cnn_naver_movie.h5')
print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))


 테스트 정확도: 0.8431


In [40]:
def sentiment_predict(new_sentence):
    new_sentence = okt.morphs(new_sentence, stem = True)
    new_sentence = [word for word in new_sentence if word not in stopwords]
    encoded = tokenizer.texts_to_sequences([new_sentence])
    pad_new = pad_sequences(encoded, maxlen =max_len)
    score = float(model.predict(pad_new))
    if (score > 0.5):
        print("{:.2f}% 확률로 긍정 리뷰".format(score*100))
    else:
        print("{:.2f}% 확률로 부정 리뷰".format((1-score)*100))

In [41]:
sentiment_predict("이 영화 개꿀잼")

92.35% 확률로 긍정 리뷰


In [44]:
sentiment_predict("시간 가는줄 모르고 봄")

57.04% 확률로 긍정 리뷰


In [45]:
sentiment_predict('감독 뭐하는 놈이냐?')

88.46% 확률로 부정 리뷰
