In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Load the review file: fields on each line are separated by a double tab.
# Only lines with exactly three fields are kept; the second field is the
# review text and the third is parsed as an integer score.
with open('2016_filtered_review.txt', encoding='utf-8') as f:
    docs = []
    for raw_line in f:
        fields = raw_line.strip().split('\t\t')
        if len(fields) == 3:
            docs.append((fields[1], int(fields[2])))
    # Split the (text, score) pairs into two parallel tuples.
    texts, scores = zip(*docs)

In [3]:
# Keep only clearly polarised reviews and derive a binary sentiment label.
# As implemented: scores from 4 to 9 are discarded, scores below 4 get
# label 0 (negative) and scores above 9 get label 1 (positive).
# NOTE(review): the previous comment described the ranges as "1-4 negative,
# 9-10 positive", which does not match these thresholds -- confirm the
# intended cut-offs.
filtered_texts = []
filtered_labels = []

for text, score in zip(texts, scores):
    if score < 4 or score > 9:
        filtered_texts.append(text)
        filtered_labels.append(0 if score <= 9 else 1)

In [4]:
# Tokenize each kept review on whitespace. An argument-less split() already
# ignores leading and trailing whitespace, so no separate strip() is needed.
filtered_words = [text.split() for text in filtered_texts]

In [5]:
# Sanity check: show the token lists of the first two reviews.
print(filtered_words[0:2])

[['진심', '쓰레기', '영화', '만들', '무서', '알', '쫄아', '틀었', '이건', '뭐', '웃', '거리', '없는', '쓰레기', '영화', '임'], ['역대', '좀비', '영화', '가장', '최고다', '원작', '만화', '읽어', '보려', '영화', '보고', '결정', '하려', '감독', '간츠', '실사', '했', '사람', '거르려', '그냥', '봤', '정말', '흠잡', '없는', '최고', '좀비', '영화', '잔인', '거', '싫어하지', '참고', '볼', '만하', '로미', '인물', '왜', '그런', '모르']]


In [8]:
# Flatten the per-review token lists into one list of all tokens,
# preserving review order and token order within each review.
total_words = [token for tokens in filtered_words for token in tokens]

In [9]:
from collections import Counter

# Token -> occurrence count over the whole corpus.
c = Counter()
c.update(total_words)

In [10]:
# Vocabulary size: only the max_features most frequent tokens are kept.
max_features = 10000
top_entries = c.most_common(max_features)
# most_common yields (token, count) pairs; only the token is needed.
common_words = [entry[0] for entry in top_entries]

In [11]:
# Token -> integer id, numbered from 1 in descending frequency order.
# Ids start at 1, so 0 is never assigned to a token.
words_dic = {word: rank for rank, word in enumerate(common_words, start=1)}

In [12]:
# Convert every tokenized review into its list of vocabulary ids.
# Tokens that are not in words_dic (outside the most frequent vocabulary)
# are dropped. An explicit membership test is used instead of a bare
# `except: pass`, which would also have hidden unrelated errors and
# swallowed KeyboardInterrupt during this long loop.
filtered_indexed_words = [
    [words_dic[word] for word in review if word in words_dic]
    for review in filtered_words
]

In [13]:
from tensorflow.keras.preprocessing import sequence

# Every review is forced to exactly max_len ids. The padding/truncating
# values below are the library defaults, spelled out for the reader:
# short reviews are zero-padded at the front, long ones lose their
# leading ids.
max_len = 90
X = sequence.pad_sequences(
    sequences=filtered_indexed_words,
    maxlen=max_len,
    padding='pre',
    truncating='pre',
)

In [14]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the 0/1 labels. num_classes is pinned to 2 so the result
# always has two columns, even if the filtered data happened to contain a
# single class; otherwise the width would be inferred from the largest
# label present and could fall out of step with the 2-unit output layer.
y_one_hot = to_categorical(filtered_labels, num_classes=2)

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. random_state is fixed so that
# a fresh "Restart & Run All" produces the same split (and comparable
# metrics) every time.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_one_hot, test_size=0.2, random_state=42
)

In [21]:
# Number of samples in the held-out test split.
y_test.shape[0]

109682

In [16]:
from tensorflow.keras import layers
from tensorflow.keras import models

In [17]:
# Embedding -> LSTM -> 2-way softmax classifier.
model = models.Sequential([
    # max_features + 1 rows: one per vocabulary id plus one extra for id 0.
    layers.Embedding(max_features + 1, 32),
    # LSTM parameter count: (32*32 + 32*32 + 32) * 4 = 8320
    layers.LSTM(32),
    layers.Dense(2, activation='softmax'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          320032    
_________________________________________________________________
lstm (LSTM)                  (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 2)                 66        
Total params: 328,418
Trainable params: 328,418
Non-trainable params: 0
_________________________________________________________________


In [18]:
from tensorflow.keras.optimizers import RMSprop

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by recent Keras releases.
# The network ends in a 2-unit softmax trained on one-hot labels, so
# categorical cross-entropy is the matching loss. For a two-class softmax
# it gives the same value as the previously used binary cross-entropy.
model.compile(
    optimizer=RMSprop(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [19]:
# Train for two epochs over the training split in mini-batches of 128.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=2,
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x25d22f45fd0>

In [20]:
# Score the model on the held-out split; the result is unpacked into the
# loss followed by the single compiled metric.
evaluation = model.evaluate(x=X_test, y=y_test)
test_loss, test_acc = evaluation
print('test_acc:', test_acc)

test_acc: 0.9412027597427368
