# Proj: 네이버 영화 리뷰 감성 분류에 SentencePiece 적용하기

- 네이버 영화리뷰 감정 분석 코퍼스에 SentencePiece를 적용시킨 모델 학습하기
- 학습된 모델로 sp_tokenize() 메소드 구현하기
- 구현된 토크나이저를 적용하여 네이버 영화리뷰 감정 분석 모델을 재학습하기
- KoNLPy 형태소 분석기를 사용한 모델과 성능 비교하기
- (보너스) SentencePiece 모델의 model_type, vocab_size 등을 변경해 가면서 성능 개선 여부 확인하기

In [None]:
!wget https://github.com/jungyeul/korean-parallel-corpora/raw/master/korean-english-news-v1/korean-english-park.train.tar.gz

--2022-05-18 05:20:18--  https://github.com/jungyeul/korean-parallel-corpora/raw/master/korean-english-news-v1/korean-english-park.train.tar.gz
Resolving github.com (github.com)... 13.114.40.48
Connecting to github.com (github.com)|13.114.40.48|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/jungyeul/korean-parallel-corpora/master/korean-english-news-v1/korean-english-park.train.tar.gz [following]
--2022-05-18 05:20:18--  https://raw.githubusercontent.com/jungyeul/korean-parallel-corpora/master/korean-english-news-v1/korean-english-park.train.tar.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8718893 (8.3M) [application/octet-stream]
Saving to: ‘korean-english-park.train.tar.gz’


2022-05-1

In [None]:
!tar -xzvf korean-english-park.train.tar.gz

korean-english-park.train.en
korean-english-park.train.ko


In [None]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 4.1 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [None]:
import sentencepiece as spm
import pandas as pd
import csv

In [None]:
# spm.SentencePieceTrainer.train(
#     f"--input={corpus} --model_prefix={prefix} --vocab_size={vocab_size + 7}" 
#     + " --model_type=bpe"
#     + " --max_sentence_length=999999" # 문장 최대 길이
#     + " --pad_id=0 --pad_piece=[PAD]" # pad (0)
#     + " --unk_id=1 --unk_piece=[UNK]" # unknown (1)
#     + " --bos_id=2 --bos_piece=[BOS]" # begin of sequence (2)
#     + " --eos_id=3 --eos_piece=[EOS]" # end of sequence (3)
#     + " --user_defined_symbols=[SEP],[CLS],[MASK]" # 사용자 정의 토큰
# )

In [None]:
# Train a BPE SentencePiece model on the Korean half of the news corpus.
# Later cells read the resulting 'news-v1.vocab' and 'news-v1.model' files.
corpus = "korean-english-park.train.ko"
prefix = "news-v1"
vocab_size = 8000

train_flags = [
    f"--input={corpus}",
    f"--model_prefix={prefix}",
    f"--vocab_size={vocab_size}",
    "--model_type=bpe",
    "--max_sentence_length=999999",  # maximum sentence length
]
spm.SentencePieceTrainer.train(" ".join(train_flags))

In [None]:
# Read the tab-separated vocabulary file (no header row). Quoting is disabled
# so quote characters inside pieces are kept as literal text.
vocab_list = pd.read_table('news-v1.vocab', header=None, quoting=csv.QUOTE_NONE)
vocab_list.head(10)

Unnamed: 0,0,1
0,<unk>,0
1,<s>,0
2,</s>,0
3,▁이,0
4,했다,-1
5,▁있,-2
6,에서,-3
7,▁대,-4
8,▁사,-5
9,▁지,-6


In [None]:
# Load the trained SentencePiece model; `sp` is the tokenizer that load_data()
# passes to tokenize_and_remove_stopwords().
sp = spm.SentencePieceProcessor()
sp.load('news-v1.model')

True

In [None]:
import pandas as pd
import urllib.request
import matplotlib.pyplot as plt
import re
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import Counter

In [None]:
# Download the NSMC rating files into the working directory.
NSMC_BASE_URL = "https://raw.githubusercontent.com/e9t/nsmc/master"

urllib.request.urlretrieve(f"{NSMC_BASE_URL}/ratings_train.txt", filename="ratings_train.txt")  # train
urllib.request.urlretrieve(f"{NSMC_BASE_URL}/ratings_test.txt", filename="ratings_test.txt")  # test
urllib.request.urlretrieve(f"{NSMC_BASE_URL}/ratings.txt", filename="ratings.txt")  # train + test

('ratings.txt', <http.client.HTTPMessage at 0x7f6d10183890>)

In [None]:
# Parse the tab-separated train and test rating files into DataFrames.
train_data, test_data = (
    pd.read_table(path) for path in ('ratings_train.txt', 'ratings_test.txt')
)

In [None]:
# Preview the first rows of the training data (id, document, label).
train_data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [None]:
# Show dtypes and non-null counts; `document` has a few missing values,
# which load_data() drops.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        150000 non-null  int64 
 1   document  149995 non-null  object
 2   label     150000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.4+ MB


In [None]:
# Pieces removed after tokenization. They are compared verbatim against the
# tokenizer output, so '▁좀' and '좀' count as different tokens.
# NOTE(review): entries without the '▁' prefix can only match pieces emitted
# without it — confirm this is the intended coverage.
stopwords = ['의','가','이','은','들','는','▁좀','▁잘','▁걍','과','도','를','으로','▁자','에','와','한','하다']

In [None]:
def tokenize_and_remove_stopwords(data, stopwords, tokenizer):
    """Tokenize every sentence and drop the pieces listed in ``stopwords``.

    Args:
        data: Iterable of sentences (strings) to tokenize.
        stopwords: Iterable of pieces to remove. Pieces are compared verbatim
            against the tokenizer output.
        tokenizer: Object exposing ``encode_as_pieces(sentence)`` that returns
            the pieces of one sentence.

    Returns:
        A list holding, for each input sentence in order, the list of pieces
        that are not stopwords.
    """
    # Build the lookup once: set membership is O(1), while scanning the
    # original list for every piece is O(len(stopwords)).
    stopword_set = frozenset(stopwords)
    return [
        [piece for piece in tokenizer.encode_as_pieces(sentence) if piece not in stopword_set]
        for sentence in data
    ]

In [None]:
def load_data(train_data, test_data, num_words=10000):
    """Clean, tokenize and index-encode the review DataFrames.

    Uses the module-level ``sp`` tokenizer, the ``stopwords`` list and the
    ``tokenize_and_remove_stopwords`` helper defined in earlier cells.

    Args:
        train_data: DataFrame with ``document`` and ``label`` columns.
        test_data: DataFrame with the same columns.
        num_words: Maximum vocabulary size, including the four reserved
            tokens. Only the most frequent training tokens are kept.

    Returns:
        A tuple ``(x_train, y_train, x_test, y_test, word_to_index)``:
        ``x_*`` are lists of index lists, ``y_*`` are NumPy label arrays and
        ``word_to_index`` maps each vocabulary token to its index.
    """
    # Drop duplicated documents, then rows with missing values. New frames
    # are created so the caller's DataFrames are left untouched.
    train_data = train_data.drop_duplicates(subset=['document']).dropna(how='any')
    test_data = test_data.drop_duplicates(subset=['document']).dropna(how='any')

    # Tokenize and remove stopwords.
    x_train = tokenize_and_remove_stopwords(train_data['document'], stopwords, sp)
    x_test = tokenize_and_remove_stopwords(test_data['document'], stopwords, sp)

    # Build the vocabulary from training tokens only.
    counter = Counter()
    for tokens in x_train:
        counter.update(tokens)

    reserved = ['<PAD>', '<BOS>', '<UNK>', '<UNUSED>']
    # Honor `num_words` (previously hard-coded to 10000) and never pass a
    # negative count.
    most_common = counter.most_common(max(num_words - len(reserved), 0))
    vocab = reserved + [token for token, _ in most_common]
    word_to_index = {word: index for index, word in enumerate(vocab)}
    unk_index = word_to_index['<UNK>']

    def wordlist_to_indexlist(wordlist):
        # Tokens outside the vocabulary map to <UNK>.
        return [word_to_index.get(word, unk_index) for word in wordlist]

    x_train = [wordlist_to_indexlist(tokens) for tokens in x_train]
    x_test = [wordlist_to_indexlist(tokens) for tokens in x_test]

    return x_train, np.array(list(train_data['label'])), x_test, np.array(list(test_data['label'])), word_to_index

In [None]:
# Build index-encoded train/test sequences, label arrays and the
# token-to-index vocabulary.
x_train, y_train, x_test, y_test, word_to_index = load_data(train_data, test_data)

In [None]:
# Inspect one encoded training review (a list of vocabulary indices).
print(x_train[10])

[4, 844, 31, 472, 40, 2529, 404, 107, 5, 82, 36, 404, 107, 917]


In [None]:
# Reverse lookup table: vocabulary index -> token.
index_to_word = dict(zip(word_to_index.values(), word_to_index.keys()))

In [None]:
# Dump the whole index -> token table to check the learned vocabulary.
print(index_to_word)

{0: '<PAD>', 1: '<BOS>', 2: '<UNK>', 3: '<UNUSED>', 4: '▁', 5: '.', 6: '..', 7: '▁영화', 8: '!', 9: '다', 10: '요', 11: '...', 12: ',', 13: '▁이', 14: '고', 15: '지', 16: '?', 17: '게', 18: '네', 19: '나', 20: '~', 21: '을', 22: '어', 23: '만', 24: '영화', 25: '기', 26: '서', 27: '▁그', 28: '점', 29: '거', 30: '리', 31: '인', 32: '아', 33: '▁아', 34: '로', 35: '음', 36: '짜', 37: '▁너무', 38: '라', 39: '데', 40: '니', 41: '▁재미', 42: '▁진', 43: '▁재', 44: '는데', 45: '▁정말', 46: '밌', 47: '하', 48: '작', 49: '하고', 50: '▁안', 51: '해', 52: '까', 53: '면', 54: ';', 55: '없', 56: '▁봤', 57: '건', 58: '있', 59: '보', 60: '▁스', 61: '야', 62: '▁평', 63: '수', 64: '▁어', 65: '▁감', 66: '▁보', 67: '시', 68: '▁좋', 69: '▁지', 70: '하는', 71: '▁다', 72: '것', 73: '루', 74: '히', 75: '자', 76: '▁나', 77: '봐', 78: '▁무', 79: '대', 80: '토', 81: '었', 82: '진', 83: '연', 84: '동', 85: '▁이런', 86: '구', 87: '▁연', 88: '에서', 89: '마', 90: '▁더', 91: '정', 92: '말', 93: '할', 94: '▁한', 95: '▁왜', 96: '주', 97: '미', 98: '장', 99: '영', 100: '레', 101: '스', 102: '여', 103: '때', 104: '러', 1

In [None]:
# Processor instance intended for the standalone sp_tokenize() helper below.
s = spm.SentencePieceProcessor()
s.Load('news-v1.model')

def sp_tokenize(s, corpus, vocab_path="./news-v1.vocab"):
    """Encode a corpus into padded id sequences and build vocabulary lookups.

    Args:
        s: Loaded tokenizer exposing ``EncodeAsIds(sentence)``.
        corpus: Iterable of sentences to encode.
        vocab_path: Path of the vocabulary file matching the loaded model
            (one tab-separated entry per line, piece in the first column).

    Returns:
        A tuple ``(tensor, word_index, index_word)``:
        ``tensor`` is the result of ``pad_sequences(..., padding='post')``,
        ``word_index`` maps piece -> id and ``index_word`` maps id -> piece.
        The line number in the vocabulary file is used as the id.
    """
    tensor = [s.EncodeAsIds(sen) for sen in corpus]

    word_index = {}
    index_word = {}

    # The vocabulary holds Korean text and the '▁' marker, so decode it as
    # UTF-8 explicitly instead of relying on the platform default.
    with open(vocab_path, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            word = line.split("\t")[0]
            # The two dicts were previously filled the wrong way round.
            word_index[word] = idx
            index_word[idx] = word

    # `tf` is never imported in this notebook; use the `pad_sequences`
    # imported from tensorflow.keras.preprocessing.sequence instead.
    # NOTE(review): the padding value is 0, which is '<unk>' in the
    # news-v1 vocabulary (no pad id was configured at training time).
    tensor = pad_sequences(tensor, padding='post')

    return tensor, word_index, index_word

In [None]:
# Distribution of sequence lengths over the whole dataset (train + test).
total_data_text = list(x_train) + list(x_test)

# Number of tokens in each encoded review.
num_tokens = np.array([len(tokens) for tokens in total_data_text])

# Mean, maximum and standard deviation of the sequence length.
print("문장길이 평균 :", np.mean(num_tokens))
print("문장길이 최대 :", np.max(num_tokens))
print("문장길이 표준편차 :", np.std(num_tokens))

# Use (mean + 2 * standard deviation) as the padding length.
max_tokens = np.mean(num_tokens) + 2*np.std(num_tokens)

maxlen = int(max_tokens)
print("pad_sequence maxlen :", maxlen)
# Report coverage as a real percentage. The previous '{:.2}%' formatted the
# raw fraction and printed e.g. "0.94%". Coverage is measured against the
# integer `maxlen` that is actually used for padding.
print("전체 문장의 {:.2%}가 maxlen 설정값 이내에 포함됩니다.".format(np.sum(num_tokens <= maxlen)/len(num_tokens)))


문장길이 평균 : 20.713462237443622
문장길이 최대 : 140
문장길이 표준편차 : 16.425012529667296
pad_sequence maxlen : 53
전체 문장의 0.94%가 maxlen 설정값 이내에 포함됩니다.


In [None]:
# Bring every sequence to length `maxlen`, padding at the front with the
# <PAD> index.
pad_id = word_to_index['<PAD>']
x_train, x_test = (
    pad_sequences(sequences, value=pad_id, padding='pre', maxlen=maxlen)
    for sequences in (x_train, x_test)
)

In [None]:
# Both arrays should now have `maxlen` columns.
print(x_train.shape)
print(x_test.shape)

(146182, 53)
(49157, 53)


In [None]:
# Embedding -> LSTM -> Dense binary classifier.
# `vocab_size` equals load_data()'s default `num_words`, so every index
# produced there has an embedding row.
vocab_size = 10000
word_vector_dim = 200  # usually set to a multiple of 2

model = keras.Sequential([
    keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)),
    keras.layers.LSTM(8),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 200)         2000000   
                                                                 
 lstm (LSTM)                 (None, 8)                 6688      
                                                                 
 dense (Dense)               (None, 8)                 72        
                                                                 
 dense_1 (Dense)             (None, 1)                 9         
                                                                 
Total params: 2,006,769
Trainable params: 2,006,769
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Hold out the first 50,000 training examples for validation and train on
# the remainder.
val_size = 50000

x_val, partial_x_train = x_train[:val_size], x_train[val_size:]
y_val, partial_y_train = y_train[:val_size], y_train[val_size:]

In [None]:
# Compile, train with the held-out validation data, and save the model.
epochs = 50

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
hist = model.fit(
    partial_x_train,
    partial_y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=32,
)
model.save('my_model.h5')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
 551/3006 [====>.........................] - ETA: 34s - loss: 0.0209 - accuracy: 0.9943

In [None]:
# Evaluate the trained model on the test set (loss plus the compiled
# 'accuracy' metric).
result = model.evaluate(x_test, y_test, verbose=2)

In [None]:
# Show the test-set evaluation result.
print(result)

In [None]:
# Per-epoch metrics recorded by model.fit(); print the available series names.
history_dict = hist.history
print(history_dict.keys())

In [None]:
# Pull the training and validation curves out of the fit history.
loss, val_loss = history_dict['loss'], history_dict['val_loss']
acc, val_acc = history_dict['accuracy'], history_dict['val_accuracy']

In [None]:
# Plot training loss against validation loss per epoch.
plt.plot(loss, c='b', label='Training loss')  # legend typo "Traing" fixed
plt.plot(val_loss, c='r', label='validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('loss')
plt.legend()
plt.show()

In [None]:
# Plot training accuracy against validation accuracy per epoch.
plt.plot(acc, c='b', label='Training acc')  # legend typo "Traing" fixed
plt.plot(val_acc, c='r', label='validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('acc')
plt.legend()
plt.show()