<a href="https://colab.research.google.com/github/KRiver28/TIL/blob/master/4_11_naver_movie(data).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install sentencepiece

import pandas as pd
import numpy as np
import sentencepiece as spm
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import pickle
from tqdm.auto import tqdm



In [8]:
# Read the tab-separated train/test rating files and drop every row that
# contains a missing value.
DATA_PATH = '/content/drive/MyDrive/BIGDATA_STUDY/NLP/'

train_data, test_data = (
    pd.read_csv(DATA_PATH + file_name, sep='\t').dropna()
    for file_name in ('ratings_train.txt', 'ratings_test.txt')
)

train_data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [9]:
# Keep only Korean text: complete Hangul syllables (가-힣), standalone jamo
# (ㄱ-ㅎ, ㅏ-ㅣ) and whitespace. Symbols, digits, Latin letters, etc. are removed.
# The pattern is a raw string so that `\s` reaches the regex engine as written
# instead of relying on Python's deprecated pass-through of unknown string
# escapes; it is compiled once and shared by both splits.
NON_KOREAN = re.compile(r"[^가-힣ㄱ-ㅎㅏ-ㅣ\s]")

train_list = [NON_KOREAN.sub("", x) for x in train_data['document']]
test_list = [NON_KOREAN.sub("", x) for x in test_data['document']]



In [10]:
# Dump every cleaned review (train followed by test), one per line, into a
# text file that is later used to build the SentencePiece vocabulary.
data_file = "/content/drive/MyDrive/BIGDATA_STUDY/NLP//naver_data.txt"
with open(data_file, 'w', encoding='utf-8') as corpus:
    corpus.writelines(review + '\n' for review in train_list + test_list)
      

In [12]:
# Build the vocabulary with Google's SentencePiece and load the trained model.
VOCAB_SIZE = 10000

model_prefix = "/content/drive/MyDrive/BIGDATA_STUDY/NLP//naver_model"

# Trainer options are passed as keyword arguments rather than as one
# whitespace-separated flag string: this avoids embedding source indentation
# in the arguments and keeps working when a path contains spaces.
# Special-token ids: <PAD>=0, <UNK>=1, <BOS>=2, <EOS>=3.
params = dict(
    input=data_file,
    model_prefix=model_prefix,
    vocab_size=VOCAB_SIZE,
    pad_id=0, pad_piece='<PAD>',
    unk_id=1, unk_piece='<UNK>',
    bos_id=2, bos_piece='<BOS>',
    eos_id=3, eos_piece='<EOS>',
)

# Training writes `<model_prefix>.model` and `<model_prefix>.vocab`.
spm.SentencePieceTrainer.Train(**params)
sp = spm.SentencePieceProcessor()
sp.Load(model_prefix + '.model')

True

In [13]:
# Each line of the .vocab file is split on a tab into two fields; only the
# first one (the piece) is used here.
with open(model_prefix + '.vocab', encoding='utf-8') as vocab_file:
    vocab = [line.strip().split('\t') for line in vocab_file]

# The line number of a piece in the file is used as its index.
word2idx = {}
idx2word = {}
for index, (piece, _score) in enumerate(vocab):
    word2idx[piece] = index
    idx2word[index] = piece

In [14]:
# Convert every review into its list of vocabulary indices.
train_seq = list(map(sp.encode_as_ids, train_list))
test_seq = list(map(sp.encode_as_ids, test_list))

In [15]:
# Visual sanity check on the first review: cleaned text, pieces, ids,
# the ids mapped back through idx2word, and the decoded text.
sentence = train_list[0]
idx = train_seq[0]
enc = sp.encode_as_pieces(sentence)
dec = sp.decode_ids(idx)

for shown in (sentence, enc, idx, [idx2word[i] for i in idx], dec):
    print(shown)

아 더빙 진짜 짜증나네요 목소리
['▁아', '▁더빙', '▁진짜', '▁짜증나', '네요', '▁목소리']
[56, 922, 24, 1904, 58, 1474]
['▁아', '▁더빙', '▁진짜', '▁짜증나', '네요', '▁목소리']
아 더빙 진짜 짜증나네요 목소리


In [16]:
MAX_SEQ_LEN = 12  # maximum sequence length per review

# Both splits use the same settings: pad at the end and truncate at the end
# so that every row has exactly MAX_SEQ_LEN entries.
pad_options = dict(maxlen=MAX_SEQ_LEN, padding='post', truncating='post')
x_train = pad_sequences(train_seq, **pad_options)
x_test = pad_sequences(test_seq, **pad_options)

# Labels as NumPy arrays, taken from the 'label' column of each frame.
y_train, y_test = (np.array(frame['label']) for frame in (train_data, test_data))

In [17]:
# Save the prepared arrays together with the piece-to-index mapping.
payload = [x_train, x_test, y_train, y_test, word2idx]
with open('/content/drive/MyDrive/BIGDATA_STUDY/NLP//naver_sentencepiece.pkl', 'wb') as out_file:
    pickle.dump(payload, out_file, pickle.DEFAULT_PROTOCOL)

x_train[3]

array([2241,    8,  185,  203, 7330,  297, 1141,  100,  608,  249,  115,
          0], dtype=int32)