All rights reserved, 2021, By Youn-Sik Hong. 수업 목적으로만 활용 가능.

- 참고 서적
    - Python Machine Learning(3rd Ed.), Sebastian Raschka , Vahid Mirjalili, Packt, 2019.10.
        - 8장. Applying Machine Learning to Sentiment Analysis 코드 참조.
    - 텐서플로2와 머신러닝으로 시작하는 자연어 처리, 전창욱, 최태균, 조중현, 신성진 지음, 위키북스, 2020.
        - 4장. 텍스트 분류 예제 참조.

In [None]:
import re
import csv
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from konlpy.tag import Okt

from tensorflow.python.keras.preprocessing.text import Tokenizer

In [None]:
# Directory holding the raw rating files, and the training file inside it.
DATA_IN_PATH = './data_in/'
train_file = DATA_IN_PATH + 'ratings_train.txt'

# Tab-separated file with a header row; QUOTE_NONE makes the parser treat
# quote characters inside a review as ordinary text.
train_data = pd.read_csv(
    train_file,
    header=0,
    delimiter='\t',
    quoting=csv.QUOTE_NONE,
)

In [None]:
# Preview the first rows of the training DataFrame (notebook display).
train_data.head()

In [None]:
# Use a regular expression to strip every character that is NOT one of:
#   - a Hangul syllable (the 11,172 characters '가'-'힣')
#   - a Hangul consonant ('ㄱ'-'ㅎ') or vowel ('ㅏ'-'ㅣ')
#   - a whitespace character (\s)
# The first review is shown before and after cleaning.
first_review = train_data['document'][0]
print(first_review)
review_text = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]', '', first_review)
print(review_text)

In [None]:
# Show the cleaned sample review, then tokenize it with KoNLPy's Okt tagger.
print(review_text)
# `okt` is kept at module level because preprocessing() below reuses it.
okt = Okt()
# morphs() splits the text into tokens; review_text is rebound from a string
# to that token sequence. NOTE(review): stem=True presumably reduces tokens
# to their base form -- confirm against the KoNLPy documentation.
review_text = okt.morphs(review_text, stem=True)
print(review_text)

In [None]:
# Read the stop-word file, one entry per line, trimming surrounding
# whitespace (including the trailing newline) from every line.
with open('./kr_stopwords.txt', encoding='utf8') as f:
    stopwords = [line.strip() for line in f]
print(stopwords[:10])

In [None]:
# Demonstrate the two filtering steps on the sample review tokens.
print(review_text)
# Step 1: discard single-character tokens.
revised_text = [token for token in review_text if len(token) > 1]
# Step 2: discard tokens that appear in the stop-word list.
clean_review = [token for token in revised_text if token not in stopwords]
print(clean_review)

In [None]:
def preprocessing(review, remove_stopwords, stop_words):
    """Clean one review string and split it into tokens.

    Every character other than Hangul syllables, Hangul consonants/vowels
    and whitespace is removed, then the text is tokenized with the
    module-level ``okt`` tagger (``morphs`` with ``stem=True``).

    Args:
        review: Raw review text (must be a string; ``re.sub`` rejects others).
        remove_stopwords: When truthy, tokens of length 1 and tokens found in
            ``stop_words`` are dropped. When falsy, the tokenizer output is
            returned unfiltered (short tokens are kept as well).
        stop_words: Container of tokens to drop; only membership is tested.

    Returns:
        The tokens produced by ``okt.morphs``, filtered as described above.
    """
    hangul_only = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]', '', review)
    tokens = okt.morphs(hangul_only, stem=True)

    if not remove_stopwords:
        return tokens

    return [
        token for token in tokens
        if len(token) > 1 and token not in stop_words
    ]

In [None]:
# Clean every training review. Rows whose 'document' value is not a string
# (presumably NaN for missing text -- verify against the data) become an
# empty token list so that positions stay aligned with the label column.
clean_train_review = []

# Build the stop-word set once: membership tests are then O(1) per token
# instead of scanning the whole stop-word list for every token.
stopword_set = set(stopwords)

# Named `train_total` rather than `max` so the builtin max() is not shadowed.
train_total = len(train_data['document'])
for idx, review in enumerate(train_data['document']):
    # Report progress every 1500 reviews.
    if idx % 1500 == 0:
        print('진행률= %d 퍼센트' % ((idx / train_total * 100) + 1))
    if isinstance(review, str):
        clean_train_review.append(preprocessing(review, True, stopword_set))
    else:
        clean_train_review.append([])

In [None]:
# Display the token lists of the first five cleaned training reviews.
clean_train_review[:5]

In [None]:
# Number of raw training reviews. The cleaning loop appends exactly one entry
# per review, so len(clean_train_review) reports the same count.
len(train_data['document'])

In [None]:
# Load the test split with the same parser settings as the training split
# (tab-separated, header row, quote characters treated as ordinary text).
test_file = DATA_IN_PATH + 'ratings_test.txt'
test_data = pd.read_csv(test_file, header=0, delimiter='\t', quoting=csv.QUOTE_NONE)

# Clean every test review. Rows whose 'document' value is not a string
# (presumably NaN for missing text -- verify against the data) become an
# empty token list so that positions stay aligned with the label column.
clean_test_review = []

# Build the stop-word set once: membership tests are then O(1) per token
# instead of scanning the whole stop-word list for every token.
stopword_set = set(stopwords)

# Named `test_total` rather than `max` so the builtin max() is not shadowed.
test_total = len(test_data['document'])
for idx, review in enumerate(test_data['document']):
    # Report progress every 500 reviews.
    if idx % 500 == 0:
        print('진행률= %d 퍼센트' % ((idx / test_total * 100) + 1))

    if isinstance(review, str):
        clean_test_review.append(preprocessing(review, True, stopword_set))
    else:
        clean_test_review.append([])

In [None]:
# Number of cleaned test reviews (one entry per row of the test file).
len(clean_test_review)
# To inspect a small slice instead: len(clean_test_review[:4])

tensorflow를 사용해 기계학습모델에 적용하기 위해서는 단어를 그대로 사용할 수 없으며,
텍스트 데이터인 단어를 수치 데이터로 변환해야 함.
따라서, Tokenizer의 texts_to_sequences 메서드를 사용하여 전처리가 끝난
clean_train_review와 clean_test_review의 각 토큰 리스트를 index로 구성된 벡터로 변환.
단어와 index의 대응 관계는 word_vocab에 저장되어 있음.

In [None]:
# Fit the tokenizer on the training reviews only; the test reviews are
# converted with that same fitted tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train_review)

# Word-index mapping produced by the fitted tokenizer.
word_vocab = tokenizer.word_index

# Convert each token list into a list of integer indices.
train_sequence, test_sequence = (
    tokenizer.texts_to_sequences(reviews)
    for reviews in (clean_train_review, clean_test_review)
)

In [None]:
# Compare the first cleaned review (tokens) with its index-encoded form.
print(clean_train_review[0])
print(train_sequence[0])
# Uncomment to dump the whole word-index mapping (large output):
#print(word_vocab)
# Number of entries in the word-index mapping.
print(len(word_vocab))

In [None]:
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

각 벡터는 서로 길이가 다름. 이 길이를 하나로 통일해야 기계학습모델에 적용할 수 있음.
최대 길이(MAX_SEQUENCE_LENGTH=8)를 정하고, 이 길이보다 긴 벡터는 자르며,
이 길이보다 짧은 벡터는 빈 자리에 0을 추가(padding)한다.

In [None]:
# Fixed length that every index sequence is brought to by pad_sequences.
MAX_SEQUENCE_LENGTH = 8
# Backward-compatible alias: other cells still use the original, misspelled
# name, so it must keep existing with the same value.
MAX_SEQUNCE_LENGTH = MAX_SEQUENCE_LENGTH

# padding='post' puts the padding at the end of sequences shorter than the
# fixed length. NOTE(review): which end of a longer sequence is cut off
# depends on pad_sequences' `truncating` default -- confirm in the Keras docs.
train_inputs = pad_sequences(train_sequence, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
# Labels as a NumPy array, in the same row order as the inputs.
train_labels = np.array(train_data['label'])

In [None]:
# Show the first five padded training sequences.
print(train_inputs[:5])

In [None]:
# Labels of the test split as a NumPy array.
test_labels = np.array(test_data['label'])

# Pad the test sequences with the same maxlen and padding='post' settings
# that are used for the training inputs.
test_inputs = pad_sequences(
    test_sequence,
    maxlen=MAX_SEQUNCE_LENGTH,
    padding='post',
)

In [None]:
# Output directory (same value as the input directory defined earlier) and
# the file names used by the save cells below.
DATA_IN_PATH = './data_in/'
# Padded index sequences and labels for the training split.
TRAIN_INPUT_DATA = 'nsmc_train_input.npy'
TRAIN_LABEL_DATA = 'nsmc_train_label.npy'
# Padded index sequences and labels for the test split.
TEST_INPUT_DATA = 'nsmc_test_input.npy'
TEST_LABEL_DATA = 'nsmc_test_label.npy'
# JSON file that receives the vocabulary and its size.
DATA_CONFIGS = 'data_configs.json'

In [None]:
# Vocabulary metadata saved next to the arrays.
# NOTE(review): the +1 presumably reserves index 0 for padding -- confirm
# against the model that consumes vocab_size.
data_configs = {
    'vocab': word_vocab,
    'vocab_size': len(word_vocab) + 1,
}

In [None]:
import os

# Make sure the output directory exists before anything is written to it.
# The original called `ok.makedirs`, a NameError whenever the directory was
# missing. exist_ok=True makes the call a no-op when the directory is already
# there, so no separate existence check is needed.
os.makedirs(DATA_IN_PATH, exist_ok=True)

In [None]:
# Persist the padded inputs and labels as .npy files. Each file is opened in
# a `with` block so its handle is flushed and closed as soon as the array is
# written (the original left the handles from open() unclosed).
for file_name, array in (
    (TRAIN_INPUT_DATA, train_inputs),
    (TRAIN_LABEL_DATA, train_labels),
    (TEST_INPUT_DATA, test_inputs),
    (TEST_LABEL_DATA, test_labels),
):
    with open(DATA_IN_PATH + file_name, 'wb') as fout:
        np.save(fout, array)

In [None]:
# Write the vocabulary metadata as JSON. The `with` block guarantees the file
# is flushed and closed (the original never closed the handle from open()).
# ensure_ascii=False keeps non-ASCII vocabulary entries as readable text
# instead of \uXXXX escapes.
# NOTE(review): the file is written with the platform default encoding, as
# before; consider an explicit encoding='utf-8' here and in every reader.
with open(DATA_IN_PATH + DATA_CONFIGS, 'w') as config_file:
    json.dump(data_configs, config_file, ensure_ascii=False)

In [None]:
import pandas as pd

# File names for the cleaned (tokenized) reviews.
TRAIN_CLEAN_DATA = 'train_clean.csv'
TEST_CLEAN_DATA = 'test_clean.csv'

# Pair every cleaned review with its label. Both frames are built first and
# written afterwards, each without the index column.
train_columns = {'review': clean_train_review, 'sentiment': train_data['label']}
test_columns = {'review': clean_test_review, 'sentiment': test_data['label']}
clean_train_df = pd.DataFrame(train_columns)
clean_test_df = pd.DataFrame(test_columns)

train_csv_path = DATA_IN_PATH + TRAIN_CLEAN_DATA
test_csv_path = DATA_IN_PATH + TEST_CLEAN_DATA
clean_train_df.to_csv(train_csv_path, index=False)
clean_test_df.to_csv(test_csv_path, index=False)