## Ex.04 자연어 처리 실습 1

**import**

In [1]:
import os
import glob
import re
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import nltk
print(1)

1


### 1. 데이터 읽어오기

In [2]:

txt_file_path = os.getenv('HOME')+'/aiffel/lyricist/data/lyrics/*'

txt_list = glob.glob(txt_file_path)

raw_corpus = []

# 여러개의 txt 파일을 모두 읽어서 raw_corpus 에 담습니다.
for txt_file in txt_list:
    with open(txt_file, "r") as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

print("데이터 크기:", len(raw_corpus))
print("Examples:\n", raw_corpus[:10])

데이터 크기: 187088
Examples:
 ["Now I've heard there was a secret chord", 'That David played, and it pleased the Lord', "But you don't really care for music, do you?", 'It goes like this', 'The fourth, the fifth', 'The minor fall, the major lift', 'The baffled king composing Hallelujah Hallelujah', 'Hallelujah', 'Hallelujah', 'Hallelujah Your faith was strong but you needed proof']


### 2. 데이터 확인

In [3]:
# : 확인
a = filter(lambda x: x == ":" , raw_corpus)
print(list(a))

[]


In [4]:
# 공백 확인
cnt = []
count = 0
for sentence in raw_corpus:
    if len(sentence) == 0:
        count += 1
        cnt.append(sentence)
print(count)
cnt[:5]

11102


['', '', '', '', '']

In [5]:
for idx, sentence in enumerate(raw_corpus):
    if len(sentence) == 0: continue
    if idx > 600: break   # 문장 600개만 확인.
        
    print(sentence)

Now I've heard there was a secret chord
That David played, and it pleased the Lord
But you don't really care for music, do you?
It goes like this
The fourth, the fifth
The minor fall, the major lift
The baffled king composing Hallelujah Hallelujah
Hallelujah
Hallelujah
Hallelujah Your faith was strong but you needed proof
You saw her bathing on the roof
Her beauty and the moonlight overthrew her
She tied you
To a kitchen chair
She broke your throne, and she cut your hair
And from your lips she drew the Hallelujah Hallelujah
Hallelujah
Hallelujah
Hallelujah You say I took the name in vain
I don't even know the name
But if I did, well really, what's it to you?
There's a blaze of light
In every word
It doesn't matter which you heard
The holy or the broken Hallelujah Hallelujah
Hallelujah
Hallelujah
Hallelujah I did my best, it wasn't much
I couldn't feel, so I tried to touch
I've told the truth, I didn't come to fool you
And even though
It all went wrong
I'll stand before the Lord of Song

In [6]:
b = list(filter(lambda x: x == 'Hallelujah', raw_corpus))
len(b)

33

In [41]:
# nltk 모듈 활용
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /aiffel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /aiffel/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [38]:
# 불용어 확인
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [None]:
stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(temp)

result = []
for w in word_tokens:
    if w not in stop_words:
        result.append(w)

* 데이터 정제 요소
    * 알파벳을 소문자로 통일
    * ''(공백) 문장 제거
    * (ring, ring, ring, ring) 와 같은 코러스 부분 의태어, 의성어 제거
    * 토큰의 개수가 15개를 초과하는 문장 제거 ( padding 조절차원)
    * 불용어 처리
    * 반복 문자 제거

### 3. 데이터 정제

In [22]:
# 데이터 정제 함수 선언
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()   # 소문자로 변경 및 양쪽 공백 제거
    sentence = re.sub(r'\([^)]*\)',"", sentence)    # ()와 안의 문자 제거
    # sentence = re.sub(r"([?\'.!,¿])", r" \1 ", sentence) # 특수 문자 양쪽에 공백 추가
    sentence = re.sub(r'[" "]+ ', " ", sentence)    # 여러개의 공백은 하나의 공백으로 치환
    sentence = re.sub(r"[^a-zA-Z?\'.!,¿]+", " ", sentence)    # a-zA-Z?.!,¿가 아닌 모든 문자를 하나의 공백으로 치환
    sentence = sentence.strip() # 다시 양쪽 공백 제거
    
    sentence = '<start> ' + sentence + ' <end>' # 문장 시작과 끝에 start, end 추가
    return sentence

# 필터링 체크
print(preprocess_sentence("This @_is ;;;sample  it's  (ring)     sentence."))

<start> this is sample it's sentence. <end>


In [31]:
corpus = []

for sentence in raw_corpus:
    if len(sentence) == 0: continue   # ""인 문장 패스
    if len(sentence.split(' ')) > 15: continue    # 토큰 개수 16개 이상의 문장 패스
        
    preprocessed_sentence = preprocess_sentence(sentence)
    corpus.append(preprocessed_sentence)
        

corpus[:15]

["<start> now i've heard there was a secret chord <end>",
 '<start> that david played, and it pleased the lord <end>',
 "<start> but you don't really care for music, do you? <end>",
 '<start> it goes like this <end>',
 '<start> the fourth, the fifth <end>',
 '<start> the minor fall, the major lift <end>',
 '<start> the baffled king composing hallelujah hallelujah <end>',
 '<start> hallelujah <end>',
 '<start> hallelujah <end>',
 '<start> hallelujah your faith was strong but you needed proof <end>',
 '<start> you saw her bathing on the roof <end>',
 '<start> her beauty and the moonlight overthrew her <end>',
 '<start> she tied you <end>',
 '<start> to a kitchen chair <end>',
 '<start> she broke your throne, and she cut your hair <end>']

**토큰화**

In [47]:
def tokenize(corpus):
    # 텐서플로우에서 제공하는 Tokenizer 패키지를 생성
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=13000,  # 전체 단어의 개수 
        filters=' ',    # 별도로 전처리 로직을 추가할 수 있음. 이번에는 사용하지 안함
        oov_token="<unk>"  # out-of-vocabulary, 사전에 없었던 단어는 <unk> 처리
    )
    tokenizer.fit_on_texts(corpus)   # 구축한 corpus로부터 Tokenizer가 사전을 자동구축

    # 이후 tokenizer를 활용하여 모델에 입력할 데이터셋을 구축
    tensor = tokenizer.texts_to_sequences(corpus)   # tokenizer는 구축한 사전으로부터 corpus를 해석해 Tensor로 변환.
   
    for num in tensor:
        if len(num) >= 29:
            tensor = np.delete(tensor, num)
            
    print(len(tensor))
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post',maxlen=15)  

    print(tensor,tokenizer)
    return tensor, tokenizer

    
tensor, tokenizer = tokenize(corpus)

168528
[[   2   46  138 ...    0    0    0]
 [   2   14 3572 ...    0    0    0]
 [   2    4 6533 ...    0    0    0]
 ...
 [   2  259 3076 ...    0    0    0]
 [   2  143   18 ... 1182    3    0]
 [   2    6  404 ...    0    0    0]] <keras_preprocessing.text.Tokenizer object at 0x7ff967c810a0>


In [48]:
src_input = tensor[:, :-1]
tgt_input = tensor[:, 1:]


print(src_input[0])
print(tgt_input[0])

[   2   46  138  303   84   44    8 1066 8568    3    0    0    0    0]
[  46  138  303   84   44    8 1066 8568    3    0    0    0    0    0]


### 4. 평가 데이터셋 분리

In [49]:
enc_train, enc_val, dec_train, dec_val = train_test_split(src_input,
                                                          tgt_input,
                                                          train_size = 0.8)

In [50]:
print("Source Train:", enc_train.shape)
print("Target Train:", dec_train.shape)

Source Train: (134822, 14)
Target Train: (134822, 14)


: tokenize() 함수로 데이터를 tensor로 변환. 
<br/>
이후 훈련 데이터와 평가 데이터(20%)로 분리.
<br/>
단어장 크기 12,000이상이라 진행.

In [51]:
# 데이터셋 객체 생성

BUFFER_SIZE = len(src_input)
BATCH_SIZE = 256
steps_per_epoch = len(src_input) // BATCH_SIZE

VOCAB_SIZE = tokenizer.num_words + 1   

dataset = tf.data.Dataset.from_tensor_slices((src_input, tgt_input))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset

<BatchDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>

## 5. 인공지능 만들기

In [52]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super(TextGenerator, self).__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.drop  = tf.keras.layers.Dropout(0.5)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.drop(out)
        out = self.linear(out)
        
        return out
    
embedding_size = 256
hidden_size = 1024
model = TextGenerator(tokenizer.num_words + 1, embedding_size , hidden_size)

In [53]:
for src_sample, tgt_sample in dataset.take(1): break
model(src_sample)

<tf.Tensor: shape=(256, 14, 13001), dtype=float32, numpy=
array([[[-1.74784182e-05, -3.31470583e-05, -1.76763351e-04, ...,
          3.22089909e-05,  8.59614302e-05,  1.74894638e-04],
        [-1.58717176e-05, -2.86091254e-05, -6.27323054e-04, ...,
         -1.78841467e-04,  2.29909143e-04,  2.50941492e-04],
        [-1.57054325e-04, -3.42455714e-05, -5.39890199e-04, ...,
         -8.76109843e-05,  3.65030224e-04,  4.54130932e-04],
        ...,
        [ 1.97736663e-03, -8.72512581e-04,  2.72778794e-03, ...,
          7.49204133e-04,  4.33484267e-04, -1.68539671e-04],
        [ 2.32500443e-03, -8.95795005e-04,  3.08736414e-03, ...,
          9.77081712e-04,  4.39776020e-04, -1.85228346e-04],
        [ 2.63750041e-03, -9.13135125e-04,  3.38416966e-03, ...,
          1.18067150e-03,  4.42007702e-04, -1.91343119e-04]],

       [[-1.74784182e-05, -3.31470583e-05, -1.76763351e-04, ...,
          3.22089909e-05,  8.59614302e-05,  1.74894638e-04],
        [ 1.27662221e-04, -1.02395163e-04, -4

In [54]:
model.summary()

Model: "text_generator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  3328256   
_________________________________________________________________
lstm (LSTM)                  multiple                  5246976   
_________________________________________________________________
lstm_1 (LSTM)                multiple                  8392704   
_________________________________________________________________
dropout (Dropout)            multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  13326025  
Total params: 30,293,961
Trainable params: 30,293,961
Non-trainable params: 0
_________________________________________________________________


In [55]:
# 모델 학습

optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none'
)

model.compile(loss=loss, optimizer=optimizer)
model.fit(dataset, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7ff9662fad00>

**모델 평가**

In [56]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
    # 테스트를 위해서 입력받은 init_sentence도 텐서로 변환합니다
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

    # 단어 하나씩 예측해 문장을 만듭니다
    #    1. 입력받은 문장의 텐서를 입력합니다
    #    2. 예측된 값 중 가장 높은 확률인 word index를 뽑아냅니다
    #    3. 2에서 예측된 word index를 문장 뒤에 붙입니다
    #    4. 모델이 <end>를 예측했거나, max_len에 도달했다면 문장 생성을 마칩니다
    while True:
        # 1
        predict = model(test_tensor) 
        # 2
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1] 
        # 3 
        test_tensor = tf.concat([test_tensor,
                                 tf.expand_dims(predict_word, axis=0)], axis=-1)
        # 4
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""
    # tokenizer를 이용해 word index를 단어로 하나씩 변환합니다 
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated

In [59]:
generate_text(model, tokenizer, init_sentence="<start> i love")

'<start> i love you <end> '

In [68]:
generate_text(model, tokenizer, init_sentence="<start> that")

"<start> that i don't wanna be alone <end> "

In [69]:
generate_text(model, tokenizer, init_sentence="<start> you've")

"<start> you've got a friend in me <end> "

In [70]:
generate_text(model, tokenizer, init_sentence="<start> i'd")

"<start> i'd like to be under the sea <end> "

In [71]:
generate_text(model, tokenizer, init_sentence="<start> hold on")

'<start> hold on to me and never let me go <end> '

In [72]:
generate_text(model, tokenizer, init_sentence="<start> keep")

"<start> keep on with the force don't stop <end> "

In [73]:
generate_text(model, tokenizer, init_sentence="<start> if you")

'<start> if you want to party <end> '

In [96]:
generate_text(model, tokenizer, init_sentence="<start> Hallelujah")

'<start> hallelujah <end> '

In [76]:
generate_text(model, tokenizer, init_sentence="<start> we")

'<start> we can make it easy if we lift each other <end> '

In [77]:
generate_text(model, tokenizer, init_sentence="<start> don't")

"<start> don't you know that you're toxic? <end> "

In [78]:
generate_text(model, tokenizer, init_sentence="<start> do not")

'<start> do not cry <end> '

In [81]:
generate_text(model, tokenizer, init_sentence="<start> where")

'<start> where the eyelids go <end> '

### 회고

In [83]:
loss = model.evaluate(enc_val,  dec_val)

print(round(loss,4))

1.2895


In [84]:
generate_text(model, tokenizer, init_sentence="<start> we")

'<start> we can make it easy if we lift each other <end> '

In [85]:
generate_text(model, tokenizer, init_sentence="<start> we've")

"<start> we've got a groovy thing <end> "

In [87]:
generate_text(model, tokenizer, init_sentence="<start> passing")

'<start> passing <unk> <end> '

In [89]:
generate_text(model, tokenizer, init_sentence="<start> on")

'<start> on the <unk> of the bay, <end> '

In [95]:
generate_text(model, tokenizer, init_sentence="<start> depending")

'<start> depending on how you mix that shit <end> '

: 먼저, los를 1.2895까지 낮춘 모델을 만들었으며,
<br/>we've와 같은 불용어도 삭제하지 않고, 어퍼스트로피만 남겨놨었는데 잘 인식하는 것 같고 <br/>
주어만 적어도 문장이 매끄럽게 완성돼서 전반적으로 놀라웠다.<br/>
동사 뒤 자주 딸려오는 전치사가 자연스럽게 매칭돼 이게 반복학습의 힘인가? 라는 생각을 하게 되기도 하였지만 문장 자체의 매끄러움이나 동명사나, 전치사만 주어졌을 때 모델이 인풋값을 제대로 인식하지 못하는 결함이 바로 발견되어 매우 아쉬웠다.
자연어처리 맛보기정도로 프로젝트를 진행했지만, 여태까지 진행했던 익스 중 가장 재밌었던거 같다.