# 3. 프로젝트 진행

In [111]:
import glob  
import tensorflow as tf
import os
import re
import numpy as np
from sklearn.model_selection import train_test_split

print(tf.__version__)

2.6.0


In [112]:
txt_file_path = os.getenv('HOME')+'/aiffel/lyricist/data/lyrics/*'

txt_list = glob.glob(txt_file_path) 

raw_corpus = [] 

for txt_file in txt_list:
    with open(txt_file, "r") as f:
        raw = f.read().splitlines() 
        raw_corpus.extend(raw)
        
print("데이터 크기:", len(raw_corpus))
print("Examples:\n", raw_corpus[:100])

데이터 크기: 187088
Examples:
 ['', '', '[Spoken Intro:]', 'You ever want something ', "that you know you shouldn't have ", "The more you know you shouldn't have it, ", 'The more you want it ', 'And then one day you get it, ', "It's so good too ", "But it's just like my girl ", "When she's around me ", 'I just feel so good, so good ', 'But right now I just feel cold, so cold ', 'Right down to my bones ', "'Cause ooh... ", "Ain't no sunshine when she's gone ", "It's not warm when she's away ", "Ain't no sunshine when she's gone ", "And she's always gone too long ", 'Anytime she goes away ', '', "Wonder this time where she's gone ", "Wonder if she's gone to stay ", "Ain't no sunshine when she's gone ", "And this house just ain't no home ", 'Anytime she goes away ', '', 'I know, I know, I know, I know, ', 'I know, know, know, know, know, ', 'I know, I know, ', 'Hey I ought to leave ', 'I ought to leave her alone ', "Ain't no sunshine when she's gone ", '', "Ain't no sunshine when she's gone ",

In [113]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r'[^a-zA-Z?.!,¿]+', " ",sentence)
    sentence = sentence.strip()
    sentence = '<start> ' + sentence + ' <end>'
    return sentence

In [114]:
raw_corpus[:100]

['',
 '',
 '[Spoken Intro:]',
 'You ever want something ',
 "that you know you shouldn't have ",
 "The more you know you shouldn't have it, ",
 'The more you want it ',
 'And then one day you get it, ',
 "It's so good too ",
 "But it's just like my girl ",
 "When she's around me ",
 'I just feel so good, so good ',
 'But right now I just feel cold, so cold ',
 'Right down to my bones ',
 "'Cause ooh... ",
 "Ain't no sunshine when she's gone ",
 "It's not warm when she's away ",
 "Ain't no sunshine when she's gone ",
 "And she's always gone too long ",
 'Anytime she goes away ',
 '',
 "Wonder this time where she's gone ",
 "Wonder if she's gone to stay ",
 "Ain't no sunshine when she's gone ",
 "And this house just ain't no home ",
 'Anytime she goes away ',
 '',
 'I know, I know, I know, I know, ',
 'I know, know, know, know, know, ',
 'I know, I know, ',
 'Hey I ought to leave ',
 'I ought to leave her alone ',
 "Ain't no sunshine when she's gone ",
 '',
 "Ain't no sunshine when she's

In [115]:
corpus = []

for sentence in raw_corpus:
    if len(sentence)  == 0: continue
    if sentence[-1] == ":": continue
    
    preprocessed_sentence = preprocess_sentence(sentence)
    corpus.append(preprocessed_sentence)
    
print(len(corpus), corpus[:3])

175749 ['<start> spoken intro <end>', '<start> you ever want something <end>', '<start> that you know you shouldn t have <end>']


In [116]:
corpus = []

for sentence in raw_corpus:
    if len(sentence)  == 0: continue
    if sentence[-1] == ":": continue
    
    preprocessed_sentence = preprocess_sentence(sentence)
    if len(preprocessed_sentence.split()) <= 15:
        corpus.append(preprocessed_sentence)
    
print(len(corpus), corpus[:3])

156013 ['<start> spoken intro <end>', '<start> you ever want something <end>', '<start> that you know you shouldn t have <end>']


In [117]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=12000, 
    filters=' ', 
    oov_token='<unk>'
)

tokenizer.fit_on_texts(corpus)

In [119]:
def tokenize(corpus):
    
    tensor = tokenizer.texts_to_sequences(corpus)
    
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post', maxlen=15)

    return tensor

In [143]:
x,y = train_test_split(corpus, test_size=0.2)

In [146]:
x_tensor = tokenize(x)

In [147]:
y_tensor = tokenize(y)

In [148]:
x_tensor.shape

(124810, 15)

In [149]:
y_tensor.shape

(31203, 15)

In [150]:
print(x_tensor.shape)
x_src_input = x_tensor[:, :-1]  
print(x_tensor[:,:-1].shape)
x_tgt_input = x_tensor[:, 1:]
print(x_tensor[:,1:].shape)

(124810, 15)
(124810, 14)
(124810, 14)


In [151]:
print(y_tensor.shape)
y_src_input = y_tensor[:, :-1]  
print(y_tensor[:,:-1].shape)
y_tgt_input = y_tensor[:, 1:]
print(y_tensor[:,1:].shape)

(31203, 15)
(31203, 14)
(31203, 14)


In [152]:
for idx in tokenizer.index_word:
    print(idx, ":", tokenizer.index_word[idx])
    
    if idx >= 10: break

1 : <unk>
2 : <start>
3 : <end>
4 : i
5 : ,
6 : the
7 : you
8 : and
9 : a
10 : to


In [153]:
BUFFER_SIZE = len(x_src_input)
print(BUFFER_SIZE)
BATCH_SIZE = 256
steps_per_epoch = len(x_src_input) // BATCH_SIZE
print(steps_per_epoch)

VOCAB_SIZE = tokenizer.num_words + 1

124810
487


In [154]:
BUFFER_SIZE = len(y_src_input)
print(BUFFER_SIZE)
BATCH_SIZE = 256
steps_per_epoch = len(y_src_input) // BATCH_SIZE
print(steps_per_epoch)

VOCAB_SIZE = tokenizer.num_words + 1

31203
121


In [155]:
print(x_src_input.shape)
print(x_tgt_input.shape)

(124810, 14)
(124810, 14)


In [156]:
print(y_src_input.shape)
print(y_tgt_input.shape)

(31203, 14)
(31203, 14)


In [157]:
x_dataset = tf.data.Dataset.from_tensor_slices((x_src_input, x_tgt_input))
x_dataset = x_dataset.shuffle(BUFFER_SIZE)
x_dataset = x_dataset.batch(BATCH_SIZE, drop_remainder=True)

In [158]:
x_dataset

<BatchDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>

In [159]:
y_dataset = tf.data.Dataset.from_tensor_slices((y_src_input, y_tgt_input))
y_dataset = y_dataset.shuffle(BUFFER_SIZE)
y_dataset = y_dataset.batch(BATCH_SIZE, drop_remainder=True)

In [160]:
y_dataset

<BatchDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>

In [161]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out

In [162]:
embedding_size = 256
hidden_size = 1024
VOCAB_SIZE = tokenizer.num_words + 1
#단어장 사이즈 = 12001
model = TextGenerator(tokenizer.num_words + 1, embedding_size, hidden_size)

In [163]:
x_dataset.take(1)

<TakeDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>

In [164]:
y_dataset.take(1)

<TakeDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>

In [165]:
for src_sample, tgt_sample in x_dataset.take(1):
    print(src_sample)
    print(tgt_sample)
    break

tf.Tensor(
[[    2    33    50 ...     0     0     0]
 [    2   107  1864 ...     0     0     0]
 [    2    25    41 ...     0     0     0]
 ...
 [    2   577 11551 ...     0     0     0]
 [    2    39   685 ...     0     0     0]
 [    2  2539  6365 ...     0     0     0]], shape=(256, 14), dtype=int32)
tf.Tensor(
[[   33    50    11 ...     0     0     0]
 [  107  1864   221 ...     0     0     0]
 [   25    41   870 ...     0     0     0]
 ...
 [  577 11551     8 ...     0     0     0]
 [   39   685  6900 ...     0     0     0]
 [ 2539  6365    43 ...     0     0     0]], shape=(256, 14), dtype=int32)


In [166]:
model(src_sample)

<tf.Tensor: shape=(256, 14, 12001), dtype=float32, numpy=
array([[[-2.58650369e-04, -9.09738010e-05, -6.95002454e-05, ...,
         -2.72757461e-05,  4.43973171e-04,  2.72697420e-04],
        [-2.13396779e-04,  1.00372570e-04, -1.39802505e-04, ...,
          9.70447945e-05,  1.03016198e-03,  3.25026253e-04],
        [-1.03921251e-04,  5.08677447e-04, -1.18027987e-04, ...,
          2.87765724e-04,  1.12573034e-03,  1.81514915e-04],
        ...,
        [-2.06596145e-04,  1.89160801e-05,  6.19022117e-04, ...,
         -1.29212067e-03, -3.06078553e-04, -2.13991661e-04],
        [ 2.56407919e-04, -6.62493258e-05,  8.30457197e-04, ...,
         -1.68347661e-03, -5.19126246e-04, -6.58018325e-05],
        [ 7.24128040e-04, -1.53543660e-04,  1.06339168e-03, ...,
         -2.03096052e-03, -6.83442806e-04,  4.36616901e-05]],

       [[-2.58650369e-04, -9.09738010e-05, -6.95002454e-05, ...,
         -2.72757461e-05,  4.43973171e-04,  2.72697420e-04],
        [-2.62016983e-04, -3.47299385e-04, -6

In [167]:
model.summary()

Model: "text_generator_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      multiple                  3072256   
_________________________________________________________________
lstm_8 (LSTM)                multiple                  5246976   
_________________________________________________________________
lstm_9 (LSTM)                multiple                  8392704   
_________________________________________________________________
dense_4 (Dense)              multiple                  12301025  
Total params: 29,012,961
Trainable params: 29,012,961
Non-trainable params: 0
_________________________________________________________________


In [168]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none')

model.compile(loss=loss, optimizer=optimizer)

In [169]:
model.fit(x_dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f24c05b4f40>

In [170]:
embedding_size = 256
hidden_size = 1024
VOCAB_SIZE = tokenizer.num_words + 1
#단어장 사이즈 = 12001
model = TextGenerator(tokenizer.num_words + 1, embedding_size, hidden_size)

In [171]:
def generate_text(model, tokenizer, init_sentence = "<start>", max_len = 15):
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]
    
    while True:
        predict = model(test_tensor)
        predict_word = tf.argmax(tf.nn.softmax(predict, axis = -1), axis = -1)[:, -1]
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""

    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated 

In [172]:
generate_text(model, tokenizer, init_sentence="<start> i love", max_len = 15)

'<start> i love millionaire clappin clappin dumbing chip raspberry act gling gling beckons beckons beckons '

# 4. 회고

4.1 RNN 모델은 Layer의 역할과 shape 변화 추적이 관건

    * Simple RNN과 LSTM 모두 Embedding Layer와 RNN Layer, Linear Dense layer 이렇게 세 가지로 구분된다. Simple과 LSTM이 구분되는 것은 RNN Layer의 구조이다. 그러나 RNN Layer의 세부 사항이 다른 걸 제외한다면, Embedding Layer는 문자를 token으로 전환, RNN Layer는 단어 간 연관성 조사, Dense Layer는 단어장 사이즈로 분류 이렇게 큰 틀에선 동일하다.  

4.2 가중치가 변하지 않는다는 특징에 유념할 것.
    
    * RNN 구조의 특징은 아무리 긴 문장을 학습하더라도 모두 동일한 Weigh를 사용한다는 점이다. 이렇게 설계된 이유는 문장 길이에 구애받지 않고 모델이 학습할 수 있는 유연한 환경을 조성하기 위해서라고 추측할 수 있다. 

4.3 LSTM에서 parameter 계산법

    * LSTM은 forget gate에서 weight가 1종류, input gate에서 weight가 2종류 cell state에서 weight가 1 종류가 편성된다. 그래서 simple rnn에 비해 parameter의 개수가 훨씬 많아지는 것이다. 이를 계산하는 법을 따로 필기해 저장해놓았는데 덕분에 LSTM 이해를 수월히 할 수 있었다. 