## 1. 데이터 다운

Song Lyrics 데이터 다운

$ wget https://aiffelstaticprd.blob.core.windows.net/media/documents/song_lyrics.zip
$ unzip song_lyrics.zip -d ~/aiffel/lyricist/data/lyrics

## 2. 데이터 읽어오기

In [1]:
import glob
import os
import numpy as np
import tensorflow as tf
import re

txt_file_path = os.getenv('HOME')+'/aiffel/lyricist/data/lyrics/*'

txt_list = glob.glob(txt_file_path)

raw_corpus = []

# txt 파일을 모두 읽어서 raw_corpus 에 담기
for txt_file in txt_list:
    with open(txt_file, "r") as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

print("데이터 크기:", len(raw_corpus))
print("Examples:\n", raw_corpus[:3])

데이터 크기: 187088
Examples:
 ['\ufeffbaby It was all a dream', 'I used to read Word Up magazine', 'Salt n Pepa and Heavy D up in the limousine']


In [2]:
for idx, sentence in enumerate(raw_corpus):
    if len(sentence) == 0: continue   
    if sentence[-1] == ":": continue  

    if idx > 15: break   
        
    print(sentence)

﻿baby It was all a dream
I used to read Word Up magazine
Salt n Pepa and Heavy D up in the limousine
Hangin pictures on my wall
Every Saturday Rap Attack Mr Magic Marley Marl
I let my tape rock til my tape popped
Smokin weed and Bambu sippin on Private Stock
Way back when I had the red and black lumberjack
With the hat to match
Remember Rappin Duke duhha duhha
You never thought that hip hop would take it this far
Now Im in the limelight cause I rhyme tight
Time to get paid blow up like the World Trade
Born sinner the opposite of a winner
Remember when I used to eat sardines for dinner
Peace to Ron G Brucey B Kid Capri


## 3. 데이터 정제하기

In [3]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()       
  
    
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)                  
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)  
    sentence = sentence.strip()

    sentence = '<start> ' + sentence + ' <end>'  
    
    return sentence

print(preprocess_sentence("This @_####is ;;;^^^practice        sentence."))

<start> this is practice sentence . <end>


In [4]:
# 데이터 정제하기
corpus = []

for sentence in raw_corpus:
    if len(sentence) == 0: continue
    if sentence[-1] == ":": continue
    # if len(processed.split()) <= 15: continue
        
    corpus.append(preprocess_sentence(sentence))
        
corpus[:10]

['<start> baby it was all a dream <end>',
 '<start> i used to read word up magazine <end>',
 '<start> salt n pepa and heavy d up in the limousine <end>',
 '<start> hangin pictures on my wall <end>',
 '<start> every saturday rap attack mr magic marley marl <end>',
 '<start> i let my tape rock til my tape popped <end>',
 '<start> smokin weed and bambu sippin on private stock <end>',
 '<start> way back when i had the red and black lumberjack <end>',
 '<start> with the hat to match <end>',
 '<start> remember rappin duke duhha duhha <end>']

In [5]:
# 토큰 개수가 15개를 넘어가는 문장 제외시키기
processed = []

for i in corpus:
    if len(i.split()) <= 15:
        processed.append(i)
        
print (len(processed))

156013


In [6]:
# 데이터를 tensor로 변환
def tokenize(corpus):
    # 텐서플로우에서 제공하는 Tokenizer 패키지 생성
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=12000,  # 전체 단어갯수 
        filters=' ',    
        oov_token="<unk>"  # out-of-vocabulary, 사전에 없었던 단어는 어떤 토큰으로 대체할지 정함
    )
    tokenizer.fit_on_texts(processed)   # 구축한 corpus로부터 Tokenizer가 사전 자동구축

    # tokenizer를 활용, 모델에 입력할 데이터셋 구축
    tensor = tokenizer.texts_to_sequences(processed)   # 구축한 사전으로부터 corpus를 해석해 Tensor로 변환

    # 시퀀스 길이를 일정하게 맞추기 위한 padding  메소드를 제공
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post', maxlen = 15)  

    print(tensor,tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(processed)

[[   2   52   11 ...    0    0    0]
 [   2    4  285 ...    0    0    0]
 [   2 2876  480 ...    0    0    0]
 ...
 [   2    6  460 ...   26  205    3]
 [   2    8   42 ...    0    0    0]
 [   2    4   92 ...    0    0    0]] <keras_preprocessing.text.Tokenizer object at 0x7fd59afe9050>


In [7]:
# 구축된 단어사전 확인
for idx in tokenizer.index_word:
    print(idx, ":", tokenizer.index_word[idx])

    if idx >= 25: break

1 : <unk>
2 : <start>
3 : <end>
4 : i
5 : ,
6 : the
7 : you
8 : and
9 : a
10 : to
11 : it
12 : me
13 : my
14 : in
15 : that
16 : t
17 : s
18 : on
19 : your
20 : of
21 : we
22 : .
23 : like
24 : m
25 : all


## 4. 평가 데이터셋 분리

In [8]:
# 소스와 타깃으로 분리
src_input = tensor[:, :-1]  # tensor에서 마지막 토큰을 잘라내 소스 문장 생성. 마지막 토큰은 <end>가 아니라 <pad>일 가능성이 높다.
tgt_input = tensor[:, 1:]    # tensor에서 <start>를 잘라내 타겟 문장 생성

print(src_input[0])
print(tgt_input[0])

[  2  52  11  53  25   9 361   3   0   0   0   0   0   0]
[ 52  11  53  25   9 361   3   0   0   0   0   0   0   0]


In [9]:
txt_data = src_input
txt_label = tgt_input

import nltk
with open(txt_data) as fp:
    tokenized_text = [word for word in nltk.tokenize.word_tokenize(fp.read()) if len(word) <= 15]

In [10]:
# train, test 데이터 분리
from sklearn.model_selection import train_test_split

enc_train, enc_val, dec_train, dec_val = train_test_split(txt_data,
                                                          txt_label,
                                                          test_size=0.2,
                                                          random_state=7)

print("Source Train:", enc_train.shape)
print("Target Train:", dec_train.shape)

Source Train: (124810, 14)
Target Train: (124810, 14)


In [11]:
# 데이터셋 객체 생성
BUFFER_SIZE = len(src_input)
BATCH_SIZE = 256
steps_per_epoch = len(src_input) // BATCH_SIZE

VOCAB_SIZE = tokenizer.num_words + 1    # tokenizer가 구축한 단어사전 내 7000개와, 여기 포함되지 않은 0:<pad>를 포함하여 7001개

dataset = tf.data.Dataset.from_tensor_slices((src_input, tgt_input)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset

<BatchDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>

## 5. 인공지능 만들기

In [12]:
from keras.models import Sequential
from keras.layers import Dense, Activation

class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super(TextGenerator, self).__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out
    
embedding_size = 256
hidden_size = 1024
model = TextGenerator(tokenizer.num_words + 1, embedding_size , hidden_size)

In [13]:
for src_sample, tgt_sample in dataset.take(1): break
model(src_sample)

<tf.Tensor: shape=(256, 14, 12001), dtype=float32, numpy=
array([[[-8.70321965e-05,  1.62195458e-04, -9.12767136e-05, ...,
          2.96564132e-04,  9.78723328e-05, -1.37092429e-04],
        [-1.09071538e-04,  2.83006229e-04, -4.64774203e-04, ...,
          4.25667909e-04,  1.80758187e-04, -2.60813191e-04],
        [-5.11544276e-05,  4.37017297e-04, -7.10082822e-04, ...,
          5.97222417e-04,  8.68868228e-05, -2.75154714e-04],
        ...,
        [-5.40219422e-04, -5.66189818e-04,  3.86095257e-04, ...,
         -3.23659205e-03, -1.98685168e-03, -1.24699273e-03],
        [-5.71329263e-04, -5.89624222e-04,  4.55118163e-04, ...,
         -3.60888918e-03, -2.06034631e-03, -1.34392164e-03],
        [-5.88929048e-04, -6.04098605e-04,  4.98374575e-04, ...,
         -3.94509081e-03, -2.10824842e-03, -1.41808251e-03]],

       [[-8.70321965e-05,  1.62195458e-04, -9.12767136e-05, ...,
          2.96564132e-04,  9.78723328e-05, -1.37092429e-04],
        [-1.59556599e-04,  4.60096373e-04, -7

In [14]:
model.summary()

Model: "text_generator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  3072256   
_________________________________________________________________
lstm (LSTM)                  multiple                  5246976   
_________________________________________________________________
lstm_1 (LSTM)                multiple                  8392704   
_________________________________________________________________
dense (Dense)                multiple                  12301025  
Total params: 29,012,961
Trainable params: 29,012,961
Non-trainable params: 0
_________________________________________________________________


In [15]:
# 모델 학습시키기
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none'
)

model.compile(loss=loss, optimizer=optimizer)
model.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fd597f7fc10>

In [16]:
# 모델 평가
results = model.evaluate(enc_val, dec_val, verbose=2)

print(results)

976/976 - 10s - loss: 2.0205
2.0204739570617676


In [17]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
   
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

     
    while True:
        predict = model(test_tensor)   
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1]    

         
        test_tensor = tf.concat([test_tensor, 
																 tf.expand_dims(predict_word, axis=0)], axis=-1)

        
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""
    
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated   

In [18]:
generate_text(model, tokenizer, init_sentence="<start> i", max_len=20)

'<start> i m the one that s gon hold you down <end> '