# read data

In [11]:
# glob can is used to load files easily
import glob
import os
txt_file_path = "song_lyrics/*"
txt_lst = glob.glob(txt_file_path)
print(len(txt_lst))
print(txt_lst[:10])

49
['song_lyrics/britney-spears.txt', 'song_lyrics/bieber.txt', 'song_lyrics/lin-manuel-miranda.txt', 'song_lyrics/bjork.txt', 'song_lyrics/jimi-hendrix.txt', 'song_lyrics/blink-182.txt', 'song_lyrics/amy-winehouse.txt', 'song_lyrics/radiohead.txt', 'song_lyrics/r-kelly.txt', 'song_lyrics/paul-simon.txt']


In [2]:
raw_corpus = []

# read all txt_lst
for txt_file in txt_lst:
    #open text file, read, and then add into raw_corpus
    with open(txt_file, "r") as f:
        #read().splitlines() : read all lines and split it by \n
        try:
            raw = f.read().splitlines()
            #add all lines into list seperately
            raw_corpus.extend(raw)
        except UnicodeDecodeError as e:
            print("current txt_file : ", txt_file)
            print(e)
            

print("length of raw_corpus : ", len(raw_corpus))
print("examples : \n",raw_corpus[:2])

length of raw_corpus :  187088
examples : 
 ['They say get ready for the revolution', "I think it's time we find some sorta solution"]


# data preprocessing
- basic : delete duplicates and null
- natural language processing : replacing special charater/blank, tokenization, word 2 index

In [3]:
import re
import random

def preprocess_sentence(raw_sentence):
    low_sentence = raw_sentence.lower().strip()
    special_space_sentence = re.sub(r"([?.!,¿])",r"\1 ", low_sentence)
    empty_space_sentence = re.sub(r'[" "]+', " ", special_space_sentence)
    all_space_sentence = re.sub(r"[^a-zA-Z.!,¿]+"," ", empty_space_sentence)
    striped_sentence = all_space_sentence.strip()
    if len(striped_sentence) == 0:
        return None
    sentence = "<start> " + striped_sentence + " <end>"
    return sentence


idx = int(random.random() * 100)
selected_sentence = raw_corpus[idx]
preprocessed_sentence = preprocess_sentence(selected_sentence)
print("### comparision raw sentence and preprocessed sentence ###")
print("selected sentence : ", selected_sentence)
print("after preprocessing : ", preprocessed_sentence)

### comparision raw sentence and preprocessed sentence ###
selected sentence :  Settle up and get your rhythm
after preprocessing :  <start> settle up and get your rhythm <end>


In [4]:
corpus = []
for sentence in raw_corpus:
    # if sentence is empty, pass
    if len(sentence) == 0: continue
    preprocessed = preprocess_sentence(sentence)
    if preprocessed != None:
        corpus.append(preprocess_sentence(sentence))

print(len(corpus))
print("### print 5 preprocessed sentences ##")
print(corpus[:5])

175920
### print 5 preprocessed sentences ##
['<start> they say get ready for the revolution <end>', '<start> i think it s time we find some sorta solution <end>', '<start> somebody s caught up in the endless pollution <end>', '<start> they need to wake up, stop living illusions i know you need to hear this <end>', '<start> why won t somebody feel this <end>']


In [5]:
import tensorflow as tf

def tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=12000,
        filters='',
        oov_token="<unk>"
    )
    tokenizer.fit_on_texts(corpus)
    
    tensor = tokenizer.texts_to_sequences(corpus)
    # token이 너무 크면 공백이 많아지므로 최대 길이를 15로 지정
    # 마지막 단어가 출력에 가까운게 좋으므로 앞에 패딩을 뭍임
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, maxlen=15,padding="pre")
    print("tensor.shape : ", tensor.shape)
    print(tensor, tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(corpus)

tensor.shape :  (175920, 15)
[[   0    0    0 ...    5 2487    3]
 [   0    0    0 ... 4606 7411    3]
 [   0    0    0 ... 3468    1    3]
 ...
 [   0    0    0 ...    6  880    3]
 [   9 6293  395 ...   18 1007    3]
 [   0    0    0 ...    4  804    3]] <keras_preprocessing.text.Tokenizer object at 0x7fd55015ed10>


In [6]:
# print tensor and tokenizer's values
print(tensor[:3, :])
print(type(tokenizer.index_word))
for idx in tokenizer.index_word:
    print(idx, " : ", tokenizer.index_word[idx])
    if idx >= 5: break

[[   0    0    0    0    0    0    2   40   69   39  325   25    5 2487
     3]
 [   0    0    0    2    4  122   10   15   74   21  195   93 4606 7411
     3]
 [   0    0    0    0    0    2  267   15  623   33   13    5 3468    1
     3]]
<class 'dict'>
1  :  <unk>
2  :  <start>
3  :  <end>
4  :  i
5  :  the


# dataset split and preparation for learning

In [7]:
from sklearn.model_selection import train_test_split
"""
preparing dataset
- x : idx 0 ~ -2
- y : idx 1 ~ -1

"""
X = tensor[:,:-1]
y = tensor[:,1:]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print("X_train.shape : ", X_train.shape)
print("X_val.shape : ", X_val.shape)
print("y_train.shape : ", y_train.shape)
print("y_val.shape : ", y_val.shape)


training_size = len(X_train)
valiation_size = len(X_val)
batch_size = 256
steps_per_epochs = training_size//batch_size

def get_dataset(source_input, target_input):
    buffer_size = len(source_input)
    # number of words + 1(<pad> is not included in tokenizer)
    vocab_size = tokenizer.num_words + 1
    dataset = tf.data.Dataset.from_tensor_slices((source_input, target_input)).shuffle(buffer_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset


train_dataset = get_dataset(X_train, y_train) 
val_dataset = get_dataset(X_val, y_val)

del raw_corpus, corpus, tensor, X,\
        y, X_train, X_val, y_train, y_val

X_train.shape :  (140736, 14)
X_val.shape :  (35184, 14)
y_train.shape :  (140736, 14)
y_val.shape :  (35184, 14)


In [8]:
print(train_dataset.take(1))
print(val_dataset.take(1))
for sources, targets in train_dataset.take(1):
    print(sources)
    print(targets)

cnt = 0
for x in train_dataset:
    cnt +=1
print(cnt)
print(training_size//batch_size)

<TakeDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>
<TakeDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>
tf.Tensor(
[[  0   0   0 ...  10  68   6]
 [  0   0   0 ...  13   5 357]
 [  0   0   0 ...  43 793 782]
 ...
 [  0   2   4 ...  33  17   6]
 [  0   0   0 ...   2   5 820]
 [  0   0   0 ...  13   5 411]], shape=(256, 14), dtype=int32)
tf.Tensor(
[[  0   0   0 ...  68   6   3]
 [  0   0   0 ...   5 357   3]
 [  0   0   0 ... 793 782   3]
 ...
 [  2   4 195 ...  17   6   3]
 [  0   0   0 ...   5 820   3]
 [  0   0   0 ...   5 411   3]], shape=(256, 14), dtype=int32)
549
549


# design dnn model and fit

In [9]:
tf.keras.backend.clear_session()

class LyricGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super(LyricGenerator, self).__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        #return_sequences means return same length of seqences with input's length
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)
    
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        return out
    
embedding_size = 128
hidden_size = 128
model = LyricGenerator(tokenizer.num_words+1, embedding_size, hidden_size)

In [15]:
epochs = 3

optimizer = tf.keras.optimizers.Adam(1e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True,
        reduction="none"
    )

model.compile(loss=loss,
              optimizer=optimizer,
             metrics=["accuracy"])


checkpoint_dir = os.getenv("HOME")+"/github/aiffel_practice/EXPLORATION06/checkpoint/lyric_model"
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_dir,
    save_weights_only=True,
    monitor="val_loss",
    save_best_only=True,
    mode="auto",
    verbose=1
)

history = model.fit(train_dataset, epochs=epochs,
        steps_per_epoch= training_size//batch_size,
        validation_data=val_dataset,
        validation_steps=valiation_size//batch_size,
        callbacks=[cp_callback])

Epoch 1/3
Epoch 00001: val_loss improved from inf to 5.98853, saving model to /home/aiffel/github/aiffel_practice/EXPLORATION06/checkpoint/lyric_model
Epoch 2/3
Epoch 00002: val_loss improved from 5.98853 to 5.54380, saving model to /home/aiffel/github/aiffel_practice/EXPLORATION06/checkpoint/lyric_model
Epoch 3/3
Epoch 00003: val_loss improved from 5.54380 to 5.24022, saving model to /home/aiffel/github/aiffel_practice/EXPLORATION06/checkpoint/lyric_model


# generate lyrics test

In [20]:
def generate_lyrics(model, tokenizer, init_sentence="<start>", max_len=20):
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]
    
    while True:
        predict = model(test_tensor)
        predicted_word = tf.argmax(tf.nn.softmax(predict,axis = -1), axis=-1)[:, -1]
        test_tensor = tf.concat([test_tensor,
                                tf.expand_dims(predicted_word, axis=0)], axis=-1)
        if predicted_word.numpy()[0] == end_token:break
        if test_tensor.shape[1] >= max_len:break
        
    generated = ""
    print(test_tensor)
    for word_index in test_tensor[0].numpy():
        print(word_index, generated)
        generated += tokenizer.index_word[word_index] +" "
    return generated

generate_lyrics(model, tokenizer, init_sentence="<start> I want")

tf.Tensor([[ 2  4 56  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]], shape=(1, 20), dtype=int64)
2 
4 <start> 
56 <start> i 
0 <start> i want 


KeyError: 0

In [None]:
model.load_weights(checkpoint_dir)

In [None]:
!pwd
