# read data

In [1]:
# glob can is used to load files easily
import glob
txt_file_path = "song_lyrics/*"
txt_lst = glob.glob(txt_file_path)
print(len(txt_lst))
print(txt_lst[:10])

49
['song_lyrics/britney-spears.txt', 'song_lyrics/bieber.txt', 'song_lyrics/lin-manuel-miranda.txt', 'song_lyrics/bjork.txt', 'song_lyrics/jimi-hendrix.txt', 'song_lyrics/blink-182.txt', 'song_lyrics/amy-winehouse.txt', 'song_lyrics/radiohead.txt', 'song_lyrics/r-kelly.txt', 'song_lyrics/paul-simon.txt']


In [2]:
raw_corpus = []

# read all txt_lst
for txt_file in txt_lst:
    #open text file, read, and then add into raw_corpus
    with open(txt_file, "r") as f:
        #read().splitlines() : read all lines and split it by \n
        raw = f.read().splitlines()
        #add all lines into list seperately
        raw_corpus.extend(raw)

print("length of raw_corpus : ", len(raw_corpus))
print("examples : \n",raw_corpus[:2])

length of raw_corpus :  187088
examples : 
 ['They say get ready for the revolution', "I think it's time we find some sorta solution"]


# data preprocessing
- basic : delete duplicates and null
- natural language processing : replacing special charater/blank, tokenization, word 2 index

In [3]:
import re
import random

def preprocess_sentence(raw_sentence):
    low_sentence = raw_sentence.lower().strip()
    special_space_sentence = re.sub(r"([?.!,¿])",r"\1 ", low_sentence)
    empty_space_sentence = re.sub(r'[" "]+', " ", special_space_sentence)
    all_space_sentence = re.sub(r"[^a-zA-Z.!,¿]+"," ", empty_space_sentence)
    striped_sentence = all_space_sentence.strip()
    sentence = "<start>" + striped_sentence + "<end>"
    return sentence


idx = int(random.random() * 100)
selected_sentence = raw_corpus[idx]
preprocessed_sentence = preprocess_sentence(selected_sentence)
print("### comparision raw sentence and preprocessed sentence ###")
print("selected sentence : ", selected_sentence)
print("after preprocessing : ", preprocessed_sentence)

### comparision raw sentence and preprocessed sentence ###
selected sentence :  It's like a competition
after preprocessing :  <start>it s like a competition<end>


In [4]:
corpus = []
for sentence in raw_corpus:
    # if sentence is empty, pass
    if len(sentence) == 0: continue
    corpus.append(preprocess_sentence(sentence))

print("### print 5 preprocessed sentences ##")
print(corpus[:5])

### print 5 preprocessed sentences ##
['<start>they say get ready for the revolution<end>', '<start>i think it s time we find some sorta solution<end>', '<start>somebody s caught up in the endless pollution<end>', '<start>they need to wake up, stop living illusions i know you need to hear this<end>', '<start>why won t somebody feel this<end>']


In [9]:
import tensorflow as tf

def tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=10000,
        filters='',
        oov_token="<unk>"
    )
    tokenizer.fit_on_texts(corpus)
    
    tensor = tokenizer.texts_to_sequences(corpus)
    # token이 너무 크면 공백이 많아지므로 최대 길이를 15로 지정
    # 마지막 단어가 출력에 가까운게 좋으므로 앞에 패딩을 뭍임
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, maxlen=15,padding="pre")
    print("tensor.shape : ", tensor.shape)
    print(tensor, tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(corpus)

tensor.shape :  (175986, 15)
[[   0    0    0 ...   26    2 8032]
 [   0    0    0 ...  100 7014 9457]
 [   0    0    0 ...    2 4331    1]
 ...
 [   0    0    0 ...  423    4 3485]
 [1820    5 9215 ...  446   17 1745]
 [   0    0    0 ...  149    3 1895]] <keras_preprocessing.text.Tokenizer object at 0x7f732af3a410>


In [16]:
# print tensor and tokenizer's values
print(tensor[:3, :])
print(type(tokenizer.index_word))
for idx in tokenizer.index_word:
    print(idx, " : ", tokenizer.index_word[idx])
    if idx >= 5: break

[[   0    0    0    0    0    0    0    0  110   74   34  465   26    2
  8032]
 [   0    0    0    0    0    7  123   11   12   94   24  217  100 7014
  9457]
 [   0    0    0    0    0    0    0 1113   12  811   35   13    2 4331
     1]]
<class 'dict'>
1  :  <unk>
2  :  the
3  :  i
4  :  you
5  :  a


In [18]:
"""
preparing dataset
- source input : idx 0 ~ -2
- target input : idx 1 ~ -1

"""
source_input = tensor[:,:-1]
target_input = tensor[:,1:]
print(source_input[0])
print(target_input[0])


buffer_size = len(source_input)
batch_size = 256
steps_per_epochs = buffer_size//batch_size
# number of words + 1(<pad> is not included in tokenizer)
vocab_size = tokenizer.num_words + 1
dataset = tf.data.Dataset.from_tensor_slices((source_input, target_input)).shuffle(buffer_size)
dataset = dataset.batch(batch_size, drop_remainder=True)

del raw_corpus, corpus, tensor, source_input, target_input
print(dataset)

[  0   0   0   0   0   0   0   0 110  74  34 465  26   2]
[   0    0    0    0    0    0    0  110   74   34  465   26    2 8032]
<BatchDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>
