In [1]:
import re
import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn.model_selection import train_test_split

2024-04-13 01:27:16.348121: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
txt_file = 'data/eng_elec_text.txt'
raw_corpus = []

with open(txt_file, 'r', encoding = 'utf-8') as f:
    raw = f.read().splitlines()
    raw_corpus.extend(raw)
print(len(raw_corpus))

142505


In [7]:
#txt_file = 'data/eng_folk_text.txt'
#txt_file = 'data/eng_hiphop_text.txt'
#txt_file = 'data/eng_pop_text.txt'
#txt_file = 'data/eng_rnb_text.txt'
txt_file = 'data/eng_rock_text.txt'
with open(txt_file, 'r', encoding = 'utf-8') as f:
    raw = f.read().splitlines()
    raw_corpus.extend(raw)
print(len(raw_corpus))

1186874


In [8]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()                         #소문자 변경 후 양쪽 공백 제거
    sentence = re.sub(r"([?.!,¿]).,", r" \1 ", sentence)          #특수문자 양쪽에 공백 추가
    sentence = re.sub(r'[" "]+', " ", sentence)                 #여러 개의 공백이 붙어있으면 하나의 공백으로
    sentence = re.sub(r"[^a-zA-Z가-힣?!¿.,]+", " ", sentence)   #영어, 알파벳, ?, !, ¿ 제외 모두 공백으로
    sentence = sentence.strip()                                 #양쪽 공백 제거
    sentence = re.sub(r"\(.\)", " ", sentence)                  #괄호 제거
    sentence = '<start> ' + sentence + ' <end>'                 #start, end 추가
    return sentence

In [9]:
corpus = []
for sentence in raw_corpus:
    if len(sentence) == 0:
        continue
    if sentence[-1] == ':':
        continue
    if len(sentence)>150:
        continue
    
    preprocessed_sentence = preprocess_sentence(sentence)
    corpus.append(preprocessed_sentence)
    
print(len(corpus))
print(corpus[:10])

1132836
['<start> ooh i can t pretend <end>', '<start> like you didn t bring <end>', '<start> my tempo up again <end>', '<start> tempo up again <end>', '<start> my head s in a spin <end>', '<start> you send my body to a place <end>', '<start> it s never been <end>', '<start> baby won t you let me <end>', '<start> keep you up all night <end>', '<start> let the morning come closer <end>']


In [10]:
def tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = 25000, filters = ' ', oov_token = '<unk>')
    tokenizer.fit_on_texts(corpus)
    tensor = tokenizer.texts_to_sequences(corpus)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding = 'post')
    print('토크나이저: ', tokenizer, '\n', tensor)
    
    return tensor, tokenizer

In [11]:
tensor, tokenizer = tokenize(corpus)

토크나이저:  <keras.src.legacy.preprocessing.text.Tokenizer object at 0x7f958958a3d0> 
 [[   2   66    4 ...    0    0    0]
 [   2   23    5 ...    0    0    0]
 [   2   13 3247 ...    0    0    0]
 ...
 [   2   13 5559 ...    0    0    0]
 [   2   13   13 ...    0    0    0]
 [   2 5559    3 ...    0    0    0]]


In [12]:
tensor.shape

(1132836, 36)

In [13]:
for idx in tokenizer.index_word:
    print(idx, ':', tokenizer.index_word[idx])
    if idx>=20: break

1 : <unk>
2 : <start>
3 : <end>
4 : i
5 : you
6 : the
7 : it
8 : to
9 : me
10 : t
11 : and
12 : a
13 : my
14 : s
15 : m
16 : that
17 : in
18 : we
19 : on
20 : your


In [14]:
#소스 문장과 타겟 문장으로 1차 분리
#tensor에서 마지막 토큰을 잘라내어 소스 문장을, 첫번째 start를 잘라내어 타겟 문장을 생성
#LSTM에서 many-to-many의 답을 얻을 것이므로 위와 같이 구성 <- ????
#이 과정을 통해 src_input과 tgt_input 길이 동일
src_input = tensor[:, :-1]      #start+sentence+end+padding n-1개
tgt_input = tensor[:, 1:]       #sentence+end+padding n개
print('텐서 길이: ', tensor.shape)
print('소스문장 길이: ', len(src_input[0]))
print('타겟문장 길이: ', len(tgt_input[0]))

텐서 길이:  (1132836, 36)
소스문장 길이:  35
타겟문장 길이:  35


In [15]:
enc_train, enc_val, dec_train, dec_val = train_test_split(src_input, tgt_input, test_size = 0.2, random_state = 1234)
print('Source Train 길이: ', enc_train.shape)
print('Target Train 길이: ', dec_train.shape)
print('Source Test 길이: ', enc_val.shape)
print('Target Test 길이: ', dec_val.shape)

Source Train 길이:  (906268, 35)
Target Train 길이:  (906268, 35)
Source Test 길이:  (226568, 35)
Target Test 길이:  (226568, 35)


In [16]:
BUFFER_SIZE = len(src_input)
BATCH_SIZE = 256
steps_per_epochs = len(src_input) // BATCH_SIZE
VOCAB_SIZE = tokenizer.num_words + 1

dataset_train = tf.data.Dataset.from_tensor_slices((enc_train, dec_train))
dataset_train = dataset_train.shuffle(BUFFER_SIZE)
dataset_train = dataset_train.batch(BATCH_SIZE, drop_remainder = True)

dataset_val = tf.data.Dataset.from_tensor_slices((enc_val, dec_val))
dataset_val = dataset_val.shuffle(BUFFER_SIZE)
dataset_val = dataset_val.batch(BATCH_SIZE, drop_remainder = True)

print(dataset_train)
print(dataset_val)

2024-04-13 01:28:03.274497: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


<_BatchDataset element_spec=(TensorSpec(shape=(256, 35), dtype=tf.int32, name=None), TensorSpec(shape=(256, 35), dtype=tf.int32, name=None))>
<_BatchDataset element_spec=(TensorSpec(shape=(256, 35), dtype=tf.int32, name=None), TensorSpec(shape=(256, 35), dtype=tf.int32, name=None))>


2024-04-13 01:28:03.313166: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-13 01:28:03.313336: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-13 01:28:03.314245: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [17]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)      #vocab_size로 입력되어서 그걸 embedding_size 만큼으로 표현할 것
        self.rnn_1 = tf.keras.layers.SimpleRNN(hidden_size, return_sequences = True)
        self.rnn_2 = tf.keras.layers.SimpleRNN(hidden_size, return_sequences = True)
        self.linear = tf.keras.layers.Dense(vocab_size)     #vocab_size로 줄여야 다음에 무슨 단어를 낼지에 대한 각 클래스 별 수치가 출력
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out

In [18]:
embedding_size = 256    #word embedding 차원 수. 즉, 단어가 추상적으로 표현되는 크기 -> dataset의 shape과 같아야 하는거 아닌지???
hidden_size = 1024      #hidden state의 차원 수
model = TextGenerator(tokenizer.num_words + 1, embedding_size, hidden_size)
model

<TextGenerator name=text_generator, built=False>

In [19]:
for src_sample, tgt_sample in dataset_train.take(1): break
model(src_sample)

<tf.Tensor: shape=(256, 35, 25001), dtype=float32, numpy=
array([[[-3.03771417e-03, -8.19578301e-03,  1.54584798e-03, ...,
          6.85258943e-04, -1.03847422e-02, -5.12672309e-03],
        [-8.12245067e-03,  3.23392916e-04, -1.50562450e-02, ...,
          1.49418442e-02,  6.01206254e-03, -1.42006222e-02],
        [ 1.32533023e-05,  2.06482206e-02,  9.89287160e-03, ...,
          3.37677589e-03,  8.69248062e-03, -6.55518845e-03],
        ...,
        [-2.95723788e-02, -4.89470549e-02,  7.39559298e-04, ...,
         -1.36053236e-02, -1.44807659e-02,  2.75651105e-02],
        [-1.30156158e-02,  1.53294103e-02, -7.71084949e-02, ...,
         -5.90176880e-02, -4.26432397e-03,  1.98519621e-02],
        [ 8.43936577e-03,  1.08757224e-02, -3.63128074e-02, ...,
          3.31647918e-02,  1.21574029e-02, -4.41858470e-02]],

       [[-3.03771417e-03, -8.19578301e-03,  1.54584798e-03, ...,
          6.85258943e-04, -1.03847422e-02, -5.12672309e-03],
        [-1.14499517e-02, -4.24445001e-03, -4

In [20]:
model.summary()

In [21]:
tf.test.is_gpu_available()

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


2024-04-13 01:28:16.531165: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


True

2024-04-13 01:28:16.531365: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-13 01:28:16.531474: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-13 01:28:16.531624: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [22]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True, reduction = 'none')

In [23]:
model.compile(loss = loss, optimizer = optimizer)
model.fit(dataset_train, epochs = 10)

Epoch 1/10


I0000 00:00:1712971702.119369  527802 service.cc:145] XLA service 0x7f949c004850 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1712971702.119408  527802 service.cc:153]   StreamExecutor device (0): NVIDIA A16-16Q, Compute Capability 8.6
2024-04-13 01:28:22.170100: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-04-13 01:28:22.641004: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907




I0000 00:00:1712971709.935412  527802 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m3540/3540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1381s[0m 387ms/step - loss: 1.0624
Epoch 2/10
[1m3540/3540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1374s[0m 388ms/step - loss: 0.7868
Epoch 3/10
[1m3540/3540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1373s[0m 387ms/step - loss: 0.7487
Epoch 4/10
[1m3540/3540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1372s[0m 387ms/step - loss: 0.7279
Epoch 5/10
[1m3540/3540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1371s[0m 387ms/step - loss: 0.7142
Epoch 6/10
[1m3540/3540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1370s[0m 387ms/step - loss: 0.7057
Epoch 7/10
[1m3540/3540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1370s[0m 386ms/step - loss: 0.6999
Epoch 8/10
[1m3540/3540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1369s[0m 386ms/step - loss: 0.6952
Epoch 9/10
[1m3540/3540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1369s[0m 386ms/step - loss: 0.6933
Epoch 10/10
[1m3540/3540[0m [32m━━━━

<keras.src.callbacks.history.History at 0x7f957d042e10>

In [24]:
model.save('rnn_eng.keras')
model.save('rnn_eng.h5')
tf.saved_model.save(model, 'rnn_eng.tf')



INFO:tensorflow:Assets written to: rnn_eng.tf/assets


INFO:tensorflow:Assets written to: rnn_eng.tf/assets


In [25]:
def generate_text(model, tokenizer, init_sentence = '<start>', max_len = 30):
    #테스트를 위해 입력받은 init_sentence도 텐서 변환
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype = tf.int64)
    end_token = tokenizer.word_index['<end>']
    
    #단어를 하나씩 예측해 문장 생성
    while True:
        #1. 입력받은 문장의 텐서 입력
        predict = model(test_tensor)
        #2. 예측된 값 중 가장 높은 확률인 word index를 출력
        predict_word = tf.argmax(tf.nn.softmax(predict, axis = -1), axis = -1)[:, -1]
        #3. 2에서 예측된 word index를 문장 뒤에 붙임
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis = 0)], axis = -1)
        #4. 모델이 <end>를 예측했거나 max_len에 도달하면 문장 생성을 마침
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break
        
    generated = ''
    #tokenizer를 활용해 word index를 단어로 하나씩 변환
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + ' '
        
    return generated

In [26]:
generate_text(model, tokenizer, init_sentence = '<start> i')

'<start> i m in toronto <end> '

In [27]:
generate_text(model, tokenizer, init_sentence = '<start> i', max_len = 50)

'<start> i m in toronto <end> '

In [28]:
generate_text(model, tokenizer, init_sentence = '<start> i love')

'<start> i love you <end> '

In [29]:
generate_text(model, tokenizer, init_sentence = '<start> love')

'<start> love is lovely, twisted <end> '

In [30]:
generate_text(model, tokenizer, init_sentence = '<start> Love is')

'<start> love is lovely, twisted <end> '

In [31]:
generate_text(model, tokenizer, init_sentence = '<start> love you ')

'<start> love you everywhere, <end> '

In [32]:
die_trying = ['coming', 'everybody', 'steppin', 'ready', 'hands', 'i', 'now', 'see', 'someone', 'we']
for word in die_trying:
    word = '<start> ' + word
    print(generate_text(model, tokenizer, init_sentence = word))

<start> coming home <end> 
<start> everybody now, let me know <end> 
<start> steppin out <end> 
<start> ready to freefall <end> 
<start> hands on your body <end> 
<start> i m in toronto <end> 
<start> now i m on the outside <end> 
<start> see you in the rubble <end> 
<start> someone like you <end> 
<start> we re all in the same boat <end> 


In [33]:
take_me_home = ['take', 'i', 'they', 'so', 'there', 'right', 'and', 'no', 'seems', 'still']
for word in take_me_home:
    word = '<start> ' + word
    print(generate_text(model, tokenizer, init_sentence = word))

<start> take me home <end> 
<start> i m in toronto <end> 
<start> they don t know how to say <end> 
<start> so i m not going down <end> 
<start> there s no one <end> 
<start> right now right now <end> 
<start> and i m strikin <end> 
<start> no one s ever gotten <end> 
<start> seems like i m stoned <end> 
<start> still i m addicted to the <end> 


In [34]:
imported_couches = ['they', 'i', 'i just', 'what', 'that', 'got', 'but', 'you', 'took', 'bitch']
for word in imported_couches:
    word = '<start> ' + word
    print(generate_text(model, tokenizer, init_sentence = word))

<start> they don t know how to say <end> 
<start> i m in toronto <end> 
<start> i just want to be with you <end> 
<start> what you shackled in <end> 
<start> that s the motto <end> 
<start> got a lot of shit <end> 
<start> but i m not dead <end> 
<start> you re the best of me <end> 
<start> took a sip and a goose <end> 
<start> bitch i m from zoo york <end> 


In [35]:
better_man = ['how', 'and', 'i', 'saying', 'so', 'if', 'would', 'to', 'please', 'you']
for word in better_man:
    word = '<start> ' + word
    print(generate_text(model, tokenizer, init_sentence = word))

<start> how you move, i m not <end> 
<start> and i m strikin <end> 
<start> i m in toronto <end> 
<start> saying it s alright <end> 
<start> so i m not going down <end> 
<start> if you re resonating <end> 
<start> would you be my answer <end> 
<start> to the floor <end> 
<start> please don t leave me <end> 


<start> you re the best of me <end> 


In [36]:
god_of_my_dreams = ['you', 'god', 'covered', 'trusting', 'speak', 'the', 'night', 'lord', 'i', 'keep']
for word in god_of_my_dreams:
    word = '<start> ' + word
    print(generate_text(model, tokenizer, init_sentence = word))

<start> you re the best of me <end> 
<start> god i hate <end> 
<start> covered in gold <end> 
<start> trusting you re sinking <end> 
<start> speak of the blue <end> 
<start> the way that i go <end> 
<start> night night <end> 
<start> lord i m gonna love you <end> 
<start> i m in toronto <end> 
<start> keep me satisfied <end> 


In [37]:
song_of_the_dusk = ['through', 'far', 'no', 'we', 'shadows', 'where', 'i', 'in', 'hearken', 'the']
for word in song_of_the_dusk:
    word = '<start> ' + word
    print(generate_text(model, tokenizer, init_sentence = word))

<start> through the night <end> 
<start> far away <end> 
<start> no one s ever gotten <end> 
<start> we re all in the same boat <end> 
<start> shadows on the hills of the world <end> 
<start> where you belong <end> 
<start> i m in toronto <end> 
<start> in the morning <end> 
<start> <unk> <unk> <unk> <unk> <end> 
<start> the way that i go <end> 


In [38]:
found_heaven = ['no', 'can', 'everybody', 'you', 'father', 'but', 'heart', 'don', 'there', 'your']
for word in found_heaven:
    word = '<start> ' + word
    print(generate_text(model, tokenizer, init_sentence = word))

<start> no one s ever gotten <end> 
<start> can t trust you <end> 


<start> everybody now, let me know <end> 
<start> you re the best of me <end> 
<start> father to the <unk> <end> 
<start> but i m not dead <end> 
<start> heart is like a dream <end> 
<start> don t you know <end> 
<start> there s no one <end> 
<start> your body is warm <end> 
