In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_csv('data/lyrics-data.csv')

In [3]:
df.columns

Index(['ALink', 'SName', 'SLink', 'Lyric', 'Idiom'], dtype='object')

In [4]:
df.shape

(209522, 5)

In [5]:
pd.DataFrame.sum(df['Idiom'] == 'ENGLISH')

114723

In [6]:
df = df[df['Idiom'] == 'ENGLISH']

In [7]:
df = pd.concat([df['SName'], df['Lyric']], axis=1, keys=['title', 'lyrics'])

In [8]:
def clean_text(df, col):
    df[col] = df[col].str.lower().replace("[^a-zA-Z]+", "")
    df[col] = df[col].str.rstrip()
    return df

In [9]:
for col in ['title', 'lyrics']:
    df = clean_text(df, col)

In [10]:
df

Unnamed: 0,title,lyrics
0,more than this,i could feel at the time. there was no way of ...
1,because the night,"take me now, baby, here as i am. hold me close..."
2,these are days,these are. these are days you'll remember. nev...
3,a campfire song,"a lie to say, ""o my mountain has coal veins an..."
4,everyday is like sunday,trudging slowly over wet sand. back to the ben...
...,...,...
207619,this love,i was so high i did not recognize. the fire bu...
207624,valerie,well sometimes i go out by myself. and i look ...
207628,wake me up,feeling my way through the darkness. guided by...
207792,don't,"don't, don't, that's what you say. each time t..."


In [11]:
input_text = np.array(df['lyrics'])

In [12]:
input_text = np.reshape(input_text, newshape=(input_text.shape[0], 1))

In [13]:
input_text.shape

(114723, 1)

In [14]:
words = ""
for song in df['lyrics']:
    words += song
for title in df['title']:
    words += title

word_set = set(words)


In [15]:
char2idx = {'<BOS>':0, '<EOS>':1}
counter = 2
for word in word_set:
    char2idx[word] = counter
    counter += 1
idx2char = {}
for word, idx in char2idx.items():
    idx2char[idx] = word

In [16]:
vocab_size = len(char2idx)

In [17]:
onehot_input = None

In [18]:
max_input_len = 75
max_output_len = 50
data_size = 20000 #len(df['lyrics'])

In [19]:
onehot_encoder_input = np.zeros(shape=(data_size, max_input_len, vocab_size))

In [20]:
for i, song in enumerate(df['lyrics']):
    if i >= data_size:
        break
    for j, ch in enumerate(song):
        if j >= max_input_len:
            break
        charidx = char2idx[ch]
        onehot_encoder_input[i][j][charidx] = 1

In [21]:
onehot_decoder_input = np.zeros(shape=(data_size, max_output_len, vocab_size))
for i in range(onehot_decoder_input.shape[0]):
    onehot_decoder_input[i][0][char2idx["<BOS>"]] = 1

In [22]:
for i, title in enumerate(df['title']):
    if i >= data_size:
        break
    for j, ch in enumerate(title[:-1]):
        if j+1 >= max_output_len:
            break
        charidx = char2idx[ch]
        onehot_decoder_input[i][j+1][charidx] = 1

In [23]:
onehot_target = np.zeros(shape=(data_size, max_output_len, vocab_size))

In [24]:
onehot_decoder_input.shape

(20000, 50, 252)

In [25]:
for i, title in enumerate(df['title']):
    if i >= data_size:
        break
    for j, ch in enumerate(title):
        if j >= max_output_len:
            break
        charidx = char2idx[ch]
        onehot_target[i][j][charidx] = 1

In [26]:
onehot_target.shape

(20000, 50, 252)

In [27]:
import tensorflow as tf
from tensorflow.keras import Sequential, Input, Model
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint

In [28]:
latent_dim = 32

In [29]:
def get_model(max_input_len, max_output_len, vocab_size):
    encoder_inputs = Input(shape=(max_input_len, vocab_size))
    encoder = LSTM(latent_dim, input_shape=(None, max_input_len, vocab_size), return_state=True)
    _, state_h, state_c = encoder(encoder_inputs)  # discard encoder sequence

    encoder_states = [state_h, state_c]

    decoder_inputs = Input(shape=(max_output_len, vocab_size))

    decoder = LSTM(latent_dim, input_shape=(None, max_output_len, vocab_size), return_state=True, return_sequences=True)
    decoder_outputs, _, _ = decoder(decoder_inputs, initial_state=encoder_states)
    decoder_dense = Dense(vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model

In [39]:
def get_encoder_decoder(model):
    # Build encoder
    encoder_inputs = model.input[0]
    encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # lstm_1
    encoder_states = [state_h_enc, state_c_enc]
    encoder_model = Model(encoder_inputs, encoder_states)

    # Build decoder
    decoder_inputs = model.input[1]
    decoder_state_input_h = Input(shape=(latent_dim,), name="input_3")
    decoder_state_input_c = Input(shape=(latent_dim,), name="input_4")
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_lstm = model.layers[3]
    decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
        decoder_inputs, initial_state=decoder_states_inputs
    )
    decoder_states = [state_h_dec, state_c_dec]
    decoder_dense = model.layers[4]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model(
        [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
    )

    #input_token_index, target_token_index = data_holder.get_token_indeces()
    #reverse_input_token_index = dict((i, word) for word, i in input_token_index.items())
    #reverse_target_token_index = dict((i, word) for word, i in target_token_index.items())

    return encoder_model, decoder_model#, reverse_input_token_index, reverse_target_token_index




In [31]:
batch_size = 64
epochs = 1

model_checkpoint_callback = ModelCheckpoint(
            filepath="./chkpt",
            save_weights_only=True,
            monitor='val_accuracy',
            mode='max',
            save_best_only=False,
            save_freq=batch_size*2)

In [32]:
model = get_model(max_input_len, max_output_len, vocab_size)

In [33]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics='accuracy')

In [34]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 75, 252)]    0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 50, 252)]    0           []                               
                                                                                                  
 lstm (LSTM)                    [(None, 32),         36480       ['input_1[0][0]']                
                                 (None, 32),                                                      
                                 (None, 32)]                                                      
                                                                                              

In [35]:
history = model.fit(
        [onehot_encoder_input, onehot_decoder_input],
        onehot_target,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.2,
        callbacks=[model_checkpoint_callback]
    )



In [36]:
model.save("model_save")



INFO:tensorflow:Assets written to: model_save/assets


INFO:tensorflow:Assets written to: model_save/assets


In [81]:
def predict(model, input_text, vocab_size, max_input_len):
    input_text = input_text.lower().split(" ")

    encoder_model, decoder_model = get_encoder_decoder(model)

    input_seq = np.zeros((1, max_input_len, vocab_size))
    
    for i, ch in enumerate(input_text):
        if i >= max_input_len:
            break
        if ch in char2idx:
            input_seq[0][i][char2idx[ch]] = 1.0
    
    if len(input_text) < max_input_len:
        input_seq[0][len(input_text)][char2idx["<EOS>"]] = 1.0


    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    
    #print(len(states_value))
    print(states_value)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, max_output_len, vocab_size))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, char2idx["<BOS>"]] = 1.0
    
    #print(target_seq[0,0,:])

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ""
    counter = 1
    while not stop_condition:
        print([target_seq] + states_value)
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample 4 best tokens and pick one randomly
        sampled_token_idxs = np.argpartition(output_tokens[0,-1,:], -4)[-4:]
        #print(sampled_token_idxs)
        #print(output_tokens[0,-1,sampled_token_idxs])
        rand_idx = np.random.randint(0,len(sampled_token_idxs))
        sampled_token_index = sampled_token_idxs[rand_idx]
        sampled_char = idx2char[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if sampled_char == "<EOS>" or len(decoded_sentence) >= max_output_len:
            stop_condition = True
            break
        
        print(f"counter = {counter}, len(decoded_sentence) = {len(decoded_sentence)}, max_output_len = {max_output_len}")
        # Update the target sequence (of length 1).
        #target_seq = np.zeros((1, max_output_len, vocab_size))
        target_seq[0, counter, sampled_token_index] = 1.0
        counter += 1

        # Update states
        states_value = [h, c]
    print(target_seq)
    print(target_seq[0,1,:])
    return decoded_sentence

In [82]:
test_sentence = "Would the real slim shady"
predict(model, test_sentence, vocab_size, max_input_len)

[array([[-0.9911237 , -0.98641455, -0.97118396,  0.9780053 , -0.98461825,
        -0.9871928 ,  0.9770257 , -0.9733247 ,  0.9839657 , -0.97857547,
         0.9735738 ,  0.98451644, -0.9843088 ,  0.98234165, -0.9762395 ,
        -0.97389776, -0.97976446,  0.97986525,  0.98354053,  0.9830088 ,
        -0.973853  , -0.9851046 ,  0.9885941 ,  0.97689825,  0.96977127,
         0.98589724,  0.9795812 ,  0.98458487,  0.9686646 , -0.98266023,
        -0.97687507,  0.97459316]], dtype=float32), array([[-7.287014 , -7.9551506, -7.844849 ,  7.4718246, -7.6019   ,
        -6.9906635,  7.3246737, -7.4592094,  7.0649486, -7.3601475,
         7.2492433,  6.750752 , -9.282078 ,  7.1122437, -7.1357517,
        -6.710138 , -7.547663 ,  8.541552 ,  6.716609 ,  7.462011 ,
        -7.4210267, -7.5806193,  7.124933 ,  7.497676 ,  8.94012  ,
         7.079741 ,  7.64035  ,  7.751102 ,  7.8348083, -7.7618866,
        -7.353753 ,  7.904166 ]], dtype=float32)]
[array([[[1., 0., 0., ..., 0., 0., 0.],
        [0.

counter = 6, len(decoded_sentence) = 6, max_output_len = 50
[array([[[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]]), array([[-0.9992541 , -0.999207  , -0.9991163 ,  0.9992079 , -0.9997044 ,
        -0.9996399 ,  0.99963737, -0.99956083,  0.9996861 , -0.99968183,
         0.9995972 ,  0.99938655, -0.99898297,  0.99940574, -0.9993641 ,
        -0.99927557, -0.999426  ,  0.999452  ,  0.9993286 ,  0.999567  ,
        -0.99960333, -0.9996728 ,  0.99977577,  0.99946606,  0.9996062 ,
         0.999287  ,  0.9995674 ,  0.9997666 ,  0.99915063, -0.9995462 ,
        -0.9993471 ,  0.9997295 ]], dtype=float32), array([[ -8.606203 ,  -8.391781 ,  -8.585782 ,  10.109315 ,  -8.252005 ,
        -10.040135 ,   8.71598  ,  -8.2962055,  10.017179 ,  -8.559063 ,
          9.399113 ,  17.152176 ,  -7.9746976,  12.83

counter = 13, len(decoded_sentence) = 13, max_output_len = 50
[array([[[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]]), array([[-0.9992541 , -0.999207  , -0.9991163 ,  0.9992079 , -0.9997044 ,
        -0.9996399 ,  0.99963737, -0.99956083,  0.9996861 , -0.99968183,
         0.9995972 ,  0.99938655, -0.99898297,  0.99940574, -0.9993641 ,
        -0.99927557, -0.999426  ,  0.999452  ,  0.9993286 ,  0.9995669 ,
        -0.99960333, -0.9996728 ,  0.99977577,  0.99946606,  0.9996062 ,
         0.999287  ,  0.9995674 ,  0.9997666 ,  0.99915063, -0.9995462 ,
        -0.9993471 ,  0.9997295 ]], dtype=float32), array([[ -8.598903 ,  -8.3957   ,  -8.591576 ,  10.125264 ,  -8.256924 ,
        -10.074449 ,   8.714707 ,  -8.296865 ,  10.04571  ,  -8.566473 ,
          9.418253 ,  17.15508  ,  -7.9783907,  12.

counter = 22, len(decoded_sentence) = 22, max_output_len = 50
[array([[[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]]), array([[-0.9992541 , -0.999207  , -0.9991163 ,  0.9992079 , -0.9997044 ,
        -0.9996399 ,  0.99963737, -0.99956083,  0.9996861 , -0.99968183,
         0.9995972 ,  0.99938655, -0.99898297,  0.99940574, -0.9993641 ,
        -0.99927557, -0.999426  ,  0.999452  ,  0.9993286 ,  0.9995669 ,
        -0.99960333, -0.9996728 ,  0.99977577,  0.99946606,  0.9996062 ,
         0.999287  ,  0.9995674 ,  0.9997666 ,  0.99915063, -0.9995462 ,
        -0.9993471 ,  0.9997295 ]], dtype=float32), array([[ -8.592895 ,  -8.448269 ,  -8.624465 ,  10.181796 ,  -8.255373 ,
        -10.143807 ,   8.712035 ,  -8.297626 ,  10.118415 ,  -8.604467 ,
          9.464218 ,  17.143633 ,  -7.9984446,  12.

counter = 31, len(decoded_sentence) = 31, max_output_len = 50
[array([[[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]]), array([[-0.9992541 , -0.999207  , -0.9991163 ,  0.9992079 , -0.9997044 ,
        -0.9996399 ,  0.99963737, -0.99956083,  0.9996861 , -0.99968183,
         0.9995972 ,  0.99938655, -0.99898297,  0.99940574, -0.9993641 ,
        -0.99927557, -0.999426  ,  0.999452  ,  0.9993286 ,  0.999567  ,
        -0.99960333, -0.9996728 ,  0.99977577,  0.99946606,  0.9996062 ,
         0.999287  ,  0.9995674 ,  0.9997666 ,  0.99915063, -0.9995462 ,
        -0.9993471 ,  0.9997295 ]], dtype=float32), array([[ -8.592706 ,  -8.469144 ,  -8.679187 ,  10.270351 ,  -8.288087 ,
        -10.307303 ,   8.708269 ,  -8.306366 ,  10.27287  ,  -8.634754 ,
          9.535174 ,  17.225697 ,  -8.022728 ,  12.

counter = 40, len(decoded_sentence) = 40, max_output_len = 50
[array([[[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]]), array([[-0.9992541 , -0.999207  , -0.9991163 ,  0.9992079 , -0.9997044 ,
        -0.9996399 ,  0.99963737, -0.99956083,  0.9996861 , -0.99968183,
         0.9995972 ,  0.99938655, -0.99898297,  0.99940574, -0.9993641 ,
        -0.99927557, -0.999426  ,  0.999452  ,  0.9993286 ,  0.999567  ,
        -0.99960333, -0.9996728 ,  0.99977577,  0.99946606,  0.9996062 ,
         0.999287  ,  0.9995674 ,  0.9997666 ,  0.99915063, -0.9995462 ,
        -0.9993471 ,  0.9997295 ]], dtype=float32), array([[ -8.638733 ,  -8.570403 ,  -8.901869 ,  10.49722  ,  -8.297961 ,
        -10.560374 ,   8.669458 ,  -8.303057 ,  10.5992365,  -8.754229 ,
          9.736588 ,  17.478989 ,  -8.072054 ,  12.

counter = 49, len(decoded_sentence) = 49, max_output_len = 50
[array([[[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]]), array([[-0.99925447, -0.9992074 , -0.9991167 ,  0.9992083 , -0.9997046 ,
        -0.9996401 ,  0.9996375 , -0.9995611 ,  0.9996863 , -0.99968195,
         0.9995975 ,  0.9993869 , -0.9989835 ,  0.99940604, -0.9993644 ,
        -0.99927604, -0.99942636,  0.99945223,  0.99932885,  0.99956715,
        -0.9996035 , -0.9996729 ,  0.99977595,  0.99946636,  0.9996064 ,
         0.99928737,  0.9995676 ,  0.9997667 ,  0.999151  , -0.9995463 ,
        -0.99934745,  0.99972975]], dtype=float32), array([[ -8.769726,  -8.667611,  -9.276991,  10.898316,  -8.490999,
        -11.190437,   8.672782,  -8.376068,  11.275243,  -8.848374,
          9.94209 ,  17.930563,  -8.207815,  12.664325, -11.3

'aoa aeaoe eeoeeaoooo aeeo e eao aeoaaaoo a aa ee o'