In [3]:
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop, Adam
from keras.utils.data_utils import get_file
import numpy as np
import pandas as pd
import sys
import io
import re
from utils import *
from unidecode import unidecode
import warnings
warnings.filterwarnings('ignore')

In [4]:
songs = pd.read_csv('data/drake-songs.csv')


In [5]:
def get_tokenized_lines(df):
    words = [] 
    for index, row in df['lyrics'].iteritems():
        row = str(row).lower()
        for line in row.split('|-|'):
            new_words = re.findall(r"\b[a-z']+\b", unidecode(line))
            words = words + new_words
    return words

In [6]:
all_lyric_sentences = get_tokenized_lines(songs)
print(all_lyric_sentences[:10])

['money', 'money', 'cars', 'cars', 'clothes', 'clothes', 'the', 'hoes', 'i', 'suppose']


In [7]:
print(' {} sentences in total'.format(len(all_lyric_sentences)))


 75178 sentences in total


In [8]:
sequences=list()
seq_len = 10
for i in range(len(all_lyric_sentences)-seq_len+1):
    seq = all_lyric_sentences[i:seq_len+i]
    sequences.append(seq)
    
print('Total Sequences: %d' % len(sequences))
print(np.shape(sequences))
print(sequences[:2][:])


Total Sequences: 75169
(75169, 10)
[['money', 'money', 'cars', 'cars', 'clothes', 'clothes', 'the', 'hoes', 'i', 'suppose'], ['money', 'cars', 'cars', 'clothes', 'clothes', 'the', 'hoes', 'i', 'suppose', 'yeah']]


In [10]:
from utils import *
word_to_ind, ind_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')
vocab_size = len(word_to_ind)
print(vocab_size)

400000


In [79]:

def seq_to_indexes(X, word_to_ind):
    
    m = np.shape(X)[0]
    seq_len = np.shape(X)[1]
    X_indexes = np.zeros((m, seq_len))

    for i in range(m):
        sequence_words = X[i]
        j = 0
        for word in sequence_words:
            X_indexes[i, j] = word_to_ind[word] 
            j = j + 1
            
    return X_indexes

In [92]:
from keras.utils import to_categorical
Tokenized_seq = seq_to_indexes(sequences[:20][:], word_to_ind)
X, Y = Tokenized_seq[:,:-1], Tokenized_seq[:, -1]
Y = to_categorical(Y, num_classes=vocab_size)
seq_length = len(X[0])
print(X)

[[248489. 248489.  92762.  92762. 103108. 103108. 357266. 180213. 185457.]
 [248489.  92762.  92762. 103108. 103108. 357266. 180213. 185457. 348225.]
 [ 92762.  92762. 103108. 103108. 357266. 180213. 185457. 348225. 393294.]
 [ 92762. 103108. 103108. 357266. 180213. 185457. 348225. 393294. 185457.]
 [103108. 103108. 357266. 180213. 185457. 348225. 393294. 185457. 383068.]
 [103108. 357266. 180213. 185457. 348225. 393294. 185457. 383068. 357266.]
 [357266. 180213. 185457. 348225. 393294. 185457. 383068. 357266. 248489.]
 [180213. 185457. 348225. 393294. 185457. 383068. 357266. 248489. 248489.]
 [185457. 348225. 393294. 185457. 383068. 357266. 248489. 248489.  54718.]
 [348225. 393294. 185457. 383068. 357266. 248489. 248489.  54718. 357266.]
 [393294. 185457. 383068. 357266. 248489. 248489.  54718. 357266.  92762.]
 [185457. 383068. 357266. 248489. 248489.  54718. 357266.  92762.  92762.]
 [383068. 357266. 248489. 248489.  54718. 357266.  92762.  92762.  54718.]
 [357266. 248489. 248489.

In [93]:
"""vocab = set(all_lyric_sentences)
vocab_size = len(vocab)
word_to_index = {wd:idx for idx,wd in enumerate(vocab)}
index_to_word = {idx:wd for wd,idx in word_to_index.items()}"""

'vocab = set(all_lyric_sentences)\nvocab_size = len(vocab)\nword_to_index = {wd:idx for idx,wd in enumerate(vocab)}\nindex_to_word = {idx:wd for wd,idx in word_to_index.items()}'

In [94]:
"""def token_generator(sequences,seq_length):
    word_token = np.zeros((len(sequences),seq_length))
    for r, line in enumerate(sequences):
        for c, word in enumerate(line):
            word_token[r,c]=word_to_index[word]
    return word_token"""

'def token_generator(sequences,seq_length):\n    word_token = np.zeros((len(sequences),seq_length))\n    for r, line in enumerate(sequences):\n        for c, word in enumerate(line):\n            word_token[r,c]=word_to_index[word]\n    return word_token'

In [10]:
"""tokenized_seq = token_generator(sequences,seq_len) 
print(tokenized_seq[:2,:])"""

[[5178. 5178. 5138. 5138. 5786. 5786. 1664. 5045. 4904. 4327.]
 [5178. 5138. 5138. 5786. 5786. 1664. 5045. 4904. 4327. 3836.]]


In [11]:
"""from keras.utils import to_categorical
X,Y = tokenized_seq[:,:-1], tokenized_seq[:,-1]
Y = to_categorical(Y, num_classes=vocab_size)
seq_length = len(X[0])"""

In [95]:
def pretrained_embedding(word_to_vec_map, word_to_ind,seq_length):
    vocab_len = len(word_to_ind) + 1
    emb_dim = word_to_vec_map['cucumber'].shape[0]
    emb_matrix = np.zeros((vocab_len, emb_dim))
    
    for word, index in word_to_ind.items():
        emb_matrix[index,:] = word_to_vec_map[word]
    embedding_layer = Embedding(vocab_len, emb_dim,input_length=seq_length ,trainable=False)
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer 

In [97]:
from keras.layers import Embedding, Input, Dropout
from keras.models import Model


def LyGen_model(input_shape, vocab_size):
    X_input = Input(shape=input_shape)
    
    embedding_layer = pretrained_embedding(word_to_vec_map, word_to_ind, seq_length)
    embeddings = embedding_layer(X_input)
    X = LSTM(128, return_sequences=True)(embeddings)
    X = Dropout(0.5)(X)
    X = LSTM(128, return_sequences=False)(X)
    X = Dense(vocab_size, activation = 'softmax')(X)
    
    model = Model(inputs=X_input, outputs=X)
    
    return model


model = LyGen_model((seq_length,), vocab_size)
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model.fit(X, Y, epochs=50, batch_size=32)    



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 9)                 0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 9, 50)             20000050  
_________________________________________________________________
lstm_3 (LSTM)                (None, 9, 128)            91648     
_________________________________________________________________
dropout_2 (Dropout)          (None, 9, 128)            0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_2 (Dense)              (None, 400000)            51600000  
Total params: 71,823,282
Trainable params: 51,823,232
Non-trainable params: 20,000,050
_______________________________________________________

Epoch 1/50




Epoch 2/50




Epoch 3/50




Epoch 4/50




Epoch 5/50




Epoch 6/50




Epoch 7/50




Epoch 8/50




Epoch 9/50




Epoch 10/50




Epoch 11/50




Epoch 12/50




Epoch 13/50




Epoch 14/50




Epoch 15/50




Epoch 16/50




Epoch 17/50




Epoch 18/50




Epoch 19/50




Epoch 20/50




Epoch 21/50




Epoch 22/50




Epoch 23/50




Epoch 24/50




Epoch 25/50




Epoch 26/50




Epoch 27/50




Epoch 28/50




Epoch 29/50




Epoch 30/50




Epoch 31/50




Epoch 32/50




Epoch 33/50




Epoch 34/50




Epoch 35/50




Epoch 36/50




Epoch 37/50




Epoch 38/50




Epoch 39/50




Epoch 40/50




Epoch 41/50




Epoch 42/50




Epoch 43/50




Epoch 44/50




Epoch 45/50




Epoch 46/50




Epoch 47/50




Epoch 48/50




Epoch 49/50




Epoch 50/50




<keras.callbacks.History at 0x21155160>

In [None]:
"""model=Sequential()
model.add(Embedding(vocab_size,64,input_length=seq_length))
model.add(LSTM(128,return_sequences=True))
model.add(LSTM(128,return_sequences=False))
model.add(Dense(vocab_size, activation='softmax'))
model.summary()

model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
model.fit(X,Y,batch_size=32,epochs=100)"""

In [98]:
def load_doc(filename):
    f=open(filename,'r')
    text=f.read()
    f.close()
    return text

In [99]:
filename='data/clean_sequences.txt'
doc=load_doc(filename)
lines=doc.split('\n')

In [103]:
from random import randint
seed_text=lines[randint(0,len(lines))]
print(seed_text)

and they cut deep but it's our world it's just us two see painkillers on the kitchen counter hate to see it all hurt so bad but maybe wouldn't have worked as hard if you were healthy and it weren't so bad uh maybe should walk up the street and try


In [104]:
"""def texts_to_sequences(texts,word_to_index):
    indexes = np.zeros((1, len(texts)), dtype=int)
    for idx, text in enumerate(texts):
        indexes[:,idx] = word_to_index[text]
    return indexes"""



In [105]:
"""indexes=texts_to_sequences(seed_text.split(),word_to_index)
print(indexes)"""

NameError: name 'word_to_index' is not defined

In [106]:
indexes = seq_to_indexes(seed_text.split(), word_to_ind)

IndexError: tuple index out of range

In [117]:
from keras.preprocessing.sequence import pad_sequences
def seq_generator(model,word_to_ind,seq_length,seed_text,n_words):
    result = list()
    input_text = seed_text
    for _ in range(n_words):
        encoded = texts_to_sequences(input_text.split(),word_to_ind)
        encoded = pad_sequences([encoded],maxlen=seq_length, padding='pre', truncating='pre')
        yhat = model.predict_classes(encoded, verbose = 0)
        out_word = ind_to_word[yhat]
    
        input_text += ' ' + out_word
        result.append(out_word)
    return result
    
    
    
    

In [118]:
generated = seq_generator(model, word_to_ind, seq_length, seed_text, 50)
print(generated)

NameError: name 'generate_seq' is not defined