In [None]:
import string
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow.keras.utils as ku
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
def create_lyrics_corpus(data_path):
    """Load lyrics from an Excel workbook and return them as cleaned lines.

    Every non-missing cell of the ``Lyrics`` column is split on newlines and
    processed independently, so the last line of one cell is never fused with
    the first line of the next. For each line:

    * lines containing ``[`` are dropped (presumably section markers such as
      ``[Chorus]`` -- confirm against the data),
    * all characters in ``string.punctuation`` are removed,
    * trailing whitespace is stripped,
    * lines that end up empty are discarded.

    Args:
        data_path: Path to an ``.xlsx`` workbook containing a ``Lyrics`` column.

    Returns:
        list[str]: The cleaned, non-empty lyric lines in their original order.
    """
    data = pd.read_excel(data_path, engine='openpyxl', dtype=str)
    songs = data["Lyrics"].dropna()

    # A translation table removes punctuation without going through a regex,
    # so the result does not depend on pandas' `str.replace` regex default.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    corpus = []
    for song in songs:
        for raw_line in song.split('\n'):
            # Filter while building the result instead of removing items from
            # a list that is being iterated, which would skip neighbours.
            if '[' in raw_line:
                continue
            cleaned = raw_line.translate(strip_punctuation).rstrip()
            if cleaned:
                corpus.append(cleaned)

    return corpus

In [None]:
# Vocabulary size cap used for the tokenizer and the model's layer sizes.
NUM_WORDS = 5000

# Location of the lyrics workbook (absolute, machine-specific path).
LYRICS_PATH = "/home/mxm1186/Projects/Eminem/Eminem_Lyrics.xlsx"
lyrics = create_lyrics_corpus(LYRICS_PATH)

In [None]:
# Fit the vocabulary on the corpus, capped at NUM_WORDS.
tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(lyrics)

# Each encoded line contributes every prefix of length >= 2 as one training
# example: the prefix's last token is the target, the rest is the context.
sequences = [
    encoded_line[:end]
    for encoded_line in tokenizer.texts_to_sequences(lyrics)
    for end in range(2, len(encoded_line) + 1)
]

# Left-pad all examples to the length of the longest one.
max_sequence_len = max(map(len, sequences))
sequences = np.array(
    pad_sequences(sequences, maxlen=max_sequence_len, padding='pre')
)

# The final column is the word to predict; everything before it is the input.
input_sequences = sequences[:, :-1]
labels = sequences[:, -1]

# One-hot encode the targets over NUM_WORDS classes.
one_hot_labels = ku.to_categorical(labels, num_classes=NUM_WORDS)

In [None]:
# Create the model: embedding -> bidirectional LSTM -> dropout -> LSTM ->
# L2-regularised dense layer -> softmax over the NUM_WORDS classes.
model = Sequential()
# Inputs are the padded sequences minus their final (target) token.
model.add(Embedding(NUM_WORDS, 512, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(200, return_sequences = True)))
model.add(Dropout(0.2))
model.add(LSTM(100))
# `units` must be an integer: `NUM_WORDS/2` is a float under Python 3 true
# division, so use floor division instead.
model.add(Dense(NUM_WORDS // 2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(NUM_WORDS, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Checkpoint to `filepath` only when the monitored `accuracy` metric improves.
filepath="/home/mxm1186/Projects/Eminem/weights.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [None]:
# Train for 10 epochs, passing the callbacks collected in `callbacks_list`.
history = model.fit(
    input_sequences,
    one_hot_labels,
    epochs=10,
    verbose=1,
    callbacks=callbacks_list,
)

In [None]:
# Reload the weights stored at `filepath` into the model before generating text.
model.load_weights(filepath)

In [None]:
# Generate text greedily: always append the highest-probability next word.
seed_text = "slim shady is back"
next_words = 100

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # A single sequence is passed in, so take the first prediction row and
    # reduce it to a plain int class id instead of a shape-(1,) array.
    predicted = int(np.argmax(model.predict(token_list, verbose=0)[0]))
    # Direct index -> word lookup instead of scanning `word_index` each step.
    output_word = tokenizer.index_word.get(predicted, "")
    if not output_word:
        # The class id has no word. The seed would stay unchanged, so greedy
        # decoding would predict the same id on every remaining step; stop
        # rather than appending bare spaces.
        break
    seed_text += " " + output_word
print(seed_text)

In [None]:
# Generate text by sampling each next word from the predicted distribution.
seed_text = "slim shady is back"
next_words = 200

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # A single sequence is passed in, so keep only the first prediction row.
    predicted_probs = model.predict(token_list, verbose=0)[0]
    # Passing the population size samples from range(n) without building a list.
    predicted = int(np.random.choice(len(predicted_probs), p=predicted_probs))
    # Direct index -> word lookup instead of scanning `word_index` each step.
    output_word = tokenizer.index_word.get(predicted, "")
    # Skip class ids that have no word instead of appending stray spaces.
    if output_word:
        seed_text += " " + output_word
print(seed_text)