In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import string, os
import tensorflow as tf

# keras module for building LSTM
from keras.utils import pad_sequences
from tensorflow.keras.layers import Embedding, Dropout, LSTM, Dense, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt

In [24]:
# Load the lyrics dataset from the CSV file.
df = pd.read_csv('lyrics-data.csv')

# Sanity check: flag the rows whose 'ALink' value is '/beatles/' ...
is_artist_present = df['ALink'].isin(['/beatles/'])

# ... and report whether at least one such row exists.
message = (
    "ArtistName is present in the DataFrame"
    if is_artist_present.any()
    else "ArtistName is not present in the DataFrame"
)
print(message)

ArtistName is not present in the DataFrame


In [25]:
# Keep only the English-language songs whose artist link is '/john-mayer/'.
is_english = df['language'] == 'en'
is_john_mayer = df['ALink'] == '/john-mayer/'

# Select the rows with a single .loc call and drop the link/title columns
# with a non-inplace drop: this produces a fresh frame instead of mutating a
# filtered slice in place (which can raise SettingWithCopyWarning).
df = df.loc[is_english & is_john_mayer].drop(columns=['ALink', 'SName', 'SLink'])
display(df)

Unnamed: 0,Lyric,language
17367,Gravity is working against me\nAnd gravity wan...,en
17368,It's not a silly little moment\nIt's not the s...,en
17369,Your love is bright as ever\nEven in the shado...,en
17370,Lightning strikes\nInside my chest to keep me ...,en
17371,I'm the boy in your other phone\nLighting up i...,en
...,...,...
17513,"River's strong, you can't swim inside it\nWe c...",en
17514,Only a nascent trying to harness huge fire\nOu...,en
17515,It's been so long\nsince I've seen you.\nAnd I...,en
17516,A great Big Bang and dinosaurs\nFiery raining ...,en


In [26]:
# Tokenization
# With its default settings the Keras Tokenizer lower-cases the text, strips
# punctuation characters (digits are kept) and splits the text into words.
tokenizer = Tokenizer()

# Normalise the lyrics once, so that fitting the vocabulary and encoding the
# songs both operate on exactly the same text.
lyrics = df['Lyric'].astype(str).str.lower()
tokenizer.fit_on_texts(lyrics)

# word_index numbers the vocabulary starting at 1; index 0 stays free for the
# padding value, hence the one extra slot.
total_words = len(tokenizer.word_index)+1
print(total_words)

# Replace every word of every song by its integer id from the fitted vocabulary.
tokenized_sentences = tokenizer.texts_to_sequences(lyrics)
# Expand every tokenized song into all of its prefixes of length >= 2:
# the first 2 tokens, the first 3 tokens, ... up to the whole song.
input_sequences = [
    song_tokens[:end]
    for song_tokens in tokenized_sentences
    for end in range(2, len(song_tokens) + 1)
]

# The prefixes have different lengths, so they are left-padded with zeros
# up to the length of the longest one.
max_sequence_len = max(map(len, input_sequences))
print(max_sequence_len)
padded_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
input_sequences = np.array(padded_sequences)

# Split each padded row: everything except the final token is the model
# input (X); the final token is the word to predict (labels).
X = input_sequences[:, :-1]
labels = input_sequences[:, -1]
print(X, labels)

# One-hot encode the targets; there is one class per vocabulary slot
# (total_words of them).
y = tf.keras.utils.to_categorical(labels, num_classes=total_words)
print(y)
# Build the next-word prediction model as an explicit stack of layers.
model = Sequential([
    # Map each of the total_words token ids to a 40-dimensional vector.
    # Every input row holds max_sequence_len-1 tokens, because the last
    # token of each padded sequence is the label to predict.
    Embedding(total_words, 40, input_length=max_sequence_len-1),
    # Bidirectional LSTM with 250 units per direction. The original note says
    # 250 was picked to match the average number of words in a song; it sets
    # the layer width, not the number of time steps. An LSTM is used rather
    # than a simple RNN because it copes better with vanishing gradients and
    # has to remember the earlier words in order to predict the next one.
    Bidirectional(LSTM(250)),
    # Light dropout to reduce overfitting.
    Dropout(0.1),
    # Softmax over the vocabulary: one probability per candidate next word.
    Dense(total_words, activation='softmax'),
])

2546
565
[[  0   0   0 ...   0   0 574]
 [  0   0   0 ...   0 574  15]
 [  0   0   0 ... 574  15 389]
 ...
 [  0   0   0 ...  42 293 135]
 [  0   0   0 ... 293 135 731]
 [  0   0   0 ... 135 731   3]] [ 15 389 654 ... 731   3  43]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [27]:
def complete_this_song(input_text, next_words, model):
    """Greedily extend ``input_text`` with ``next_words`` predicted words.

    The text is encoded exactly as during training: it is tokenized with the
    notebook-level ``tokenizer`` and left-padded (or truncated) to
    ``max_sequence_len - 1`` tokens. After every prediction the chosen word
    is appended and the grown text is fed back into the model.

    Args:
        input_text: Seed text to continue.
        next_words: Number of words to generate.
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            per-class scores for each input row.

    Returns:
        ``input_text`` followed by the generated words, separated by single
        spaces. If the predicted index has no vocabulary entry (for example
        the padding index 0), an empty string is appended for that step.
    """
    # Invert the vocabulary once instead of scanning it for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        # Encode the current text the same way the training data was encoded.
        token_list = tokenizer.texts_to_sequences([input_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # verbose=0 keeps predict() from printing a progress bar per word.
        scores = model.predict(token_list, verbose=0)
        # The batch holds a single row, so take its arg-max as a plain int.
        predicted = int(np.argmax(scores, axis=-1)[0])

        # Convert the predicted id back to its word and append it.
        output_word = index_to_word.get(predicted, "")
        input_text += " " + output_word
    return input_text

In [28]:
# Rebind `model` to the network stored on disk; from here on this replaces
# the model object constructed earlier in the notebook.
model_path = 'song_lyrics_generator_john_mayer.h5'
model = load_model(model_path)

In [29]:
# Generate a 50-word continuation of the seed line with the loaded model.
complete_this_song(
    input_text="I miss the nights with you",
    next_words=50,
    model=model,
)



"I miss the nights with you always done the same one i will be good to the level i tried the one but one is is the same mistake i used to make me if i ever get around to living i just just like you think i'm gonna be just from me and i want"