In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import string, os
import tensorflow as tf

# keras module for building LSTM
from keras.utils import pad_sequences
from tensorflow.keras.layers import Embedding, Dropout, LSTM, Dense, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the full lyrics dataset from the CSV file in the working directory.
df = pd.read_csv('lyrics-data.csv')

# Boolean mask over the rows: True where the artist link column ('ALink')
# equals Taylor Swift's link.
is_artist_present = df['ALink'].eq('/taylor-swift/')

# Report whether at least one row belongs to that artist.
print(
    "ArtistName is present in the DataFrame"
    if is_artist_present.any()
    else "ArtistName is not present in the DataFrame"
)

ArtistName is present in the DataFrame


In [3]:
# Training on the whole dataset takes too long, so only a small portion is
# kept: the English-language songs of a single artist (Taylor Swift).
is_english = df['language'] == 'en'
is_taylor_swift = df['ALink'] == '/taylor-swift/'

# Keep the matching rows and discard the link/title columns, which are not
# needed for training.
df = df[is_english & is_taylor_swift].drop(columns=['ALink', 'SName', 'SLink'])
display(df)

Unnamed: 0,Lyric,language
30942,We could leave the Christmas lights up 'til Ja...,en
30943,"Nice to meet you, where you been?\nI could sho...",en
30944,"Vintage tee, brand new phone\nHigh heels on co...",en
30945,I stay out too late\nGot nothing in my brain\n...,en
30946,I walked through the door with you\nThe air wa...,en
...,...,...
31324,You are somebody that I don't know\nBut you're...,en
31325,All this time I was wasting hoping you would c...,en
31326,All this time I was wasting hoping you would c...,en
31327,"Bet you lie awake at night,\nTrying to make up...",en


In [14]:
# Tokenization, training-sequence construction and model definition.

# Seed Python, NumPy and TensorFlow so that weight initialisation (and the
# training run that follows) is repeatable after a kernel restart.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Normalise the lyrics once so the tokenizer is fitted on, and applied to,
# exactly the same text.
lyrics = df['Lyric'].astype(str).str.lower()

# With its default settings the Keras Tokenizer lower-cases the text, strips
# punctuation characters (digits are NOT removed) and splits on whitespace.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lyrics)

# word_index is 1-based; index 0 is reserved (it is the padding value used
# below), so the vocabulary size is one more than the number of known words.
total_words = len(tokenizer.word_index) + 1

# Every song becomes a list of integer word ids.
tokenized_sentences = tokenizer.texts_to_sequences(lyrics)

# Expand each song into all of its prefixes of length >= 2: the first two
# words, the first three words, ... up to the whole song. The last id of each
# prefix is the word to predict from the ids before it.
input_sequences = [
    song_tokens[:end]
    for song_tokens in tokenized_sentences
    for end in range(2, len(song_tokens) + 1)
]
if not input_sequences:
    raise ValueError(
        "No training sequences could be built: every lyric has fewer than two tokens."
    )

# The prefixes have different lengths, so left-pad them with zeros up to the
# length of the longest one. pad_sequences already returns a NumPy array.
# NOTE(review): prefixes are built over whole songs, so this length is the
# token count of the longest song; building them per lyric line would give far
# shorter sequences — consider if training time is a problem.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Split into inputs (everything but the last id) and targets (the last id).
X, labels = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the targets: one class per vocabulary entry.
# NOTE(review): this is a dense (n_samples, total_words) matrix; integer labels
# with a sparse categorical loss would need far less memory, but the loss
# passed to model.compile would have to change accordingly.
y = tf.keras.utils.to_categorical(labels, num_classes=total_words)

# Model: embedding -> bidirectional LSTM -> dropout -> softmax over the vocabulary.
model = Sequential()
# total_words: vocabulary size; 40: embedding dimension;
# input_length: every input row holds max_sequence_len - 1 ids (the last id
# of each padded sequence is the label).
model.add(Embedding(total_words, 40, input_length=max_sequence_len - 1))
# 250 is the number of LSTM units per direction (the Bidirectional wrapper
# concatenates both directions). The author picked 250 as the average number
# of words in a song. An LSTM is used rather than a simple RNN because simple
# RNNs suffer from vanishing gradients and earlier words must be remembered
# to predict the next one.
model.add(Bidirectional(LSTM(250)))
model.add(Dropout(0.1))  # light regularisation against overfitting
model.add(Dense(total_words, activation='softmax'))

969


In [15]:
# Print a layer-by-layer summary of the model (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 968, 40)           191480    
                                                                 
 bidirectional (Bidirection  (None, 500)               582000    
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 500)               0         
                                                                 
 dense (Dense)               (None, 4787)              2398287   
                                                                 
Total params: 3171767 (12.10 MB)
Trainable params: 3171767 (12.10 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [16]:
# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop when the *validation* loss has not improved for 3 consecutive epochs and
# roll back to the best weights seen. Monitoring the training loss (as before)
# cannot detect overfitting, because training loss keeps falling while the
# model memorises the data.
earlystop = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)

# validation_split=0.3 holds out 30% of the samples for validation.
# NOTE(review): per the Keras docs the held-out part is taken from the end of
# X/y before shuffling, so validation consists of the last songs' sequences.
history = model.fit(
    X,
    y,
    batch_size=32,
    epochs=20,
    callbacks=[earlystop],
    validation_split=0.3,
)

Epoch 1/20
   4/2750 [..............................] - ETA: 10:33:04 - loss: 8.4710 - accuracy: 0.0234 

KeyboardInterrupt: 

In [7]:
# Intended checkpoint step: save the model before continuing the code, in case
# we run out of memory.
# NOTE(review): this cell contains no model.save(...) call — it only imports
# load_model, which is used further down to read a model from an .h5 file.
from tensorflow.keras.models import load_model


In [None]:
# Plot training vs. validation loss per epoch.
# The history is stored under its own name: rebinding `df` here would clobber
# the lyrics DataFrame that the tokenisation cell reads, breaking any re-run.
history_df = pd.DataFrame(history.history)
ax = history_df[['loss', 'val_loss']].plot(title='Training vs. validation loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss');

In [11]:
# Load a saved Keras model from an HDF5 file in the working directory.
# NOTE(review): no cell visible in this notebook writes this file — presumably
# it comes from an earlier run; confirm it was trained with the same tokenizer
# vocabulary that is used for generation below.
model2 = load_model('song_lyrics_generator_Not_One.h5')

In [12]:
def complete_this_song(input_text, next_words):
    """Continue a song by repeatedly predicting the next word with ``model2``.

    Parameters
    ----------
    input_text : str
        The seed text; the generated words are appended to it.
    next_words : int
        How many words to predict and append.

    Returns
    -------
    str
        ``input_text`` followed by ``next_words`` space-separated predictions.
        A prediction that has no entry in the tokenizer vocabulary (e.g. the
        padding index 0) contributes an empty word.

    Notes
    -----
    Relies on the notebook-level ``tokenizer`` and ``model2``.
    NOTE(review): ``model2`` is loaded from disk while ``tokenizer`` is fitted
    in this notebook; the word ids only make sense if both share the same
    vocabulary — confirm.
    """
    # Pad to the sequence length the loaded model declares for its input
    # instead of a hard-coded number; fall back to the previously hard-coded
    # 1139 if the model declares a variable-length input.
    model_input_len = model2.input_shape[1] or 1139

    # `_` is a throwaway loop variable: only the number of iterations matters.
    for _ in range(next_words):
        # Apply the same preprocessing as at training time: words -> ids,
        # then left-pad with zeros to the model's input length.
        token_list = tokenizer.texts_to_sequences([input_text])[0]
        token_list = pad_sequences([token_list], maxlen=model_input_len, padding='pre')

        # verbose=0 avoids printing one progress bar per generated word.
        probabilities = model2.predict(token_list, verbose=0)
        # One row in, one row out: take the most probable id as a plain int.
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # Map the id back to its word via the tokenizer's reverse index
        # (constant-time lookup instead of scanning word_index every step).
        output_word = tokenizer.index_word.get(predicted, "")
        input_text += " " + output_word
    return input_text

In [13]:
# Generate 50 additional words starting from the given seed text.
# NOTE(review): "Pleave" looks like a typo for "Please" — confirm before changing,
# since the seed text affects the generated output.
complete_this_song("Pleave leave me", 50)



"Pleave leave me all the an ah i see girl you 'bout say to all the an ah i see girl me ooh see all the an ah you see girl know oh i belong in hang to all you come out turn will hate could will belong in hang to all you"