In [1]:
from os import listdir

In [2]:
artist = "Imagine Dragons"
dataPath = f"./Dataset/{artist}/"
lyrics = ""

for song in listdir(dataPath):
    with open(dataPath + song, mode = "r") as songFile:
        lyrics += songFile.read().lower()
        
lyrics = lyrics.split("\n")
print(f"Number of sentences: {len(lyrics)}")

Number of sentences: 3490


In [3]:
import tensorflow
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lyrics)               # Generate the word index dictionary
totalWords = len(tokenizer.word_index) + 1   # With padding which is 0

print(f"Total num words: {totalWords}")
print("=====")
print("Word Index Dictionary")
print(tokenizer.word_index)

Total num words: 1982
=====
Word Index Dictionary
{'the': 1, 'i': 2, 'you': 3, 'to': 4, 'me': 5, 'a': 6, 'and': 7, 'my': 8, 'it': 9, "i'm": 10, 'of': 11, 'in': 12, 'we': 13, 'oh': 14, 'all': 15, 'that': 16, "it's": 17, 'la': 18, 'for': 19, 'is': 20, 'up': 21, 'your': 22, "don't": 23, 'just': 24, 'love': 25, 'be': 26, 'hey': 27, 'on': 28, 'this': 29, 'can': 30, 'know': 31, 'but': 32, 'are': 33, 'out': 34, 'down': 35, 'let': 36, 'so': 37, 'been': 38, 'what': 39, 'with': 40, 'never': 41, 'get': 42, "i've": 43, 'got': 44, 'now': 45, 'go': 46, 'life': 47, 'come': 48, 'when': 49, 'take': 50, "i'll": 51, 'not': 52, 'do': 53, 'from': 54, 'time': 55, 'if': 56, 'like': 57, 'no': 58, 'make': 59, 'everything': 60, 'one': 61, "you're": 62, 'wanna': 63, 'tell': 64, 'see': 65, 'look': 66, 'will': 67, 'want': 68, 'over': 69, 'at': 70, 'where': 71, 'was': 72, 'feel': 73, "'cause": 74, 'they': 75, 'back': 76, 'ready': 77, 'eyes': 78, 'way': 79, 'am': 80, 'believe': 81, 'away': 82, 'mind': 83, 'could': 8

In [5]:
input_sequences = []

for line in lyrics:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)
        
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen = max_sequence_len, padding = "pre"))
xs, labels = input_sequences[:,:-1] , input_sequences[:,-1]
ys = tensorflow.keras.utils.to_categorical(labels, num_classes = totalWords)

In [7]:
sentence = lyrics[0].split(" ")
print(f"Sample sentence: {sentence}")

token_list = []
for word in sentence: 
    token_list.append(tokenizer.word_index[word])

print(f"Token list: {token_list}")

print(f"Padded token list: {xs[0]}")
print(f"Decoded token list to text: {tokenizer.sequences_to_texts([xs[0]])}")

print(f"One-hot label: {ys[0]}")
print(f"Index of label: {np.argmax(ys[0])}")

Sample sentence: ['good', 'morning', 'my', 'love']
[324, 365, 8, 25]
