### Setup

In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import string
import os

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

2024-03-19 22:41:39.285916: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-19 22:41:42.435567: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-19 22:41:42.445284: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Get the dataset

In [2]:
songdata = pd.read_csv("/home/login/Documents/songdata.csv", dtype = str)[:250]
songdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  250 non-null    object
 1   song    250 non-null    object
 2   link    250 non-null    object
 3   text    250 non-null    object
dtypes: object(4)
memory usage: 7.9+ KB


### 250 songs


### Preprocessing

In [3]:
def tokenize_corpus(corpus, num_words = -1):
    if num_words > -1:
        tokenizer = Tokenizer(num_words = num_words)
    else:
        tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)
    return tokenizer

def create_lyrics_corpus(dataset, field):
    dataset[field] = dataset[field].str.replace("[{}]".format(string.punctuation), "")
    dataset[field] = dataset[field].str.lower()
    lyrics = dataset[field].str.cat()
    corpus = lyrics.split("\n")
    
    for _ in range(len(corpus)):
        corpus[_] = corpus[_].rstrip()
        
    corpus = [_ for _ in corpus if _ != ""]
    return corpus

In [4]:
corpus = create_lyrics_corpus(songdata, "text")
tokenizer = tokenize_corpus(corpus)

total_words = len(tokenizer.word_index) + 1
print(total_words)

4151


### Create sequences

In [5]:
sequences = []
i = 0
for line in corpus:
    tok_line = tokenizer.texts_to_sequences([line])[0]
    for _ in range(1, len(tok_line)):
        line_sequences = tok_line[:_+1]
        sequences.append(line_sequences)
        
maxLength = max([len(_) for _ in sequences])
padded_sequences = pad_sequences(sequences, maxlen = maxLength, padding = "pre")
print(maxLength)
print(padded_sequences[0, :])

20
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
 112  65]


### Create labels

In [6]:
inputs, labels = padded_sequences[:, :-1], padded_sequences[:, -1]
one_hot_labels = tf.keras.utils.to_categorical(labels, num_classes = total_words)
print(inputs[0])
print(labels[0])
print(one_hot_labels[0])
print(one_hot_labels[1])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
 112]
65
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]


### Train a Better Text Generation Model

In [7]:
embedding_dim = 64
model = Sequential([
    Embedding(total_words, embedding_dim, input_length = maxLength - 1),
    Bidirectional(LSTM(20)),
    Dense(total_words, activation = "softmax")
])
model.compile(loss = tf.keras.losses.CategoricalCrossentropy(),
             optimizer = tf.keras.optimizers.Adam(),
             metrics = ['accuracy'])

In [None]:
EPOCHS = 100
history = model.fit(inputs, one_hot_labels, epochs = EPOCHS, verbose = 1)

2024-03-19 22:42:18.186923: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 836675560 exceeds 10% of free system memory.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100

### Visualize the training

In [None]:
def plot_graphs(hist, string):
    plt.plot(hist.history[string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.show()
    
plot_graphs(history, "accuracy")

### Generate better Lyrics

In [None]:
seed_text = "I'm feeling chills"
next_words = 100

def generate_lyrics(seed_text, next_words):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences(token_list, maxlen = maxLength, padding = "pre")
        predict = model.predict(token_list)
        predicted = np.argmax(predict, axis = 1)
        output = ""
        
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output = word
                break
        seed_text += " " + output
    return seed_text

In [None]:
print(generate_lyrics(seed_text, next_words))
print(generate_lyrics("I am feeling chills", 20))
print(generate_lyrics("Let's go home", 50))