In [1]:
pip install langdetect



# Import necessary libraries

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import json
import tensorflow as tf
import tensorflow.keras as keras
!pip install scikeras
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import LSTM, Embedding, Dense, Dropout,Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
from keras.models import load_model
from langdetect import detect
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/AI/lyrics-data.csv')

In [4]:
data.head()

Unnamed: 0,ALink,SName,SLink,Lyric,language
0,/ivete-sangalo/,Arerê,/ivete-sangalo/arere.html,"Tudo o que eu quero nessa vida,\nToda vida, é\...",pt
1,/ivete-sangalo/,Se Eu Não Te Amasse Tanto Assim,/ivete-sangalo/se-eu-nao-te-amasse-tanto-assim...,Meu coração\nSem direção\nVoando só por voar\n...,pt
2,/ivete-sangalo/,Céu da Boca,/ivete-sangalo/chupa-toda.html,É de babaixá!\nÉ de balacubaca!\nÉ de babaixá!...,pt
3,/ivete-sangalo/,Quando A Chuva Passar,/ivete-sangalo/quando-a-chuva-passar.html,Quando a chuva passar\n\nPra quê falar\nSe voc...,pt
4,/ivete-sangalo/,Sorte Grande,/ivete-sangalo/sorte-grande.html,A minha sorte grande foi você cair do céu\nMin...,pt


In [5]:
#Printing the size of dataset
print("Size of Dataset:",data.shape)

Size of Dataset: (379931, 5)


# Filter only English lyrics

In [6]:
# Filter only English lyrics
data = data[data['language'] == 'en']
data

Unnamed: 0,ALink,SName,SLink,Lyric,language
69,/ivete-sangalo/,Careless Whisper,/ivete-sangalo/careless-whisper.html,I feel so unsure\nAs I take your hand and lead...,en
86,/ivete-sangalo/,Could You Be Loved / Citação Musical do Rap: S...,/ivete-sangalo/could-you-be-loved-citacao-musi...,"Don't let them fool, ya\nOr even try to school...",en
88,/ivete-sangalo/,Cruisin' (Part. Saulo),/ivete-sangalo/cruisin-part-saulo.html,"Baby, let's cruise, away from here\nDon't be c...",en
111,/ivete-sangalo/,Easy,/ivete-sangalo/easy.html,"Know it sounds funny\nBut, I just can't stand ...",en
140,/ivete-sangalo/,For Your Babies (The Voice cover),/ivete-sangalo/for-your-babies-the-voice-cover...,You've got that look again\nThe one I hoped I ...,en
...,...,...,...,...,...
379926,/clegg-johnny/,The Waiting,/clegg-johnny/the-waiting.html,Chorus\nHere we stand waiting on the plain\nDa...,en
379927,/clegg-johnny/,Too Early For The Sky,/clegg-johnny/too-early-for-the-sky.html,I nearly disappeared into the mouth of a croco...,en
379928,/clegg-johnny/,Warsaw 1943 (I Never Betrayed The Revolution),/clegg-johnny/warsaw-1943-i-never-betrayed-the...,"Amambuka, amambuka azothengisa izwe lakithi, i...",en
379929,/clegg-johnny/,When The System Has Fallen,/clegg-johnny/when-the-system-has-fallen.html,Sweat in the heat for days on end\nwaiting for...,en


# Dropping all columns except the lyrics colunm

In [7]:
drop_features = ['ALink', 'SName', 'SLink', 'language']
data.drop(drop_features, axis = 1, inplace = True)
data

Unnamed: 0,Lyric
69,I feel so unsure\nAs I take your hand and lead...
86,"Don't let them fool, ya\nOr even try to school..."
88,"Baby, let's cruise, away from here\nDon't be c..."
111,"Know it sounds funny\nBut, I just can't stand ..."
140,You've got that look again\nThe one I hoped I ...
...,...
379926,Chorus\nHere we stand waiting on the plain\nDa...
379927,I nearly disappeared into the mouth of a croco...
379928,"Amambuka, amambuka azothengisa izwe lakithi, i..."
379929,Sweat in the heat for days on end\nwaiting for...


In [8]:
data.head()

Unnamed: 0,Lyric
69,I feel so unsure\nAs I take your hand and lead...
86,"Don't let them fool, ya\nOr even try to school..."
88,"Baby, let's cruise, away from here\nDon't be c..."
111,"Know it sounds funny\nBut, I just can't stand ..."
140,You've got that look again\nThe one I hoped I ...


In [9]:
# Limiting the 'data' variable to the first 200 elements
data = data[:200]


In [10]:
# shape
data.shape

(200, 1)

# Tokenization

In [11]:
# Create a Tokenizer object
tokenizer = Tokenizer()

# Fit the tokenizer on the text data in the 'Lyric' column, converting to lowercase
tokenizer.fit_on_texts(data['Lyric'].astype(str).str.lower())

# Get the total number of unique words in the dataset + 1 (to account for padding)
total_words = len(tokenizer.word_index) + 1

# Convert the text in the 'Lyric' column to sequences of corresponding tokens
tokenized_sentences = tokenizer.texts_to_sequences(data['Lyric'].astype(str))

# Display the tokenized representation of the first sentence in the dataset
tokenized_sentences[0]


[2,
 59,
 20,
 2513,
 144,
 2,
 67,
 14,
 313,
 7,
 928,
 1,
 5,
 3,
 197,
 234,
 144,
 3,
 534,
 2514,
 257,
 13,
 14,
 171,
 1288,
 5,
 117,
 10,
 2515,
 1134,
 7,
 19,
 431,
 860,
 2516,
 11,
 48,
 139,
 197,
 192,
 1289,
 660,
 109,
 32,
 30,
 861,
 510,
 34,
 400,
 5,
 694,
 2,
 25,
 39,
 61,
 10,
 432,
 1290,
 740,
 258,
 175,
 5,
 1135,
 10,
 368,
 7,
 627,
 3,
 929,
 15,
 97,
 73,
 806,
 20,
 11,
 48,
 139,
 197,
 192,
 3,
 69,
 2,
 1136,
 33,
 1,
 52,
 37,
 48,
 2517,
 3,
 2518,
 2519,
 28,
 10,
 82,
 368,
 5,
 3,
 154,
 7,
 117,
 2520,
 29,
 557,
 213,
 30,
 1291,
 13,
 3,
 695,
 369,
 29,
 19,
 140,
 158,
 11,
 48,
 139,
 197,
 192,
 1289,
 660,
 109,
 32,
 30,
 861,
 510,
 34,
 400,
 5,
 694,
 2,
 25,
 39,
 61,
 10,
 432,
 1290,
 740,
 258,
 175,
 5,
 1135,
 10,
 368,
 7,
 627,
 23,
 929,
 15,
 97,
 73,
 806,
 20,
 11,
 48,
 139,
 197,
 192,
 3,
 69,
 2,
 1136,
 33,
 1,
 48,
 239,
 14,
 12,
 79,
 3,
 534,
 558,
 20,
 930,
 2,
 445,
 15,
 27,
 120,
 343,
 23,
 2521,
 535,
 3

In [12]:
# Retrieve the word index mapping from the tokenizer
word_index = tokenizer.word_index

# Display or further use the obtained word index
word_index


{'you': 1,
 'i': 2,
 'the': 3,
 'me': 4,
 'to': 5,
 'my': 6,
 'and': 7,
 'it': 8,
 'on': 9,
 'a': 10,
 "i'm": 11,
 'love': 12,
 'in': 13,
 'your': 14,
 'that': 15,
 'be': 16,
 'oh': 17,
 'up': 18,
 'all': 19,
 'so': 20,
 "don't": 21,
 'like': 22,
 'this': 23,
 'baby': 24,
 'know': 25,
 'for': 26,
 'we': 27,
 'of': 28,
 'is': 29,
 'no': 30,
 'go': 31,
 'got': 32,
 'with': 33,
 "it's": 34,
 'but': 35,
 'if': 36,
 'can': 37,
 'let': 38,
 "you're": 39,
 'when': 40,
 'now': 41,
 'just': 42,
 'see': 43,
 'get': 44,
 'one': 45,
 'out': 46,
 "ain't": 47,
 'never': 48,
 'do': 49,
 'want': 50,
 'what': 51,
 'time': 52,
 'wanna': 53,
 'say': 54,
 'he': 55,
 'back': 56,
 'right': 57,
 'put': 58,
 'feel': 59,
 'hey': 60,
 'not': 61,
 'they': 62,
 'uh': 63,
 'girl': 64,
 'think': 65,
 'top': 66,
 'take': 67,
 'need': 68,
 'way': 69,
 'body': 70,
 "can't": 71,
 'come': 72,
 'been': 73,
 'down': 74,
 'make': 75,
 'how': 76,
 'boy': 77,
 'halo': 78,
 'tonight': 79,
 'was': 80,
 'yeah': 81,
 'good': 82,

#Data Preprocessing

In [13]:
input_sequences = list()
for i in tokenized_sentences:
    for t in range(1, len(i)):
        n_gram_sequence = i[:t+1]
        input_sequences.append(n_gram_sequence)
# Pre padding
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Feature Engineering

In [14]:
# Feature Engineering: One-hot encoding for categorical crossentropy loss
X, labels = input_sequences[:,:-1],input_sequences[:,-1]
y = tf.keras.utils.to_categorical(labels, num_classes=total_words)

# Training the Model

In [None]:
# Create the model
model = Sequential()

# Embedding layer to represent words as vectors
model.add(Embedding(input_dim=total_words, output_dim= 50, input_length=max_sequence_len-1))

# Bidirectional LSTM layer for capturing context from both directions
model.add(Bidirectional(LSTM(100)))

# Dropout layer to prevent overfitting.
model.add(Dropout(0.1))

# Dense layer with softmax activation for output probabilities
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define early stopping to prevent overfitting.
earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=3, verbose=0, mode='auto')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=256, verbose=1, callbacks=[earlystop], validation_data=(X_test, y_test))


Epoch 1/10
Epoch 2/10

# Model Evaluation

In [None]:
# Evaluate the model on the test set
evaluation = model.evaluate(X_test, y_test)

# Extract relevant metrics
loss, accuracy = evaluation[0], evaluation[1]

# Print the evaluation results
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

# Plot training loss
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Plot training accuracy
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Training the Model using GridSearchCV

In [None]:
# Function to create the model
def create_model(embedding_dim=50, lstm_units=50, dropout_rate=0.1, dense_units1=256, dense_units2=128):
    model = Sequential()
    model.add(Embedding(input_dim=total_words, output_dim=embedding_dim, input_length=max_sequence_len-1))
    model.add(Bidirectional(LSTM(lstm_units)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Create a KerasClassifier with the create_model function
keras_classifier = KerasClassifier(build_fn=create_model, epochs=10, batch_size=128, verbose=1)

# Define the hyperparameter grid
param_grid = {
    'model__embedding_dim': [50],
    'model__lstm_units': [50],
    'model__dropout_rate': [0.1],
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=keras_classifier, param_grid=param_grid, cv=3)

# Fit the model to the data
grid_search_result = grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best Parameters:", grid_search_result.best_params_)

# Get the best model
best_model = grid_search_result.best_estimator_.model

# Evaluate the best model on the test set
test_evaluation = best_model.evaluate(X_test, y_test)

# Extract relevant metrics
test_loss, test_accuracy = test_evaluation[0], test_evaluation[1]

# Print the test set performance
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')


# Function to generate lyrics

In [None]:
# Function to generate lyrics
def generate_lyrics_to_continue(seed_text, next_words):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)[0]
        predicted_index = np.argmax(predicted_probs)

        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# Example usage of the generate_lyrics function
generated_lyrics = generate_lyrics_to_continue("Sing to me", 20)
generated_lyrics

#Saving the Model and the Tokenizer

In [None]:
# Save model
model.save('song_lyrics_generator.h5')

In [None]:
# Save tokenizer to a file
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))