In [116]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, GRU
from tensorflow.keras.optimizers import Adam

In [134]:
import string
from datetime import datetime

def tokenize_corpus(corpus, num_words=-1):
  # Fit a Tokenizer on the corpus
  
  if num_words > -1:
    tokenizer = Tokenizer(num_words=num_words)
  else:
    tokenizer = Tokenizer()
  tokenizer.fit_on_texts(corpus)
  return tokenizer

def create_lyrics_corpus(dataset, field):
  # Remove all other punctuation
  dataset[field] = dataset[field].str.replace('[{}]'.format(string.punctuation), '')
  # Make it lowercase
  dataset[field] = dataset[field].str.lower()
  # Make it one long string to split by line
  lyrics = dataset[field].str.cat()
  corpus = lyrics.split('\n')
  # Remove any trailing whitespace
  for l in range(len(corpus)):
    corpus[l] = corpus[l].rstrip()
  # Remove any empty lines
  corpus = [l for l in corpus if l != '']

  return corpus

def get_time():
  # Get the current date and time
  current_datetime = datetime.now()

  formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H%M")

  return formatted_datetime

# Selecting 100 hip hop songs randomly

In [99]:
import pandas as pd
all_songs = pd.read_csv("SongsData_hiphop.csv")
random_songs_df = all_songs.sample(n=100, random_state=42)
random_songs_df['Artist'].value_counts()

Artist
Travis-Scott      28
Kendrick-Lamar    21
Eminem            17
J-Cole            16
Snoop-Dogg        16
Maroon-5           1
Ed-Sheeran         1
Name: count, dtype: int64

# Tokeniziation and BoW

In [100]:
corpus = create_lyrics_corpus(random_songs_df,"Lyrics")
tokenizer = tokenize_corpus(corpus)
total_words = len(tokenizer.word_index)+1
print(total_words)

2934


# Generating Sequences to fit in Model

In [101]:
sequences = []
for line in corpus:
	token_list = tokenizer.texts_to_sequences([line])[0]
	for i in range(1, len(token_list)):
		n_gram_sequence = token_list[:i+1]
		sequences.append(n_gram_sequence)

In [102]:
import numpy as np
max_pad_len = max([len(seq) for seq in sequences])
print(max_pad_len)
padded_sequences = np.array(pad_sequences(sequences,maxlen=max_pad_len,truncating='post'))

24


In [103]:
# Split sequences between the "input" sequence and "output" predicted word
input_sequences, labels = padded_sequences[:,:-1], padded_sequences[:,-1]

# One-hot encode the labels
one_hot_labels = tf.keras.utils.to_categorical(labels, num_classes=total_words)

# Model Architecture and Training

In [111]:
model = Sequential()
model.add(Embedding(total_words, 200, input_length=max_pad_len-1))
model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])
history = model.fit(input_sequences, one_hot_labels, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Saving and Reloading

In [139]:
reload_model = load_model("hiphop_model_2023-12-03_0352.h5")

In [140]:
history = reload_model.fit(input_sequences, one_hot_labels, epochs=2, verbose=1)

Epoch 1/2
Epoch 2/2


In [141]:
model_name = "hiphop_model_" + get_time() + ".h5"
reload_model.save(model_name)
print(model_name)

hiphop_model_2023-12-03_0357.h5


In [144]:
# Use this process for the full output generation
seed_text = "there will be blood"
next_words = 30
  
for _ in range(next_words):
  token_list = tokenizer.texts_to_sequences([seed_text])[0]
  token_list = pad_sequences([token_list], maxlen=max_pad_len-1, padding='post')
  predicted_probs = reload_model.predict(token_list)[0]
  predicted = np.random.choice([x for x in range(len(predicted_probs))],
                               p=predicted_probs)
  output_word = ""
  for word, index in tokenizer.word_index.items():
    if index == predicted:
      output_word = word
      break
  seed_text += " " + output_word
print(seed_text)

there will be blood slick thousand 'til hours you cowards hours handled life how swag swag swag flight under roll both dusk swag and worse provide function i feel caved it cole boobs boobs
