In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, GRU
from tensorflow.keras.optimizers import Adam

In [5]:
import string
from datetime import datetime

def tokenize_corpus(corpus, num_words=-1):
  # Fit a Tokenizer on the corpus
  
  if num_words > -1:
    tokenizer = Tokenizer(num_words=num_words)
  else:
    tokenizer = Tokenizer()
  tokenizer.fit_on_texts(corpus)
  return tokenizer

def create_lyrics_corpus(dataset, field):
  # Remove all other punctuation
  dataset[field] = dataset[field].str.replace('[{}]'.format(string.punctuation), '')
  # Make it lowercase
  dataset[field] = dataset[field].str.lower()
  # Make it one long string to split by line
  lyrics = dataset[field].str.cat()
  corpus = lyrics.split('\n')
  # Remove any trailing whitespace
  for l in range(len(corpus)):
    corpus[l] = corpus[l].rstrip()
  # Remove any empty lines
  corpus = [l for l in corpus if l != '']

  return corpus

def get_time():
  # Get the current date and time
  current_datetime = datetime.now()

  formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H%M")

  return formatted_datetime

# Selecting 100 hip hop songs randomly

In [6]:
import pandas as pd
all_songs = pd.read_csv("SongsData_hiphop.csv")
random_songs_df = all_songs.sample(n=100, random_state=42)
random_songs_df['Artist'].value_counts()

Artist
Travis-Scott      28
Kendrick-Lamar    21
Eminem            17
J-Cole            16
Snoop-Dogg        16
Maroon-5           1
Ed-Sheeran         1
Name: count, dtype: int64

# Tokeniziation and BoW

In [7]:
corpus = create_lyrics_corpus(random_songs_df,"Lyrics")
tokenizer = tokenize_corpus(corpus)
total_words = len(tokenizer.word_index)+1
print(total_words)

2934


# Generating Sequences to fit in Model

In [8]:
sequences = []
for line in corpus:
	token_list = tokenizer.texts_to_sequences([line])[0]
	for i in range(1, len(token_list)):
		n_gram_sequence = token_list[:i+1]
		sequences.append(n_gram_sequence)

In [9]:
import numpy as np
max_pad_len = max([len(seq) for seq in sequences])
print(max_pad_len)
padded_sequences = np.array(pad_sequences(sequences,maxlen=max_pad_len,truncating='post'))

24


In [10]:
# Split sequences between the "input" sequence and "output" predicted word
input_sequences, labels = padded_sequences[:,:-1], padded_sequences[:,-1]

# One-hot encode the labels
one_hot_labels = tf.keras.utils.to_categorical(labels, num_classes=total_words)

# Model Architecture and Training

In [18]:
model = Sequential()
model.add(Embedding(total_words, 256, input_length=max_pad_len-1))
#model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])
history = model.fit(input_sequences, one_hot_labels, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# Saving and Reloading

In [45]:
reload_model = load_model("hiphop_model_x1_2023-12-03_2322.h5")

In [46]:
history = reload_model.fit(input_sequences, one_hot_labels, epochs = 1, verbose=1)



In [41]:
model_name = "hiphop_model_x2_" + get_time() + ".h5"
reload_model.save(model_name)
print(model_name)

hiphop_model_x2_2023-12-03_2337.h5


In [38]:
import matplotlib.pyplot as plt
accuracy = history.history['accuracy']
print(accuracy[-1])
# Plotting accuracy over epochs
# plt.plot(range(1, len(accuracy) + 1), accuracy)
# plt.title('Training Accuracy Over Epochs')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.show()

0.6778765916824341


In [47]:
# Use this process for the full output generation
seed_text = "just do it"
next_words = 50
  
for _ in range(next_words):
  token_list = tokenizer.texts_to_sequences([seed_text])[0]
  token_list = pad_sequences([token_list], maxlen=max_pad_len-1, padding='post')
  predicted_probs = reload_model.predict(token_list)[0]
  predicted = np.random.choice([x for x in range(len(predicted_probs))],
                               p=predicted_probs)
  output_word = ""
  for word, index in tokenizer.word_index.items():
    if index == predicted:
      output_word = word
      break
  seed_text += " " + output_word
print(seed_text)

just do it hoodie eve hoodie hoodie hoodie hoodie hotel hoodie hoodie hotel hotel hotel hotel hotel hotel turn hotel day hotel turn myself lookin' shoot dreams from turn touched from day slummin' touched room'll rock hard mode holding it fuckin' philosophyme woo herself bomb realize four roof sexy indiscretions yeahcome hounds one
