## Ben Summer - Lyrics Identification Neural Network

#### Note: pip and TensorFlow were upgraded to their latest versions before running this notebook

###### Imports

In [64]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

# used for manipulating directory paths
import os

# Scientific and vector computation for python
import numpy as np

# Plotting library
from matplotlib import pyplot as plt

# Optimization module in scipy
from scipy import optimize

# will be used to load MATLAB mat datafile format
from scipy.io import loadmat

# tells matplotlib to embed plots within the notebook
%matplotlib inline

import pandas as pd

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils.class_weight import compute_class_weight

import seaborn as sns

#### Import The Source Data And Tokenize

In [3]:
# Row boundaries of the train / cross-validation / test split. Contiguous
# slices are taken, so the file is presumably already shuffled (as its name
# suggests) -- confirm before changing these.
TRAIN_END = 3975
CV_END = 5300

# Column positions inside shuffled_verses.tsv.
LABEL_COL = 1  # numeric class label (cast to float32 below)
TEXT_COL = 6   # raw text that is tokenized

df = pd.read_csv('shuffled_verses.tsv', sep='\t')
verse_text = np.asarray(df.iloc[:, TEXT_COL]).astype('str')
verse_label = np.asarray(df.iloc[:, LABEL_COL]).astype('float32')

X = verse_text[:TRAIN_END]
X_cv = verse_text[TRAIN_END:CV_END]
X_test = verse_text[CV_END:]

# Tokenizer: the vocabulary is fitted on the training split only, so words that
# appear only in the validation/test splits are dropped from their sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
X_cv_sequences = tokenizer.texts_to_sequences(X_cv)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Add padding. The training split defines the sequence length; the other two
# splits are padded (or truncated at the end) to that same length so every
# array fed to the model has an identical width.
X_padded = pad_sequences(X_sequences, padding='post')
padded_length = X_padded.shape[1]
X_cv_padded = pad_sequences(X_cv_sequences, maxlen=padded_length,
                            padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=padded_length,
                              padding='post', truncating='post')

y = verse_label[:TRAIN_END]
y_cv = verse_label[TRAIN_END:CV_END]
y_test = verse_label[CV_END:]

# Label-encoding table: column 0 is read as text, column 1 as the numeric id.
# NOTE: `df` is rebound here and no longer refers to the verses table.
df = pd.read_csv('my_artists.tsv', sep='\t')
X_encoding = np.asarray(df.iloc[:, 0]).astype('str')
y_encoding = np.asarray(df.iloc[:, 1]).astype('float32')

In [4]:
# Inspect the numeric ids read from the second column of my_artists.tsv.
y_encoding

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
       26., 27., 28., 29., 30., 31., 32., 33., 34.], dtype=float32)

## MODEL

In [5]:
# Number of entries in the label-encoding table.
len(y_encoding)

35

In [14]:
# Balanced class weights for the training labels. The dict is keyed by the
# actual label value rather than by position in np.unique(y), so the mapping
# stays correct even if some class id is absent from the training split.
train_labels = np.unique(y)
balanced_weights = compute_class_weight('balanced', classes=train_labels, y=y)
class_weights = {int(label): float(weight)
                 for label, weight in zip(train_labels, balanced_weights)}
print(class_weights)

{0: 0.4351395730706076, 1: 1.1954887218045114, 2: 2.704081632653061, 3: 4.368131868131868, 4: 2.142857142857143, 5: 0.3998993963782696, 6: 0.8736263736263736, 7: 1.695095948827292, 8: 1.695095948827292, 9: 0.43183052688756113, 10: 1.1588921282798834, 11: 1.2211981566820276, 12: 1.5773809523809523, 13: 0.6041033434650456, 14: 1.8027210884353742, 15: 0.32919254658385094, 16: 0.3889432485322896, 17: 3.7857142857142856, 18: 0.7054125998225377, 19: 0.48328267477203646, 20: 2.0280612244897958, 21: 2.0280612244897958, 22: 1.3520408163265305, 23: 0.7571428571428571, 24: 2.2714285714285714, 25: 1.8317972350230414, 26: 1.5557729941291585, 27: 1.0231660231660231, 28: 1.8317972350230414, 29: 3.549107142857143, 30: 0.9706959706959707, 31: 4.2063492063492065, 32: 3.916256157635468, 33: 0.6452922077922078, 34: 4.542857142857143}


In [15]:
# Hyperparameters
embedding_dim = 100  # Size of each learned word vector
rnn_units = 128  # Width of the LSTM hidden state
max_sequence_length = X_padded.shape[1]  # Length of the padded training sequences
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size (+1 because index 0 is the padding token)
num_classes = y_encoding.shape[0]  # One softmax output per entry of the label-encoding table

# Balanced class weights, keyed by the actual label value (not by position in
# np.unique(y)) so the mapping stays correct if a class id is missing from the
# training split.
train_labels = np.unique(y)
balanced_weights = compute_class_weight('balanced', classes=train_labels, y=y)
class_weights = {int(label): float(weight)
                 for label, weight in zip(train_labels, balanced_weights)}

# Build The Model
model = Sequential()

# Embedding layer. mask_zero=True marks the zero padding appended by
# pad_sequences(padding='post') as "no data", so the LSTM's final state comes
# from the last real token instead of from trailing padding steps.
model.add(Embedding(input_dim=vocab_size,  # Size of the vocabulary
                    output_dim=embedding_dim,  # Dimension of each word vector
                    mask_zero=True))

# Recurrent encoder followed by a small L2-regularized dense classifier head
model.add(LSTM(rnn_units))
model.add(Dropout(0.2))
model.add(Dense(70, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.2))
model.add(Dense(45, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax', kernel_regularizer=l2(0.001)))

# Compile
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='sparse_categorical_crossentropy',  # Labels are class ids, not one-hot vectors
              metrics=['accuracy'])

# EarlyStopping
early_stopping = EarlyStopping(
    monitor='val_loss',        # Monitor validation loss
    patience=3,                # Stop if no improvement for 3 consecutive epochs
    restore_best_weights=True  # Restore the best model weights
)

model.summary()

# Train. `epochs` is only an upper bound: the early-stopping callback ends the
# run once the validation loss stops improving.
history = model.fit(X_padded, y,
                    epochs=300,
                    batch_size=32,
                    validation_data=(X_cv_padded, y_cv),
                    class_weight=class_weights,
                    callbacks=[early_stopping])


Epoch 1/300
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 312ms/step - accuracy: 0.0457 - loss: 3.6938 - val_accuracy: 0.0219 - val_loss: 3.7168
Epoch 2/300
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 318ms/step - accuracy: 0.0285 - loss: 3.7450 - val_accuracy: 0.0219 - val_loss: 3.6970
Epoch 3/300
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 306ms/step - accuracy: 0.0202 - loss: 3.5907 - val_accuracy: 0.0128 - val_loss: 3.6807
Epoch 4/300
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 312ms/step - accuracy: 0.0255 - loss: 3.6143 - val_accuracy: 0.0219 - val_loss: 3.6652
Epoch 5/300
[1m111/125[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m4s[0m 292ms/step - accuracy: 0.0160 - loss: 3.6771

KeyboardInterrupt: 

In [None]:
# Score the trained model on the held-out test split and report both metrics
# returned by evaluate() (loss first, then accuracy, as configured in compile).
test_loss, test_accuracy = model.evaluate(X_test_padded, y_test)
print(f"Test Loss: {test_loss}",
      f"Test Accuracy: {test_accuracy}",
      sep="\n")