In [1]:
#Importing the libraries needed for preprocessing the text
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

We load the training data which we will split into train and validation

In [2]:
df=pd.read_csv('train.csv')

In [3]:
df.shape

(290183, 5)

The dataset is downsized as I am running the models for the 1st task on my laptop (with only CPU), as I was not able to load the dataset correctly in Google Collab.

In [4]:
sample_size=int(0.2*len(df))
df=df.sample(sample_size,random_state=42)

In [5]:
df.shape

(58036, 5)

In [6]:
df.head()

Unnamed: 0,Artist,Song,Genre,Language,Lyrics
180931,big daddy,hold on,Rock,en,I know there's pain\nWhy do lock yourself up i...
269990,aqualung,if i fall,Rock,en,swept away\nby the wonder of it all\nso amazed...
275612,george ezra,benjamin twine,Indie,en,"Let me tell you about my best friend, he got h..."
195014,we came as romans,what i wished i never had,Metal,en,Don't catch me at the wrong time\nOr you will ...
26829,jimmy eat world,softer (she's perfect),Rock,en,She's perfect in her own way.\nSmoke rings ris...


We create a sub dataframe for the first part taking only the Lyrics and Genre columns 

In [7]:
sub_df = df.drop(["Artist", "Song", "Language"], axis= 1)
sub_df

Unnamed: 0,Genre,Lyrics
180931,Rock,I know there's pain\nWhy do lock yourself up i...
269990,Rock,swept away\nby the wonder of it all\nso amazed...
275612,Indie,"Let me tell you about my best friend, he got h..."
195014,Metal,Don't catch me at the wrong time\nOr you will ...
26829,Rock,She's perfect in her own way.\nSmoke rings ris...
...,...,...
81372,Pop,[Intro: Pete Ross & Lil Wayne]\nIs it true you...
94338,Rock,Ev'rything I want I got\nAnd I got you girl\nY...
9830,Pop,Last year is old news\nI'm breaking out my six...
33110,Rock,Você teima!\nVocê teima!\nVocê teima!\nVocê te...


In [8]:
sub_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58036 entries, 180931 to 275495
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Genre   58036 non-null  object
 1   Lyrics  58027 non-null  object
dtypes: object(2)
memory usage: 1.3+ MB


We remove the null values in the Lyrics column from the dataframe

In [9]:
sub_df.dropna(inplace=True)

Now we start preprocessing the text. We remove the stopwords, the punctuation and conevrt the text to lowercase.

In [10]:
stop=stopwords.words("english")
sub_df["Lyrics"] = sub_df["Lyrics"].apply(lambda x: " ".join(word for word in x.split() if word not in stop))

In [11]:
# We first remove the \n that is in the lyrics column first as it seperates all the sentences and then remove the punctuation
sub_df["Lyrics"]=sub_df["Lyrics"].apply(lambda x:(re.sub(r"\n",' ',str(x))))
sub_df["Lyrics"]=sub_df["Lyrics"].apply(lambda x:(re.sub( r"[^\w\s]",'',str(x))))

In [12]:
sub_df["Lyrics"]=sub_df["Lyrics"].apply(lambda x: str(x).lower())

In [13]:
sub_df

Unnamed: 0,Genre,Lyrics
180931,Rock,i know theres pain why lock chains no one chan...
269990,Rock,swept away wonder amazed never saw coming left...
275612,Indie,let tell best friend got hair knees he gets al...
195014,Metal,dont catch wrong time or feel wrath the one i ...
26829,Rock,shes perfect way smoke rings rising winter gre...
...,...,...
81372,Pop,intro pete ross lil wayne is true performed w...
94338,Rock,evrything i want i got and i got girl you real...
9830,Pop,last year old news im breaking six string and ...
33110,Rock,você teima você teima você teima você teima e ...


Now we split our dataset into training (80%) and validation (20%) 

In [14]:
from sklearn.model_selection import train_test_split
train, val =train_test_split(sub_df, test_size=0.2)

In [15]:
print(f"train:{train.shape}")
print(f"validation:{val.shape}")

train:(46421, 2)
validation:(11606, 2)


Now we tokenize our lyrics, to feed that input into our models

In [16]:
# We take only the 20000 most frequently occuring words
max_nb_words=20000
tokenizer= Tokenizer(num_words=max_nb_words)
tokenizer.fit_on_texts(train.Lyrics)# we fit the tokenizer on the train set only to avoid data leakage
train_sequences = tokenizer.texts_to_sequences(train.Lyrics)# and tokenize both the train and validation set
val_sequences = tokenizer.texts_to_sequences(val.Lyrics)

To make sure our tokenized sequences are of equal length we use pad the sequences to be of maximum length 30

In [17]:
MAX_SEQUENCE_LENGTH=300
train_data=pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
val_data = pad_sequences(val_sequences, maxlen=MAX_SEQUENCE_LENGTH)

print(train_data.shape)
print(val_data.shape)

(46421, 300)
(11606, 300)


Now we save the labels in a seperate dataframe, which we will encode using the LabelEncoder to have numerical values (instead of the text labels)

In [18]:
train_labels = train["Genre"]
val_labels = val["Genre"]

In [19]:
le= LabelEncoder()
le.fit(train_labels)

train_labels=le.transform(train_labels)
val_labels=le.transform(val_labels)

print(le.classes_)
print(np.unique(train_labels, return_counts=True))
print(np.unique(val_labels, return_counts=True))

['Country' 'Electronic' 'Folk' 'Hip-Hop' 'Indie' 'Jazz' 'Metal' 'Pop'
 'R&B' 'Rock']
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([  318,   336,  1398,   363,  1391,  2215,  3236, 17311,   409,
       19444], dtype=int64))
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([  81,   91,  335,   94,  343,  558,  823, 4260,  107, 4914],
      dtype=int64))


We can see the 10 classes, which are unbalanced with Pop having by far the most instances.

We convert the nurical labels into one-hot encoded vectors to feed into our models.

In [20]:
labels_train = to_categorical(np.asarray(train_labels))
labels_val = to_categorical(np.asarray(val_labels))
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', labels_train.shape)
print('Shape of label tensor:', labels_val.shape)

Shape of data tensor: (46421, 300)
Shape of label tensor: (46421, 10)
Shape of label tensor: (11606, 10)


## Models based only on Lyrics
Now we we start training the models based on the lyrics only

### RNN Variants
#### Basic RNN Model:
First we start with a Simple RNN model.

We import the tensorflow libraries needed.

In [90]:
import tensorflow as tf
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, SimpleRNN, LSTM, Activation, Flatten, Embedding, Activation, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, MaxPooling1D, Input, Concatenate, RepeatVector, Reshape
from keras.callbacks import EarlyStopping

We define the batch size of 1000 to speed up training and use a state size of 10 for the RNN and LSTM models

In [91]:
batch_size = 1000
state_size = 10
num_classes = 10 #there are 10 genre classes

For the simple RNN model:
1) We use on the fly embeddings of size 100 for our tokenized text
2) We use a SimpleRNN layer, giving the state size of 10
2) We use Batch Normalization and Dropout of 20% to prevent overfitting as Batch Normalization helps to stabilize and speed up the training process by normalizing the activations and Dropout encourages a sparser network reliant on more independant neurons. 
3) The final layer is a softmax layer as we are in a multiclass classification problem and we want to predcit only one class
4) Similarly we use the categorical cross entropy loss function for the same reason as we have a multiclass classification problem
5) We use the Adam optimizer as it is the best performing and the models are evaluated on the accuracy metric
6) We use the Early Stopping callback to prevent overfitting and speed up the training process, as this monitors the validation loss and stops when there are no signinficant improvements.
7) We fit the model on the training data, providing the batch size, number of epochs, the validation data and the early stopping callbacks

In [145]:
model = Sequential()
model.add(Embedding(max_nb_words, 100, input_length=MAX_SEQUENCE_LENGTH))
model.add(SimpleRNN(state_size))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation=tf.nn.softmax))

model.compile(loss="categorical_crossentropy",
                   optimizer='adam',
                   metrics=['accuracy'])

model.summary()

early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

model.fit(train_data, labels_train,
          batch_size=batch_size,
          epochs=20,
          validation_data=(val_data, labels_val),
          callbacks=[early_stopping])

Model: "sequential_29"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_41 (Embedding)    (None, 300, 100)          2000000   
                                                                 
 simple_rnn_10 (SimpleRNN)   (None, 10)                1110      
                                                                 
 batch_normalization (Batch  (None, 10)                40        
 Normalization)                                                  
                                                                 
 dropout_21 (Dropout)        (None, 10)                0         
                                                                 
 dense_24 (Dense)            (None, 10)                110       
                                                                 
Total params: 2001260 (7.63 MB)
Trainable params: 2001240 (7.63 MB)
Non-trainable params: 20 (80.00 Byte)
_____________

<keras.src.callbacks.History at 0x2566bdcca90>

#### Single Layer LSTM Model:
Similarly for the signle layer LSTM model:
1) We use on the fly embeddings of size 100 for our tokenized text
2) For the LSTM layer, we set the state size and return_sequences to True to output all the hidden states for each timestep of the text sequence. The we flatten the output to pass to the Batch Normalization layer.
2) We use Batch Normalization and Dropout of 20% to prevent overfitting as Batch Normalization helps to stabilize and speed up the training process by normalizing the activations and Dropout encourages a sparser network reliant on more independant neurons. 
3) The final layer is a softmax layer as we are in a multiclass classification problem and we want to predcit only one class
4) Similarly we use the categorical cross entropy loss function for the same reason as we have a multiclass classification problem
5) We use the Adam optimizer as it is the best performing and the models are evaluated on the accuracy metric
6) We use the Early Stopping callback to prevent overfitting and speed up the training process, as this monitors the validation loss and stops when there are no signinficant improvements.
7) We fit the model on the training data, providing the batch size, number of epochs, the validation data and the early stopping callbacks

In [None]:
model = Sequential()
model.add(Embedding(max_nb_words, 100, input_length= MAX_SEQUENCE_LENGTH))
model.add(LSTM(state_size, return_sequences=True))  # return_sequences True as we want to feed the hidden state inputs to our following layers
model.add(Flatten())
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))
model.summary()

model.compile(loss="categorical_crossentropy",
              optimizer='adam',
              metrics=['accuracy'])


early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

model.fit(train_data, labels_train,
          batch_size=batch_size,
          epochs=20,
          validation_data=(val_data, labels_val),
          callbacks=[early_stopping])

#### Multi-Layer LSTM Model:
Similarly for the multi layer LSTM model:
1) We use on the fly embeddings of size 100 for our tokenized text
2) For the LSTM layer, we set the state size to 10 and return_sequences to True to output all the hidden states for each timestep of the text sequence. The we flatten the output to pass to the Batch Normalization layer.
2) We use Batch Normalization and Dropout of 20% to prevent overfitting as Batch Normalization helps to stabilize and speed up the training process by normalizing the activations and Dropout encourages a sparser network reliant on more independant neurons. 
3) Then we add the 2nd LSTM layer with a state size of 10.
3) The final layer is a softmax layer as we are in a multiclass classification problem and we want to predcit only one class
4) Similarly we use the categorical cross entropy loss function for the same reason as we have a multiclass classification problem
5) We use the Adam optimizer as it is the best performing and the models are evaluated on the accuracy metric
6) We use the Early Stopping callback to prevent overfitting and speed up the training process, as this monitors the validation loss and stops when there are no signinficant improvements.
7) We fit the model on the training data, providing the batch size, number of epochs, the validation data and the early stopping callbacks

In [95]:
model = Sequential()
model.add(Embedding(max_nb_words, 100, input_length= MAX_SEQUENCE_LENGTH))
model.add(LSTM(10, return_sequences=True))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(LSTM(10))
model.add(Dense(num_classes, activation=tf.nn.softmax))

model.summary()

model.compile(loss="categorical_crossentropy",
                   optimizer='adam',
                   metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

model.fit(train_data, labels_train,
          batch_size=batch_size,
          epochs=20,
          validation_data=(val_data,labels_val),
          callbacks=[early_stopping])



Epoch 1/20
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 1s/step - accuracy: 0.2971 - loss: 1.9919 - val_accuracy: 0.4172 - val_loss: 1.9942
Epoch 2/20
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 1s/step - accuracy: 0.4634 - loss: 1.5367 - val_accuracy: 0.5049 - val_loss: 1.6462
Epoch 3/20
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 1s/step - accuracy: 0.5274 - loss: 1.3789 - val_accuracy: 0.4416 - val_loss: 1.5004
Epoch 4/20
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 1s/step - accuracy: 0.5593 - loss: 1.2928 - val_accuracy: 0.4336 - val_loss: 1.4496
Epoch 5/20
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 1s/step - accuracy: 0.5928 - loss: 1.2093 - val_accuracy: 0.4331 - val_loss: 1.4401
Epoch 6/20
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 1s/step - accuracy: 0.6351 - loss: 1.1263 - val_accuracy: 0.4506 - val_loss: 1.4478
Epoch 7/20
[1m47/47[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x167ed4dfa10>

### Embeddings
Now we move on to testing the differnet embeddings

#### On the fly embeddings
For the on the fly embeddings model we take a similar structure that we will also use for the pretrained embeddings:
1) We use on the fly embeddings of size 100 for our tokenized text
2) We use GlobalAveragePooling1D to reduce the dimensionality of the embeddings to feed the output to the following Dense layers.
2) We use Dropout of 20% to prevent overfitting as it encourages a sparser network reliant on more independant neurons. 
3) Then we use a dense layer of 64 connected neurons with the relu activation function
3) The final layer is a softmax layer as we are in a multiclass classification problem and we want to predcit only one class
4) Similarly we use the categorical cross entropy loss function for the same reason as we have a multiclass classification problem
5) We use the Adam optimizer as it is the best performing and the models are evaluated on the accuracy metric
6) We use the Early Stopping callback to prevent overfitting and speed up the training process, as this monitors the validation loss and stops when there are no signinficant improvements.
7) We fit the model on the training data, providing the batch size, number of epochs, the validation data and the early stopping callbacks

In [24]:
model = Sequential([
    Embedding(max_nb_words, 100, input_length=MAX_SEQUENCE_LENGTH),
    GlobalAveragePooling1D(),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

model.fit(train_data, labels_train, epochs=20, validation_data=(val_data, labels_val), callbacks=[early_stopping])



Epoch 1/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 25ms/step - accuracy: 0.5050 - loss: 1.4378 - val_accuracy: 0.5800 - val_loss: 1.2244
Epoch 2/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 31ms/step - accuracy: 0.5842 - loss: 1.2111 - val_accuracy: 0.5902 - val_loss: 1.1610
Epoch 3/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 29ms/step - accuracy: 0.6283 - loss: 1.0943 - val_accuracy: 0.6173 - val_loss: 1.1213
Epoch 4/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 25ms/step - accuracy: 0.6481 - loss: 1.0411 - val_accuracy: 0.6184 - val_loss: 1.1158
Epoch 5/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 27ms/step - accuracy: 0.6702 - loss: 0.9649 - val_accuracy: 0.6229 - val_loss: 1.1224
Epoch 6/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 27ms/step - accuracy: 0.6869 - loss: 0.9178 - val_accuracy: 0.6189 - val_loss: 1.1345
Epoc

<keras.src.callbacks.history.History at 0x1e406865bd0>

We save this model as one of our 2 best models

In [None]:
from tensorflow.keras.models import save_model

save_model(model, 'on_the_fly_embeddings_dropout_only_lyrics.h5')

#### Pre trained embeddings
For the pretrained embeddings, we load a pretrained model from tensorflow hub that gives embeddings in 50 dimensions, which we then train on our training and validation lyrics.

In [88]:
import tensorflow_hub as hub

embeding_model = hub.KerasLayer("https://tfhub.dev/google/nnlm-en-dim50/2")
embeddings_train = embeding_model(train.Lyrics)
embeddings_val = embeding_model(val.Lyrics)




















For the pre trained embeddings model we use a similar structure:
1) We use a dense layer of 64 connected neurons with the relu activation function
2) We use Dropout of 20% to prevent overfitting as it encourages a sparser network reliant on more independant neurons.
3) The final layer is a softmax layer as we are in a multiclass classification problem and we want to predcit only one class
4) Similarly we use the categorical cross entropy loss function for the same reason as we have a multiclass classification problem
5) We use the Adam optimizer as it is the best performing and the models are evaluated on the accuracy metric
6) We use the Early Stopping callback to prevent overfitting and speed up the training process, as this monitors the validation loss and stops when there are no signinficant improvements.
7) We fit the model on the training data that has been embedded using the pre trained embeddings, providing the batch size, number of epochs, the validation data that has also been embedded using the pre trained embeddings and the early stopping callbacks

In [92]:
model = Sequential([
    Dense(64, activation='relu', input_shape=(50,)),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

model.fit(embeddings_train, labels_train, 
          epochs=20, 
          validation_data=(embeddings_val, labels_val), 
          callbacks=[early_stopping])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.5003 - loss: 1.3976 - val_accuracy: 0.5523 - val_loss: 1.2601
Epoch 2/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.5480 - loss: 1.2493 - val_accuracy: 0.5564 - val_loss: 1.2413
Epoch 3/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5507 - loss: 1.2294 - val_accuracy: 0.5613 - val_loss: 1.2269
Epoch 4/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.5531 - loss: 1.2232 - val_accuracy: 0.5637 - val_loss: 1.2212
Epoch 5/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5563 - loss: 1.2202 - val_accuracy: 0.5656 - val_loss: 1.2198
Epoch 6/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5606 - loss: 1.2096 - val_accuracy: 0.5663 - val_loss: 1.2118
Epoch 7/20
[1m1

<keras.src.callbacks.history.History at 0x167a08661d0>

We also save this 2nd model as it is the model with the least overfitting:

In [94]:
save_model(model, 'pretrained_embeddings_only_lyrics.h5')



#### Traditional text encoding approach

Now we compare the previous models to a more traditional text approach:

We use TF-IDF to transform the text into a numerical representation. Then we feed that embedding to a Logistic Regression Model used for classification

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

tfidf_vectorizer = TfidfVectorizer(max_features=5000) #we define the TF-IDFVectorizer using a maximum features of 5000
train_tfidf = tfidf_vectorizer.fit_transform(train.Lyrics)#we fit and transform the training and validation lyrics
val_tfidf = tfidf_vectorizer.transform(val.Lyrics)

clf_tfidf = LogisticRegression(max_iter=1000) #we use the Logistic Regression Model and set the max-iter parameter to speed up training
clf_tfidf.fit(train_tfidf, train_labels)
preds_tfidf = clf_tfidf.predict(val_tfidf)#we make predictions on our validation set
accuracy_tfidf = accuracy_score(val_labels, preds_tfidf)#we calculate the accuracy of those predictions
print(f"TF-IDF Accuracy: {accuracy_tfidf}")

TF-IDF Accuracy: 0.6068412889884542


### CNN for Text Classification

#### CNNs with same kernel sizes:
For the CNN with same kernel size model:
1) We use on the fly embeddings of size 100 for our tokenized text
2) Then we use a 1 Dimension Convolutional layer, with 25 different filters, a kernel size of 3, no additional padding and a stride of 1 to capture more local features
3) Then we use Batch Normalization to prevent overfitting as it helps to stabilize and speed up the training process by normalizing the data.
4) We use the relu activation function, followed by Max Pooling to reduce the size of our feature maps
5) We use Dropout of 20% to prevent overfitting as it encourages a sparser network reliant on more independant neurons. 
6) We repeat this structure to have 3 blocks of convolutional features.
7) Then we flatten the output and feed it to a dense layer of 50 connected neurons using the relu activation function
8) The final layer is a softmax layer as we are in a multiclass classification problem and we want to predcit only one class
9) Similarly we use the categorical cross entropy loss function for the same reason as we have a multiclass classification problem
10) We use the Adam optimizer as it is the best performing and the models are evaluated on the accuracy metric
11) We use the Early Stopping callback to prevent overfitting and speed up the training process, as this monitors the validation loss and stops when there are no signinficant improvements.
12) We fit the model on the training data, providing the batch size, number of epochs, the validation data and the early stopping callbacks

In [None]:
model = Sequential()
model.add(Embedding(max_nb_words, 100, input_length=MAX_SEQUENCE_LENGTH))

model.add(Conv1D(filters=25, kernel_size=3, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv1D(filters=25, kernel_size=3, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv1D(filters=30, kernel_size=3, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))

model.add(Flatten())
model.add(Dense(50, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.summary()

model.compile(loss="categorical_crossentropy",
                   optimizer='adam',
                   metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

model.fit(train_data, labels_train,
          batch_size=batch_size,
          epochs=20,
          validation_data=(val_data,labels_val),
          callbacks=[early_stopping])

#### CNNs with different kernel sizes:
For the CNN with different kernel size model, we have a nearly identical architecture. We just vary the kernel size for each of the convolutional layers, starting from 3 to 5, to capture more general features in the further layers:
1) We use on the fly embeddings of size 100 for our tokenized text
2) Then we use a 1 Dimension Convolutional layer, with 25 different filters, a kernel size of 3, no additional padding and a stride of 1 to capture more local features
3) Then we use Batch Normalization to prevent overfitting as it helps to stabilize and speed up the training process by normalizing the data.
4) We use the relu activation function, followed by Max Pooling to reduce the size of our feature maps
5) We use Dropout of 20% to prevent overfitting as it encourages a sparser network reliant on more independant neurons. 
6) We repeat this structure to have 3 blocks of convolutional features, varying the kernel sizes going from 3 to 5, to capture more general features in the further layers
7) Then we flatten the output and feed it to a dense layer of 50 connected neurons using the relu activation function
8) The final layer is a softmax layer as we are in a multiclass classification problem and we want to predcit only one class
9) Similarly we use the categorical cross entropy loss function for the same reason as we have a multiclass classification problem
10) We use the Adam optimizer as it is the best performing and the models are evaluated on the accuracy metric
11) We use the Early Stopping callback to prevent overfitting and speed up the training process, as this monitors the validation loss and stops when there are no signinficant improvements.
12) We fit the model on the training data, providing the batch size, number of epochs, the validation data and the early stopping callbacks

In [None]:
model = Sequential()
model.add(Embedding(max_nb_words, 100, input_length=MAX_SEQUENCE_LENGTH))
model.add(Conv1D(filters=25, kernel_size=3, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=(2)))
model.add(Dropout(0.2))

model.add(Conv1D(filters=25, kernel_size=4, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=(2)))
model.add(Dropout(0.2))  

model.add(Conv1D(filters=30, kernel_size=5, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=(2)))
model.add(Dropout(0.2)) 
model.add(Flatten()) 

model.add(Dense(50, activation='relu'))
model.add(Dense(num_classes, activation=tf.nn.softmax))

model.summary()

model.compile(loss="categorical_crossentropy",
                   optimizer='adam',
                   metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

model.fit(train_data, labels_train,
          batch_size=batch_size,
          epochs=20,
          validation_data=(val_data,labels_val),
          callbacks=[early_stopping])

#### CNN as an additional layer before a LSTM solution:
For the CNN was an additional layer before a LSTM solution, we keep the same kernel size as that performs better than the differnet kernel sizes. We try 2 different versions, the first being:
1) We use on the fly embeddings of size 100 for our tokenized text
2) Then we use a 1 Dimension Convolutional layer, with 25 different filters, a kernel size of 3, no additional padding and a stride of 1 to capture more local features
3) Then we use Batch Normalization to prevent overfitting as it helps to stabilize and speed up the training process by normalizing the data.
4) We use the relu activation function, followed by Max Pooling to reduce the size of our feature maps
5) We use Dropout of 20% to prevent overfitting as it encourages a sparser network reliant on more independant neurons. 
6) We repeat this structure to have 3 blocks of convolutional features
7) Then we add the LSTM model with a state size of 10, followed by Batch Normalization and Dropout to prevent overfitting
8) The final layer is a softmax layer as we are in a multiclass classification problem and we want to predcit only one class
9) Similarly we use the categorical cross entropy loss function for the same reason as we have a multiclass classification problem
10) We use the Adam optimizer as it is the best performing and the models are evaluated on the accuracy metric
11) We use the Early Stopping callback to prevent overfitting and speed up the training process, as this monitors the validation loss and stops when there are no signinficant improvements.
12) We fit the model on the training data, providing the batch size, number of epochs, the validation data and the early stopping callbacks

In [None]:
#1st version
model = Sequential()
model.add(Embedding(max_nb_words, 100, input_length=MAX_SEQUENCE_LENGTH))
model.add(Conv1D(filters=25, kernel_size=3, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=(2)))
model.add(Dropout(0.2))

model.add(Conv1D(filters=25, kernel_size=3, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=(2)))
model.add(Dropout(0.2)) 

model.add(Conv1D(filters=30, kernel_size=3, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=(2)))
model.add(Dropout(0.2)) 

model.add(LSTM(10))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation=tf.nn.softmax))

model.summary()

model.compile(loss="categorical_crossentropy",
                   optimizer='adam',
                   metrics=['accuracy'])


early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

model.fit(train_data, labels_train,
          batch_size=batch_size,
          epochs=20,
          validation_data=(val_data,labels_val),
          callbacks=[early_stopping])

For the second version:
1) We use on the fly embeddings of size 100 for our tokenized text
2) Then we use a 1 Dimension Convolutional layer, with 25 different filters, a kernel size of 3, no additional padding and a stride of 1 to capture more local features
3) Then we use Batch Normalization to prevent overfitting as it helps to stabilize and speed up the training process by normalizing the data.
4) We use the relu activation function, followed by Max Pooling to reduce the size of our feature maps
5) We use Dropout of 20% to prevent overfitting as it encourages a sparser network reliant on more independant neurons. 
6) We repeat this structure to have 3 blocks of convolutional features
7) Then we flatten and resahpe the output to pass it to the LSTM model with a state size of 10, followed by Batch Normalization and Dropout to prevent overfitting
8) The final layer is a softmax layer as we are in a multiclass classification problem and we want to predcit only one class
9) Similarly we use the categorical cross entropy loss function for the same reason as we have a multiclass classification problem
10) We use the Adam optimizer as it is the best performing and the models are evaluated on the accuracy metric
11) We use the Early Stopping callback to prevent overfitting and speed up the training process, as this monitors the validation loss and stops when there are no signinficant improvements.
12) We fit the model on the training data, providing the batch size, number of epochs, the validation data and the early stopping callbacks

In [None]:
#2nd version
model = Sequential()
model.add(Embedding(max_nb_words, 100, input_length=MAX_SEQUENCE_LENGTH))

model.add(Conv1D(filters=25, kernel_size=3, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv1D(filters=25, kernel_size=3, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv1D(filters=30, kernel_size=3, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))

model.add(Flatten())
model.add(Reshape((1050, 1)))

model.add(LSTM(10))
model.add(BatchNormalization())
model.add(Dropout(0.2)) 

model.add(Dense(num_classes, activation='softmax'))

model.compile(loss="categorical_crossentropy",
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

model.fit(train_data, labels_train,
          batch_size=batch_size,
          epochs=20,
          validation_data=(val_data,labels_val),
          callbacks=[early_stopping])

#### Comparison to Non-Neural Methods

Now we compare to traditional ML models. We use the Decision Tree Classifier and perform a grid search to find the best hyperparameters.

In [49]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [4, 8],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples required for each leaf node
}

model = DecisionTreeClassifier(random_state=42)

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')#we run a 5 fold cross validation grid search to find the best hyperparameters

grid_search.fit(train_data, labels_train)

predictions = grid_search.predict(val_data)#we make the predictions on the validation set

We get the accuracy of our best model on the validation set, the best hyperparameters and the best score obtained on our trainset during the grid search

In [50]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(labels_val, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.34654489057384114


In [51]:
print(grid_search.best_params_)

{'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [None]:
print(grid_search.best_score_)

### Models based on Lyrics and Artist

Now we will use the Lyrics and the Artist to predict the song genre. We repeat the similar preprocessing steps as previously, adding the Artist information 

In [21]:
sub_df2 = df.drop(["Song", "Language"], axis= 1)
sub_df2

Unnamed: 0,Artist,Genre,Lyrics
180931,big daddy,Rock,I know there's pain\nWhy do lock yourself up i...
269990,aqualung,Rock,swept away\nby the wonder of it all\nso amazed...
275612,george ezra,Indie,"Let me tell you about my best friend, he got h..."
195014,we came as romans,Metal,Don't catch me at the wrong time\nOr you will ...
26829,jimmy eat world,Rock,She's perfect in her own way.\nSmoke rings ris...
...,...,...,...
81372,lil wayne,Pop,[Intro: Pete Ross & Lil Wayne]\nIs it true you...
94338,bee gees,Rock,Ev'rything I want I got\nAnd I got you girl\nY...
9830,camp rock,Pop,Last year is old news\nI'm breaking out my six...
33110,lulu santos,Rock,Você teima!\nVocê teima!\nVocê teima!\nVocê te...


In [22]:
sub_df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58036 entries, 180931 to 275495
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Artist  58036 non-null  object
 1   Genre   58036 non-null  object
 2   Lyrics  58027 non-null  object
dtypes: object(3)
memory usage: 1.8+ MB


We remove Nas in the dataframe

In [23]:
sub_df2.dropna(inplace=True)

We preprocess the text by removing stop words, punctuation and converting the text to lowercase

In [24]:
stop=stopwords.words("english")
sub_df2["Lyrics"] = sub_df2["Lyrics"].apply(lambda x: " ".join(word for word in x.split() if word not in stop))
sub_df2["Artist"] = sub_df2["Artist"].apply(lambda x: " ".join(word for word in x.split() if word not in stop))

In [26]:
sub_df2["Lyrics"]=sub_df2["Lyrics"].apply(lambda x:(re.sub(r"\n",' ',str(x))))
sub_df2["Lyrics"]=sub_df2["Lyrics"].apply(lambda x:(re.sub( r"[^\w\s]",'',str(x))))
sub_df2["Artist"]=sub_df2["Artist"].apply(lambda x:(re.sub( r"[^\w\s]",'',str(x))))

In [27]:
sub_df2["Lyrics"]=sub_df2["Lyrics"].apply(lambda x: str(x).lower())
sub_df2["Artist"]=sub_df2["Artist"].apply(lambda x: str(x).lower())

In [28]:
sub_df2

Unnamed: 0,Artist,Genre,Lyrics
180931,big daddy,Rock,i know theres pain why lock chains no one chan...
269990,aqualung,Rock,swept away wonder amazed never saw coming left...
275612,george ezra,Indie,let tell best friend got hair knees he gets al...
195014,came romans,Metal,dont catch wrong time or feel wrath the one i ...
26829,jimmy eat world,Rock,shes perfect way smoke rings rising winter gre...
...,...,...,...
81372,lil wayne,Pop,intro pete ross lil wayne is true performed w...
94338,bee gees,Rock,evrything i want i got and i got girl you real...
9830,camp rock,Pop,last year old news im breaking six string and ...
33110,lulu santos,Rock,você teima você teima você teima você teima e ...


We split the dataset into train and validation sets

In [29]:
train2, val2 =train_test_split(sub_df2, test_size=0.2)

In [30]:
print(f"train:{train2.shape}")
print(f"validation:{val2.shape}")

train:(46421, 3)
validation:(11606, 3)


We tokenize the lyrics for the train and validation and add padding to the sequences to ensure they are all the same legth

In [31]:
max_nb_words=20000
tokenizer= Tokenizer(num_words=max_nb_words)
tokenizer.fit_on_texts(train2.Lyrics)
train_sequences2 = tokenizer.texts_to_sequences(train.Lyrics)
val_sequences2 = tokenizer.texts_to_sequences(val.Lyrics)

In [32]:
train_data2=pad_sequences(train_sequences2, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
val_data2 = pad_sequences(val_sequences2, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

print(train_data2.shape)
print(val_data2.shape)

(46421, 300)
(11606, 300)


We encode the labels and then apply one hot encoding to get the one hot encoded vectors given to the models

In [33]:
train_labels2 = train2["Genre"]
val_labels2 = val2["Genre"]

In [34]:
le= LabelEncoder()
le.fit(train_labels2)

train_labels2=le.transform(train_labels2)
val_labels2=le.transform(val_labels2)

print(le.classes_)
print(np.unique(train_labels2, return_counts=True))
print(np.unique(val_labels2, return_counts=True))

['Country' 'Electronic' 'Folk' 'Hip-Hop' 'Indie' 'Jazz' 'Metal' 'Pop'
 'R&B' 'Rock']
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([  333,   341,  1370,   365,  1385,  2230,  3206, 17346,   412,
       19433], dtype=int64))
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([  66,   86,  363,   92,  349,  543,  853, 4225,  104, 4925],
      dtype=int64))


In [35]:
labels_train2 = to_categorical(np.asarray(train_labels2))
labels_val2 = to_categorical(np.asarray(val_labels2))

print('Shape of data tensor:', train_data2.shape)
print('Shape of label tensor:', labels_train2.shape)
print('Shape of label tensor:', labels_val2.shape)

Shape of data tensor: (46421, 300)
Shape of label tensor: (46421, 10)
Shape of label tensor: (11606, 10)


Now for the Artist feature, we manually encode the artist to a numerical value

In [36]:
from collections import defaultdict

artist_name_mapping = defaultdict(int)
artist_count = 0

for name in train2.Artist: #we save the Artist count in the dictionary
    if name not in artist_name_mapping:
        artist_name_mapping[name] = artist_count
        artist_count += 1


def encode_artist_names(artist_list, artist_name_mapping):
    encoded_artists = []
    for name in artist_list: #for each artist in the list encode it by assigning the count value
        encoded_artists.append(artist_name_mapping.get(name, -1))  # -1 if we have an unknown artists
    return encoded_artists

train_artist_encoded = encode_artist_names(train2.Artist, artist_name_mapping)
val_artist_encoded = encode_artist_names(val2.Artist, artist_name_mapping)
train_artist_encoded = np.asarray(train_artist_encoded).reshape(-1, 1)#convert to np.array and reshape to pass as input to our models
val_artist_encoded = np.asarray(val_artist_encoded).reshape(-1, 1)

In [37]:
print(train_artist_encoded.shape)
print(val_artist_encoded.shape)
print(train_data.shape)
print(val_data.shape)

(46421, 1)
(11606, 1)
(46421, 300)
(11606, 300)


### RNN Variants
#### Basic RNN Model:

For the simple RNN model:
1) We use on the fly embeddings of size 100 for our tokenized lyrics
2) Then we define the input layer for the Artist information and repeat it 300 times to match the lyric embeddings length. Then we reshape the artist input and finally concatenate it with the on the fly embeddings for the lyrics
2) We then use a SimpleRNN layer, giving the state size of 10
2) We use Batch Normalization and Dropout of 20% to prevent overfitting as Batch Normalization helps to stabilize and speed up the training process by normalizing the activations and Dropout encourages a sparser network reliant on more independant neurons. 
3) The final layer is a softmax layer as we are in a multiclass classification problem and we want to predcit only one class
4) Similarly we use the categorical cross entropy loss function for the same reason as we have a multiclass classification problem
5) We use the Adam optimizer as it is the best performing and the models are evaluated on the accuracy metric
6) We use the Early Stopping callback to prevent overfitting and speed up the training process, as this monitors the validation loss and stops when there are no signinficant improvements.
7) We fit the model on the training data(the tokenized lyrics and the encoded artist), providing the batch size, number of epochs, the validation data((the tokenized lyrics and the encoded artist)) and the early stopping callbacks

In [78]:
model = Sequential()
model.add(Embedding(max_nb_words, 100, input_length=MAX_SEQUENCE_LENGTH))
artist_input = Input(shape=(1,), name='artist_input')
artist_repeated = RepeatVector(300)(artist_input)
artist_reshaped = Reshape((300, 1))(artist_repeated)
concatenated = Concatenate(axis=-1)([model.output, artist_reshaped])

model.add(SimpleRNN(10))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))

model = Model(inputs=[model.input, artist_input], outputs=model.output)

model.compile(loss="categorical_crossentropy",
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

model.fit([train_data2, train_artist_encoded], labels_train2,
          batch_size=batch_size,
          epochs=20,
          validation_data=([val_data2, val_artist_encoded], labels_val2),
          callbacks=[early_stopping])



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 embedding_input (InputLaye  [(None, 300)]                0         []                            
 r)                                                                                               
                                                                                                  
 embedding (Embedding)       (None, 300, 100)             2000000   ['embedding_input[0][0]']     
                                                                                                  
 simple_rnn (SimpleRNN)      (None, 10)                   1110      ['embedding[0][0]']           
                                                                                                  
 batch_normalization (Batch  (None, 10)                   40        ['simple_rnn[0][0]']    

<keras.src.callbacks.History at 0x2b402aa5650>

#### Single Layer LSTM Model:
Similarly for the signle layer LSTM model:
1) We use on the fly embeddings of size 100 for our tokenized text
2) Then we define the input layer for the Artist information and repeat it 300 times to match the lyric embeddings length. Then we reshape the artist input and finally concatenate it with the on the fly embeddings for the lyrics
2) For the LSTM layer, we set the state size and return_sequences to True to output all the hidden states for each timestep of the text sequence. The we flatten the output to pass to the Batch Normalization layer.
2) We use Batch Normalization and Dropout of 20% to prevent overfitting as Batch Normalization helps to stabilize and speed up the training process by normalizing the activations and Dropout encourages a sparser network reliant on more independant neurons. 
3) The final layer is a softmax layer as we are in a multiclass classification problem and we want to predcit only one class
4) Similarly we use the categorical cross entropy loss function for the same reason as we have a multiclass classification problem
5) We use the Adam optimizer as it is the best performing and the models are evaluated on the accuracy metric
6) We use the Early Stopping callback to prevent overfitting and speed up the training process, as this monitors the validation loss and stops when there are no signinficant improvements.
7) We fit the model on the training data(the tokenized lyrics and the encoded artist), providing the batch size, number of epochs, the validation data(the tokenized lyrics and the encoded artist) and the early stopping callbacks

In [79]:
model = Sequential()

model.add(Embedding(max_nb_words, 100, input_length=MAX_SEQUENCE_LENGTH))
artist_input = Input(shape=(1,), name='artist_input')
artist_repeated = RepeatVector(300)(artist_input)
artist_reshaped = Reshape((300, 1))(artist_repeated)
concatenated = Concatenate(axis=-1)([model.output, artist_reshaped])

model.add(LSTM(state_size, return_sequences=True)) 
model.add(Flatten())
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))

model = Model(inputs=[model.input, artist_input], outputs=model.output)

model.compile(loss="categorical_crossentropy",
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

model.fit([train_data2, train_artist_encoded], labels_train2,
          batch_size=batch_size,
          epochs=20,
          validation_data=([val_data2, val_artist_encoded], labels_val2),
          callbacks=[early_stopping])

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 embedding_1_input (InputLa  [(None, 300)]                0         []                            
 yer)                                                                                             
                                                                                                  
 embedding_1 (Embedding)     (None, 300, 100)             2000000   ['embedding_1_input[0][0]']   
                                                                                                  
 lstm (LSTM)                 (None, 300, 10)              4440      ['embedding_1[0][0]']         
                                                                                                  
 flatten (Flatten)           (None, 3000)                 0         ['lstm[0][0]']          

<keras.src.callbacks.History at 0x2b4080541d0>

#### Multi-Layer LSTM Model:
Similarly for the multi layer LSTM model:

1) We use on the fly embeddings of size 100 for our tokenized text
2) Then we define the input layer for the Artist information and repeat it 300 times to match the lyric embeddings length. Then we reshape the artist input and finally concatenate it with the on the fly embeddings for the lyrics
2) For the LSTM layer, we set the state size to 10 and return_sequences to True to output all the hidden states for each timestep of the text sequence. The we flatten the output to pass to the Batch Normalization layer.
3) We use Batch Normalization and Dropout of 20% to prevent overfitting as Batch Normalization helps to stabilize and speed up the training process by normalizing the activations and Dropout encourages a sparser network reliant on more independant neurons.
4) Then we add the 2nd LSTM layer with a state size of 10.
5) The final layer is a softmax layer as we are in a multiclass classification problem and we want to predcit only one class
6) Similarly we use the categorical cross entropy loss function for the same reason as we have a multiclass classification problem
7) We use the Adam optimizer as it is the best performing and the models are evaluated on the accuracy metric
8) We use the Early Stopping callback to prevent overfitting and speed up the training process, as this monitors the validation loss and stops when there are no signinficant improvements.
9) We fit the model on the training data, providing the batch size, number of epochs, the validation data and the early stopping callbacks

In [None]:
model = Sequential()

model.add(Embedding(max_nb_words, 100))
artist_input = Input(shape=(1,), name='artist_input')
artist_repeated = RepeatVector(300)(artist_input)
artist_reshaped = Reshape((300, 1))(artist_repeated)
concatenated = Concatenate(axis=-1)([model.output, artist_reshaped])

model.add(LSTM(10, return_sequences=True))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(LSTM(10))
model.add(Dense(num_classes, activation=tf.nn.softmax))

model = Model(inputs=[model.input, artist_input], outputs=model.output)

model.compile(loss="categorical_crossentropy",
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

model.fit([train_data2, train_artist_encoded], labels_train2,
          batch_size=batch_size,
          epochs=20,
          validation_data=([val_data2, val_artist_encoded], labels_val2),
          callbacks=[early_stopping])

### Embeddings
#### On the fly embeddings
For the on the fly embeddings model:
1) We use on the fly embeddings of size 100 for our tokenized text
2) Then we define the input layer for the Artist information and repeat it 300 times to match the lyric embeddings length. Then we reshape the artist input. 
2) We use GlobalAveragePooling1D to reduce the dimensionality of the embeddings to feed the output to the following Dense layers.
2) Then we concatenate the lyrics embeddings and the artist input
3) Then we use a dense layer of 64 connected neurons with the relu activation function
3) We use Dropout of 20% to prevent overfitting as it encourages a sparser network reliant on more independant neurons.
3) The final layer is a softmax layer as we are in a multiclass classification problem and we want to predcit only one class
4) Similarly we use the categorical cross entropy loss function for the same reason as we have a multiclass classification problem
5) We use the Adam optimizer as it is the best performing and the models are evaluated on the accuracy metric
6) We use the Early Stopping callback to prevent overfitting and speed up the training process, as this monitors the validation loss and stops when there are no signinficant improvements.
7) We fit the model on the training data, providing the batch size, number of epochs, the validation data and the early stopping callbacks

In [107]:
lyrics_input = Input(shape=(300,), name='lyrics_input')
artist_input = Input(shape=(train_artist_encoded.shape[1],), name='artist_input')

embedding_layer = Embedding(max_nb_words, 100, input_length=300)(lyrics_input)
global_avg_pooling_lyrics = GlobalAveragePooling1D()(embedding_layer)

concatenated = Concatenate(axis=-1)([global_avg_pooling_lyrics, artist_input])

dense1 = Dense(64, activation='relu')(concatenated)
dropout1 = Dropout(0.2)(dense1) 
output_layer = Dense(num_classes, activation='softmax')(dropout1)

model = Model(inputs=[lyrics_input, artist_input], outputs=output_layer)

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

model.fit([train_data2, train_artist_encoded], labels_train2,
          epochs=20,
          validation_data=([val_data2, val_artist_encoded], labels_val2),
          callbacks=[early_stopping])





Epoch 1/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 29ms/step - accuracy: 0.3495 - loss: 19.2723 - val_accuracy: 0.3502 - val_loss: 1.8395
Epoch 2/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 34ms/step - accuracy: 0.3930 - loss: 2.0905 - val_accuracy: 0.4178 - val_loss: 2.2095
Epoch 3/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 38ms/step - accuracy: 0.3965 - loss: 2.0586 - val_accuracy: 0.4170 - val_loss: 2.0609
Epoch 4/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 35ms/step - accuracy: 0.4123 - loss: 1.9551 - val_accuracy: 0.4168 - val_loss: 1.6852
Epoch 5/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 35ms/step - accuracy: 0.4299 - loss: 1.9100 - val_accuracy: 0.3951 - val_loss: 1.8047
Epoch 6/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 36ms/step - accuracy: 0.4546 - loss: 1.7957 - val_accuracy: 0.3803 - val_loss: 2.3049
Epo

<keras.src.callbacks.history.History at 0x2b7c714f8d0>

#### Pretrained embeddings

As previously done, we load the pre trained embeddings model from Tensorflow hub and use it to create our embeddings for the Lyrics and the Artist.

In [103]:
import tensorflow_hub as hub

embeding_model = hub.KerasLayer("https://tfhub.dev/google/nnlm-en-dim50/2")
embeddings_train2 = embeding_model(train2.Lyrics)
embeddings_val2 = embeding_model(val2.Lyrics)
embeddings_artist_train = embeding_model(train2.Artist)
embeddings_artist_val = embeding_model(val2.Artist)

For the pre trained embeddings model:
1) We concatenate the artist and the lyrics embeddings gotten from the pre trained model.
2) Then we use a dense layer of 64 connected neurons with the relu activation function
3) We use Dropout of 20% to prevent overfitting as it encourages a sparser network reliant on more independant neurons.
3) The final layer is a softmax layer as we are in a multiclass classification problem and we want to predcit only one class
4) Similarly we use the categorical cross entropy loss function for the same reason as we have a multiclass classification problem
5) We use the Adam optimizer as it is the best performing and the models are evaluated on the accuracy metric
6) We use the Early Stopping callback to prevent overfitting and speed up the training process, as this monitors the validation loss and stops when there are no signinficant improvements.
7) We fit the model on the training data that has been embedded using the pre trained embeddings, providing the batch size, number of epochs, the validation data that has also been embedded using the pre trained embeddings and the early stopping callbacks

In [104]:
lyrics_input = Input(shape=(50,), name='lyrics_input')
artist_input = Input(shape=(50,), name='artist_input')

concatenated = Concatenate(axis=-1)([lyrics_input, artist_input])
dense1 = Dense(64, activation='relu')(concatenated)
dropout1 = Dropout(0.5)(dense1)  
output_layer = Dense(num_classes, activation='softmax')(dropout1)

model = Model(inputs=[lyrics_input, artist_input], outputs=output_layer)

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

model.fit([embeddings_train2, embeddings_artist_train], labels_train2, 
          epochs=20, 
          validation_data=([embeddings_val2, embeddings_artist_val], labels_val2), 
          callbacks=[early_stopping])


Epoch 1/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.4679 - loss: 1.4960 - val_accuracy: 0.5660 - val_loss: 1.1999
Epoch 2/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.5603 - loss: 1.2353 - val_accuracy: 0.5900 - val_loss: 1.1537
Epoch 3/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.5810 - loss: 1.1820 - val_accuracy: 0.5928 - val_loss: 1.1277
Epoch 4/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.5920 - loss: 1.1555 - val_accuracy: 0.6140 - val_loss: 1.1018
Epoch 5/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.5995 - loss: 1.1322 - val_accuracy: 0.6200 - val_loss: 1.0815
Epoch 6/20
[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.6056 - loss: 1.1202 - val_accuracy: 0.6338 - val_loss: 1.0584
Epoch 7/20
[1m

<keras.src.callbacks.history.History at 0x1679eedbfd0>

#### Traditional text approach

We use TF-IDF to transform the text into a numerical representation. Then we feed that embedding to a Logistic Regression Model used for classification

In [80]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)#we define the TF-IDFVectorizer using a maximum features of 5000

combined_lyrics_artist_train = train2.Lyrics + ' ' + train2.Artist#we combine the lyrics and the artist information into a single vector
combined_lyrics_artist_val = val2.Lyrics + ' ' + val2.Artist

tfidf_train2 = tfidf_vectorizer.fit_transform(combined_lyrics_artist_train)#we fit and transform the combined lyrics and artist training and validation sets
tfidf_val2 = tfidf_vectorizer.transform(combined_lyrics_artist_val)

In [44]:
clf_tfidf = LogisticRegression(max_iter=1000)#we use the Logistic Regression Model and set the max-iter parameter to speed up training
clf_tfidf.fit(tfidf_train2, train_labels2)
preds_tfidf2 = clf_tfidf.predict(tfidf_val2)#we make predictions on our validation set
accuracy_tfidf = accuracy_score(val_labels2, preds_tfidf2)#we calculate the accuracy of those predictions
print(f"TF-IDF Accuracy: {accuracy_tfidf}")

TF-IDF Accuracy: 0.622781320006893



### CNN for Text Classification
#### CNNs with same kernel sizes:

For the CNN with same kernel size model:
1) We use on the fly embeddings of size 100 for our tokenized text
2) Then we define the input layer for the Artist information and repeat it 300 times to match the lyric embeddings length. Then we reshape the artist input and finally concatenate it with the on the fly embeddings for the lyrics
2) Then we use a 1 Dimension Convolutional layer, with 25 different filters, a kernel size of 3, no additional padding and a stride of 1 to capture more local features
3) Then we use Batch Normalization to prevent overfitting as it helps to stabilize and speed up the training process by normalizing the data.
4) We use the relu activation function, followed by Max Pooling to reduce the size of our feature maps
5) We use Dropout of 20% to prevent overfitting as it encourages a sparser network reliant on more independant neurons. 
6) We repeat this structure to have 3 blocks of convolutional features.
7) Then we flatten the output and feed it to a dense layer of 50 connected neurons using the relu activation function
8) The final layer is a softmax layer as we are in a multiclass classification problem and we want to predcit only one class
9) Similarly we use the categorical cross entropy loss function for the same reason as we have a multiclass classification problem
10) We use the Adam optimizer as it is the best performing and the models are evaluated on the accuracy metric
11) We use the Early Stopping callback to prevent overfitting and speed up the training process, as this monitors the validation loss and stops when there are no signinficant improvements.
12) We fit the model on the training data, providing the batch size, number of epochs, the validation data and the early stopping callbacks

In [81]:
model = Sequential()

model.add(Embedding(max_nb_words, 100, input_length=MAX_SEQUENCE_LENGTH))
artist_input = Input(shape=(1,), name='artist_input')
artist_repeated = RepeatVector(300)(artist_input)
artist_reshaped = Reshape((300, 1))(artist_repeated)
concatenated = Concatenate(axis=-1)([model.output, artist_reshaped])

model.add(Conv1D(filters=25, kernel_size=3, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv1D(filters=25, kernel_size=3, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv1D(filters=30, kernel_size=3, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))

model.add(Flatten()) 
model.add(Dense(50, activation='relu'))
model.add(Dense(num_classes, activation=tf.nn.softmax))

model = Model(inputs=[model.input, artist_input], outputs=model.output)

model.compile(loss="categorical_crossentropy",
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

model.fit([train_data2, train_artist_encoded], labels_train2,
          batch_size=batch_size,
          epochs=20,
          validation_data=([val_data2, val_artist_encoded], labels_val2),
          callbacks=[early_stopping])


Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 embedding_3_input (InputLa  [(None, 300)]                0         []                            
 yer)                                                                                             
                                                                                                  
 embedding_3 (Embedding)     (None, 300, 100)             2000000   ['embedding_3_input[0][0]']   
                                                                                                  
 conv1d (Conv1D)             (None, 298, 25)              7525      ['embedding_3[0][0]']         
                                                                                                  
 batch_normalization_3 (Bat  (None, 298, 25)              100       ['conv1d[0][0]']       

<keras.src.callbacks.History at 0x2b40b3fa0d0>

#### CNNs with different kernel sizes:

For the CNN with different kernel size model, we vary the kernel size for each of the convolutional layers, starting from 3 to 5, to capture more general features in the further layers:
1) We use on the fly embeddings of size 100 for our tokenized text
2) Then we define the input layer for the Artist information and repeat it 300 times to match the lyric embeddings length. Then we reshape the artist input and finally concatenate it with the on the fly embeddings for the lyrics
2) Then we use a 1 Dimension Convolutional layer, with 25 different filters, a kernel size of 3, no additional padding and a stride of 1 to capture more local features
3) Then we use Batch Normalization to prevent overfitting as it helps to stabilize and speed up the training process by normalizing the data.
4) We use the relu activation function, followed by Max Pooling to reduce the size of our feature maps
5) We use Dropout of 20% to prevent overfitting as it encourages a sparser network reliant on more independant neurons. 
6) We repeat this structure to have 3 blocks of convolutional features, varying the kernel sizes going from 3 to 5, to capture more general features in the further layers
7) Then we flatten the output and feed it to a dense layer of 50 connected neurons using the relu activation function
8) The final layer is a softmax layer as we are in a multiclass classification problem and we want to predcit only one class
9) Similarly we use the categorical cross entropy loss function for the same reason as we have a multiclass classification problem
10) We use the Adam optimizer as it is the best performing and the models are evaluated on the accuracy metric
11) We use the Early Stopping callback to prevent overfitting and speed up the training process, as this monitors the validation loss and stops when there are no signinficant improvements.
12) We fit the model on the training data, providing the batch size, number of epochs, the validation data and the early stopping callbacks

In [83]:
model = Sequential()

model.add(Embedding(max_nb_words, 100, input_length=MAX_SEQUENCE_LENGTH))
artist_input = Input(shape=(1,), name='artist_input')
artist_repeated = RepeatVector(300)(artist_input)
artist_reshaped = Reshape((300, 1))(artist_repeated)
concatenated = Concatenate(axis=-1)([model.output, artist_reshaped])

model.add(Conv1D(filters=25, kernel_size=3, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=(2)))
model.add(Dropout(0.2)) 

model.add(Conv1D(filters=25, kernel_size=4, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=(2)))
model.add(Dropout(0.2)) 

model.add(Conv1D(filters=30, kernel_size=5, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=(2)))
model.add(Dropout(0.2)) 

model.add(Flatten()) 
model.add(Dense(50, activation='relu'))
model.add(Dense(num_classes, activation=tf.nn.softmax))

model = Model(inputs=[model.input, artist_input], outputs=model.output)

model.compile(loss="categorical_crossentropy",
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

model.fit([train_data2, train_artist_encoded], labels_train2,
          batch_size=batch_size,
          epochs=20,
          validation_data=([val_data2, val_artist_encoded], labels_val2),
          callbacks=[early_stopping])

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 embedding_5_input (InputLa  [(None, 300)]                0         []                            
 yer)                                                                                             
                                                                                                  
 embedding_5 (Embedding)     (None, 300, 100)             2000000   ['embedding_5_input[0][0]']   
                                                                                                  
 conv1d_6 (Conv1D)           (None, 298, 25)              7525      ['embedding_5[0][0]']         
                                                                                                  
 batch_normalization_9 (Bat  (None, 298, 25)              100       ['conv1d_6[0][0]']      

<keras.src.callbacks.History at 0x2b458c9c0d0>

#### CNN as an additional layer before a LSTM solution:
For the CNN was an additional layer before a LSTM solution, we keep the same kernel size as that performs better than the differnet kernel sizes. We try 2 different versions, the first being:
1) We use on the fly embeddings of size 100 for our tokenized text
2) Then we define the input layer for the Artist information and repeat it 300 times to match the lyric embeddings length. Then we reshape the artist input and finally concatenate it with the on the fly embeddings for the lyrics
2) Then we use a 1 Dimension Convolutional layer, with 25 different filters, a kernel size of 3, no additional padding and a stride of 1 to capture more local features
3) Then we use Batch Normalization to prevent overfitting as it helps to stabilize and speed up the training process by normalizing the data.
4) We use the relu activation function, followed by Max Pooling to reduce the size of our feature maps
5) We use Dropout of 20% to prevent overfitting as it encourages a sparser network reliant on more independant neurons. 
6) We repeat this structure to have 3 blocks of convolutional features
7) Then we add the LSTM model with a state size of 10, followed by Batch Normalization and Dropout to prevent overfitting
8) The final layer is a softmax layer as we are in a multiclass classification problem and we want to predcit only one class
9) Similarly we use the categorical cross entropy loss function for the same reason as we have a multiclass classification problem
10) We use the Adam optimizer as it is the best performing and the models are evaluated on the accuracy metric
11) We use the Early Stopping callback to prevent overfitting and speed up the training process, as this monitors the validation loss and stops when there are no signinficant improvements.
12) We fit the model on the training data, providing the batch size, number of epochs, the validation data and the early stopping callbacks

In [84]:
#1st version
model = Sequential()

model.add(Embedding(max_nb_words, 100, input_length=MAX_SEQUENCE_LENGTH))
artist_input = Input(shape=(1,), name='artist_input')
artist_repeated = RepeatVector(300)(artist_input)
artist_reshaped = Reshape((300, 1))(artist_repeated)
concatenated = Concatenate(axis=-1)([model.output, artist_reshaped])

model.add(Conv1D(filters=25, kernel_size=3, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=(2)))
model.add(Dropout(0.2))

model.add(Conv1D(filters=25, kernel_size=3, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=(2)))
model.add(Dropout(0.2)) 

model.add(Conv1D(filters=30, kernel_size=3, padding='valid', strides=1))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=(2)))
model.add(Dropout(0.2))  

model.add(LSTM(10))
model.add(BatchNormalization())
model.add(Dropout(0.2))  
model.add(Dense(num_classes, activation=tf.nn.softmax))

model = Model(inputs=[model.input, artist_input], outputs=model.output)

model.compile(loss="categorical_crossentropy",
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

model.fit([train_data2, train_artist_encoded], labels_train2,
          batch_size=batch_size,
          epochs=20,
          validation_data=([val_data2, val_artist_encoded], labels_val2),
          callbacks=[early_stopping])

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 embedding_6_input (InputLa  [(None, 300)]                0         []                            
 yer)                                                                                             
                                                                                                  
 embedding_6 (Embedding)     (None, 300, 100)             2000000   ['embedding_6_input[0][0]']   
                                                                                                  
 conv1d_9 (Conv1D)           (None, 298, 25)              7525      ['embedding_6[0][0]']         
                                                                                                  
 batch_normalization_12 (Ba  (None, 298, 25)              100       ['conv1d_9[0][0]']      

<keras.src.callbacks.History at 0x2b4592b41d0>

For the second version:
1) We use on the fly embeddings of size 100 for our tokenized text
2) Then we use a 1 Dimension Convolutional layer, with 25 different filters, a kernel size of 3, no additional padding and a stride of 1 to capture more local features
3) Then we use Batch Normalization to prevent overfitting as it helps to stabilize and speed up the training process by normalizing the data.
4) We use the relu activation function, followed by Max Pooling to reduce the size of our feature maps
5) We use Dropout of 20% to prevent overfitting as it encourages a sparser network reliant on more independant neurons. 
6) We repeat this structure to have 3 blocks of convolutional features
7) Then we flatten the output, concatente it with the artist input and reshape it to pass it to the LSTM model
7) We define LSTM model with a state size of 10, followed by Batch Normalization to prevent overfitting
8) The final layer is a softmax layer as we are in a multiclass classification problem and we want to predcit only one class
9) Similarly we use the categorical cross entropy loss function for the same reason as we have a multiclass classification problem
10) We use the Adam optimizer as it is the best performing and the models are evaluated on the accuracy metric
11) We use the Early Stopping callback to prevent overfitting and speed up the training process, as this monitors the validation loss and stops when there are no signinficant improvements.
12) We fit the model on the training data, providing the batch size, number of epochs, the validation data and the early stopping callbacks

In [87]:
#2nd version
from tensorflow.keras.layers import concatenate

lyrics_input = Input(shape=(300,), name='lyrics_input')
lyrics_embedding = Embedding(max_nb_words, 100, input_length=300)(lyrics_input)

conv1d_1 = Conv1D(filters=25, kernel_size=3, padding='valid', activation='relu')(lyrics_embedding)
batchnorm_1 = BatchNormalization()(conv1d_1)
maxpooling_1 = MaxPooling1D(pool_size=2)(batchnorm_1)

conv1d_2 = Conv1D(filters=25, kernel_size=3, padding='valid', activation='relu')(maxpooling_1)
batchnorm_2 = BatchNormalization()(conv1d_2)
maxpooling_2 = MaxPooling1D(pool_size=2)(batchnorm_2)

conv1d_3 = Conv1D(filters=30, kernel_size=3, padding='valid', activation='relu')(maxpooling_2)
batchnorm_3 = BatchNormalization()(conv1d_3)
maxpooling_3 = MaxPooling1D(pool_size=2)(batchnorm_3)

flatten = Flatten()(maxpooling_3)
artist_input = Input(shape=(1,), name='artist_input')
concatenated = concatenate([flatten, artist_input])
reshaped = Reshape((-1, 1))(concatenated)

lstm = LSTM(10)(reshaped)
batchnorm_lstm = BatchNormalization()(lstm)

output = Dense(num_classes, activation='softmax')(batchnorm_lstm)

model = Model(inputs=[lyrics_input, artist_input], outputs=output)

model.compile(loss="categorical_crossentropy",
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

model.fit([train_data2, train_artist_encoded], labels_train2,
          batch_size=batch_size,
          epochs=20,
          validation_data=([val_data2, val_artist_encoded], labels_val2),
          callbacks=[early_stopping])

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 lyrics_input (InputLayer)   [(None, 300)]                0         []                            
                                                                                                  
 embedding_8 (Embedding)     (None, 300, 100)             2000000   ['lyrics_input[0][0]']        
                                                                                                  
 conv1d_15 (Conv1D)          (None, 298, 25)              7525      ['embedding_8[0][0]']         
                                                                                                  
 batch_normalization_19 (Ba  (None, 298, 25)              100       ['conv1d_15[0][0]']           
 tchNormalization)                                                                          

<keras.src.callbacks.History at 0x2b459c89810>

#### Comparison to Non-Neural Methods

We compare our models to a traditional ML model: Decision Tree Classifier.

First we concatenate our tokenized lyrics and the encoded artist

In [89]:
train_data_with_artist = np.concatenate((train_data2, train_artist_encoded), axis=1)
val_data_with_artist = np.concatenate((val_data2, val_artist_encoded), axis=1)

We do a grid search to find the best hyperparmeters:

In [90]:
param_grid = {
    'max_depth': [4, 8], 
    'min_samples_split': [2, 5, 10],  
    'min_samples_leaf': [1, 2, 4], 
}

model = DecisionTreeClassifier(random_state=42)

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(train_data_with_artist, labels_train)

predictions = grid_search.predict(val_data_with_artist)# we make the predictions on the validation dataset

We get the accuracy and the best hyperparameters

In [91]:
accuracy = accuracy_score(labels_val, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.29691538859210753


In [92]:
print(grid_search.best_params_)
print(grid_search.best_score_)

{'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.35604486627563137


#### Preprocessing the test dataset to save in google drive

In [64]:
df_test=pd.read_csv('test.csv')

In [65]:
df_test.head()

Unnamed: 0,Song,Song year,Artist,Genre,Lyrics,Track_id
0,craftsmanship,2005,buck-65,Hip-Hop,Most folks spend their days daydreaming of fin...,8294
1,come-on-out,2012,the-elwins,Indie,Take your cold hands and put them on my face\n...,21621
2,riot,2013,bullet-for-my-valentine,Metal,Are you ready it's time for war\nWe'll break d...,3301
3,that-s-what-girls-do,2007,dream-street,Pop,You ask me why I change the color of my hair\n...,2773
4,believe-in-a-dollar,2012,cassidy,Hip-Hop,Do you believe in magic in a young girl's hear...,16797


We only take the Genre and Lyrics as our best neural network model was with lyrics only

In [66]:
sub_df_test = df_test.drop(["Song","Song year", "Artist", "Track_id"], axis= 1)
sub_df_test

Unnamed: 0,Genre,Lyrics
0,Hip-Hop,Most folks spend their days daydreaming of fin...
1,Indie,Take your cold hands and put them on my face\n...
2,Metal,Are you ready it's time for war\nWe'll break d...
3,Pop,You ask me why I change the color of my hair\n...
4,Hip-Hop,Do you believe in magic in a young girl's hear...
...,...,...
7930,Rock,Tuesday night - 7:30\nI hear a voice on the te...
7931,Metal,Elite forces cloaked in fur un sensitive to pa...
7932,Hip-Hop,[Dr. Dre]\nJourney with me\nInto the mind of a...
7933,Rock,You can a look a hurricane right in the eye.\n...


In [67]:
sub_df_test.dropna(inplace=True)

In [68]:
stop=stopwords.words("english")
sub_df_test["Lyrics"] = sub_df_test["Lyrics"].apply(lambda x: " ".join(word for word in x.split() if word not in stop))

In [69]:
sub_df_test["Lyrics"]=sub_df_test["Lyrics"].apply(lambda x:(re.sub(r"\n",' ',str(x))))
sub_df_test["Lyrics"]=sub_df_test["Lyrics"].apply(lambda x:(re.sub( r"[^\w\s]",'',str(x))))

In [70]:
sub_df_test["Lyrics"]=sub_df_test["Lyrics"].apply(lambda x: str(x).lower())

In [71]:
sub_df_test

Unnamed: 0,Genre,Lyrics
0,Hip-Hop,most folks spend days daydreaming finding clue...
1,Indie,take cold hands put face sharpen axe criminal ...
2,Metal,are ready time war well break fucking doors sm...
3,Pop,you ask i change color hair yeah you ask i nee...
4,Hip-Hop,do believe magic young girls heart how music f...
...,...,...
7930,Rock,tuesday night 730 i hear voice telephone doin...
7931,Metal,elite forces cloaked fur un sensitive pain bur...
7932,Hip-Hop,dr dre journey into mind maniac doomed killer ...
7933,Rock,you look hurricane right eye 1200 people dead ...


In [72]:
max_nb_words=20000
tokenizer= Tokenizer(num_words=max_nb_words)
tokenizer.fit_on_texts(sub_df_test.Lyrics)
test_sequences = tokenizer.texts_to_sequences(sub_df_test.Lyrics)

In [73]:
test_data=pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

print(test_data.shape)


(7935, 300)


In [74]:
labels = sub_df_test["Genre"]

In [75]:
le= LabelEncoder()
le.fit(labels)

test_labels=le.transform(labels)

print(le.classes_)
print(np.unique(test_labels, return_counts=True))

['Country' 'Electronic' 'Folk' 'Hip-Hop' 'Indie' 'Jazz' 'Metal' 'Pop'
 'R&B' 'Rock']
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([ 810,  660,  495,  960,  510,  660,  810, 1110,  510, 1410],
      dtype=int64))


In [76]:
labels_test = to_categorical(np.asarray(test_labels))

In [77]:
test_data

array([[   0,    0,    0, ..., 8916,  941,  942],
       [   0,    0,    0, ...,   46,   88,   25],
       [   0,    0,    0, ...,  538,   10, 1625],
       ...,
       [ 268, 2403, 2979, ...,  152, 1715,   54],
       [   0,    0,    0, ...,   38,  700,  785],
       [   0,    0,    0, ...,  115,  355,  412]])

In [78]:
labels_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [69]:
np.save('test_data.npy', test_data)

In [70]:
np.save('labels_test.npy', labels_test)

Our 2nd best model was the Pre trained Embeddings model, so we also have apply the pretrained embeddings on the test set and save them to Google Drive:

In [98]:
embeding_model = hub.KerasLayer("https://tfhub.dev/google/nnlm-en-dim50/2")
embeddings_test = embeding_model(sub_df_test.Lyrics)

In [99]:
embeddings_test

<tf.Tensor: shape=(7935, 50), dtype=float32, numpy=
array([[ 1.074597  , -0.5630959 , -0.63586074, ..., -0.27870524,
         1.1905441 ,  0.56676865],
       [ 0.8727184 , -0.20948684, -0.8824672 , ..., -0.6039505 ,
         0.7928976 ,  0.17126192],
       [ 0.8540335 , -0.65630573, -1.1608652 , ...,  0.10664905,
         1.0214158 , -0.0730844 ],
       ...,
       [ 1.1986363 , -1.87084   , -1.2815093 , ...,  0.79785144,
         2.1023715 ,  0.3782531 ],
       [ 0.48767015, -0.36326748, -0.82148474, ...,  0.07695395,
         0.5876057 , -0.02761501],
       [ 0.49153772,  0.4132342 , -0.5963266 , ..., -0.42785105,
         0.00440266,  0.81977504]], dtype=float32)>

In [100]:
np.save('embeddings_test.npy', embeddings_test)