# Lyrics Classifier research notebook
This notebook attempts to classify the genre of songs based on their lyrics.

In [None]:
# Colab-only: mount Google Drive at /content/drive so the dataset CSVs read
# later in this notebook (under /content/drive/MyDrive/datasets) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install matplotlib
!pip install numpy
!pip install pandas
!pip install torch
!pip install accelerate -U

### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from transformers import DistilBertTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
from datasets import Dataset
import evaluate
import torch

## Data

Lyrics - [Kaggle](https://www.kaggle.com/datasets/notshrirang/spotify-million-song-dataset/)
**BIG** dataset containing only the artist, the song name, a link and the lyrics text (no genre or audio features).

In [None]:
# Read the lyrics CSV and discard the unused `link` column in one expression,
# then report the resulting shape and column names.
lyrics_data = (
    pd.read_csv('/content/drive/MyDrive/datasets/lyrics_data.csv')
    .drop(columns=['link'])
)
print('lyrics shape:', lyrics_data.shape)
lyrics_data.columns

lyrics shape: (57650, 3)


Index(['artist', 'song', 'text'], dtype='object')

Meta dataset #1 - [Kaggle](https://www.kaggle.com/datasets/maharshipandya/-spotify-tracks-dataset)

Detailed track information.  
We have to clean it up.

In [None]:
# Load meta dataset #1 and tidy it in a single chain:
#   1. keep one row per (track_name, artists) pair,
#   2. remove the named columns that are not used,
#   3. remove whichever column is first at that point (positional drop),
#   4. rename the key columns so they match the lyrics dataset.
meta_data = (
    pd.read_csv('/content/drive/MyDrive/datasets/meta_data_1.csv')
    .drop_duplicates(subset=['track_name', 'artists'])
    .drop(columns=['track_id', 'album_name', 'time_signature', 'popularity', 'explicit', 'mode'])
    .pipe(lambda frame: frame.drop(frame.columns[0], axis=1))
    .rename(columns={'track_name': 'song', 'artists': 'artist'})
)
print('meta #1 shape:', meta_data.shape)
meta_data.columns

meta #1 shape: (81344, 14)


Index(['artist', 'song', 'duration_ms', 'danceability', 'energy', 'key',
       'loudness', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'track_genre'],
      dtype='object')

Meta dataset #2 - [Kaggle](https://www.kaggle.com/datasets/salvatorerastelli/spotify-and-youtube)

Pretty much the same as the first meta dataset, I hope it contains more data. (probably not...)

In [None]:
# Load meta dataset #2 and tidy it: de-duplicate on (Track, Artist), remove the
# unused named columns, remove the first remaining column by position, rename
# `Track` to `song`, and finally lower-case every column name.
meta_data2 = (
    pd.read_csv('/content/drive/MyDrive/datasets/meta_data_2.csv')
    .drop_duplicates(subset=['Track', 'Artist'])
    .drop(columns=['Url_spotify', 'Album', 'Album_type', 'Uri', 'Url_youtube', 'Channel', 'Views', 'Likes', 'Comments', 'Description', 'Licensed', 'official_video', 'Stream', 'Title'])
    .pipe(lambda frame: frame.drop(frame.columns[0], axis=1))
    .rename(columns={'Track': 'song'})
    .rename(columns=str.lower)
)
meta_data2.columns

Index(['artist', 'song', 'danceability', 'energy', 'key', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms'],
      dtype='object')

# Combining datasets

We have our separate datasets: one for lyrics, two for other data  
Now we try to join them on the artist and song name (exact string match).

In [None]:
# Join the lyrics with each metadata table on the (artist, song) pair and
# report how many rows each join produced.
join_keys = ['artist', 'song']
merge1 = lyrics_data.merge(meta_data, on=join_keys)
merge2 = lyrics_data.merge(meta_data2, on=join_keys)
for tag, frame in (('merge1', merge1), ('merge2', merge2)):
    print(f'{tag} size:', frame.shape)

merge1 size: (1127, 15)
merge2 size: (1037, 14)


In [None]:
# Only the join with meta dataset #1 is used. merge2 is not concatenated,
# presumably because meta dataset #2 has no `track_genre` column, so its rows
# would carry no label.
# `.copy()` gives `concat` its own data: the genre column is overwritten in
# place further down, and without the copy that write would also change `merge1`.
concat = merge1.copy()

Narrowing down the meaningless made up words into real genres

In [None]:
# Fine-grained dataset genres, grouped under the coarse genre they collapse to.
# NOTE(review): 'garage' collapses to 'edm' while 'edm' itself collapses to
# 'pop', so a single pass leaves garage tracks labelled 'edm' - confirm intent.
_GENRE_GROUPS = {
    'rock': ['hard-rock', 'psych-rock', 'j-rock', 'goth', 'alt-rock', 'german'],
    'pop': ['synth-pop', 'power-pop', 'indie-pop', 'j-pop', 'swedish', 'british',
            'piano', 'latin', 'electro', 'electronic', 'world-music', 'edm'],
    'metal': ['grunge', 'death-metal', 'black-metal', 'metalcore', 'classical', 'hardcore'],
    'rock-n-roll': ['rockabilly', 'r-n-b'],
    'dance': ['j-dance'],
    'edm': ['garage'],
    'reggae': ['dancehall', 'ska', 'dub', 'children'],
    'folk': ['bluegrass'],
    'punk': ['punk-rock', 'alternative', 'emo', 'guitar'],
    'blues': ['funk', 'singer-songwriter'],
    'country': ['honky-tonk'],
}

# Flat lookup table: fine-grained genre -> coarse genre.
mapping = {
    fine: coarse
    for coarse, fine_genres in _GENRE_GROUPS.items()
    for fine in fine_genres
}

def collapse_genres(genre):
    """Return the coarse genre for `genre`, or `genre` itself if it is not in `mapping`."""
    return mapping.get(genre, genre)

# Collapse the genre column and report the number of distinct genres before
# and after. The column is overwritten in place, so running this cell a second
# time applies the mapping again to already-collapsed values.
genres_before = concat['track_genre'].unique().size
concat['track_genre'] = concat['track_genre'].map(collapse_genres)
genres_after = concat['track_genre'].unique().size
print('genres before collapsing:', genres_before)
print('genres after collapsing:', genres_after)

genres before collapsing: 59
genres after collapsing: 20


Discard genres that have 50 or fewer songs — too few examples to learn from.

In [None]:
# A genre must have MORE than this many songs to be kept.
MIN_SONGS_PER_GENRE = 50

# Number of songs per (collapsed) genre.
count = Counter(concat['track_genre'])

# Look up every row's genre frequency in one vectorised pass instead of a
# row-wise `DataFrame.apply`; this is faster and also works on an empty frame.
is_frequent = concat['track_genre'].map(count) > MIN_SONGS_PER_GENRE
pruned = concat[is_frequent]
print('shape:', pruned.shape)
pruned['track_genre'].unique()

shape: (831, 15)


array(['pop', 'rock', 'metal', 'blues', 'country'], dtype=object)

Not the optimal amount of data to train with...   

Let's chop up each song into verses.

In [None]:
def clean_text(t: str) -> str:
    """Flatten a lyrics fragment to a single line.

    Every run of whitespace (spaces, tabs, `\\r\\n`, bare `\\n` or `\\r`) is
    replaced by one space, and leading/trailing whitespace is removed. The
    previous implementation only replaced `\\r\\n`, so other line endings and
    runs of spaces were left in the text.

    Args:
        t: Raw lyrics text.

    Returns:
        The text on one line with single spaces between words.
    """
    # str.split() with no separator splits on any whitespace run and drops
    # empty pieces, so joining with ' ' normalises all spacing in one step.
    return ' '.join(t.split())

# Turn each song into one row per verse.
# Verses are separated by a line containing only two spaces ('\r\n  \r\n').
# After cleaning, two kinds of rows are removed:
#   * empty fragments left over from the split;
#   * verses repeated within the same song (e.g. a chorus). Identical
#     text+label rows would otherwise be able to land in both the training and
#     the test split and inflate the measured accuracy.
verse_data = (
    pruned
    .assign(new_text=pruned['text'].str.split('\r\n  \r\n'))
    .drop(columns=['text'])
    .explode('new_text')
    .rename(columns={'new_text': 'text'})
    .assign(text=lambda frame: frame['text'].apply(clean_text))
    .loc[lambda frame: frame['text'] != '']
    .drop_duplicates(subset=['artist', 'song', 'text'])
)
verse_data

Unnamed: 0,artist,song,duration_ms,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre,text
0,ABBA,"Andante, Andante",278213,0.523,0.361,10,-10.718,0.0238,0.6840,0.000348,0.0671,0.380,101.887,pop,"Take it easy with me, please Touch me gently..."
0,ABBA,"Andante, Andante",278213,0.523,0.361,10,-10.718,0.0238,0.6840,0.000348,0.0671,0.380,101.887,pop,Make your fingers soft and light Let your bo...
0,ABBA,"Andante, Andante",278213,0.523,0.361,10,-10.718,0.0238,0.6840,0.000348,0.0671,0.380,101.887,pop,I'm your music (I am your music and I am you...
0,ABBA,"Andante, Andante",278213,0.523,0.361,10,-10.718,0.0238,0.6840,0.000348,0.0671,0.380,101.887,pop,There's a shimmer in your eyes Like the feel...
0,ABBA,"Andante, Andante",278213,0.523,0.361,10,-10.718,0.0238,0.6840,0.000348,0.0671,0.380,101.887,pop,I'm your music (I am your music and I am you...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1124,Within Temptation,Stand My Ground,267986,0.271,0.866,5,-4.072,0.0578,0.0489,0.000762,0.1160,0.127,175.665,rock,Stand my ground I won't give in No more de...
1124,Within Temptation,Stand My Ground,267986,0.271,0.866,5,-4.072,0.0578,0.0489,0.000762,0.1160,0.127,175.665,rock,All I know for sure is that I'm trying I wil...
1124,Within Temptation,Stand My Ground,267986,0.271,0.866,5,-4.072,0.0578,0.0489,0.000762,0.1160,0.127,175.665,rock,"Stand my ground I won't give in, (I won't gi..."
1124,Within Temptation,Stand My Ground,267986,0.271,0.866,5,-4.072,0.0578,0.0489,0.000762,0.1160,0.127,175.665,rock,Stand my ground I won't give in No more de...


I expected more

In [None]:
# Keep only what the classifier needs: the genre (renamed to `label`) and the
# verse text.
stripped = verse_data.rename(columns={'track_genre': 'label'})[['label', 'text']]
stripped

Unnamed: 0,label,text
0,pop,"Take it easy with me, please Touch me gently..."
0,pop,Make your fingers soft and light Let your bo...
0,pop,I'm your music (I am your music and I am you...
0,pop,There's a shimmer in your eyes Like the feel...
0,pop,I'm your music (I am your music and I am you...
...,...,...
1124,rock,Stand my ground I won't give in No more de...
1124,rock,All I know for sure is that I'm trying I wil...
1124,rock,"Stand my ground I won't give in, (I won't gi..."
1124,rock,Stand my ground I won't give in No more de...


Create ID mappings

In [None]:
# Number the genres in order of first appearance and build both lookup
# directions (id -> name and name -> id).
genre_names = stripped['label'].unique()
id2label = dict(enumerate(genre_names))
label2id = {genre: idx for idx, genre in id2label.items()}
print(id2label)
print(label2id)

{0: 'pop', 1: 'rock', 2: 'metal', 3: 'blues', 4: 'country'}
{'pop': 0, 'rock': 1, 'metal': 2, 'blues': 3, 'country': 4}


In [None]:
def map_to_id(genre):
    """Return the integer id that `label2id` assigns to `genre`."""
    return label2id[genre]

# Build a new frame whose `label` column holds the integer ids; `stripped`
# itself is left unchanged.
stripped_with_id = stripped.assign(label=stripped['label'].map(map_to_id))
stripped_with_id

Unnamed: 0,label,text
0,0,"Take it easy with me, please Touch me gently..."
0,0,Make your fingers soft and light Let your bo...
0,0,I'm your music (I am your music and I am you...
0,0,There's a shimmer in your eyes Like the feel...
0,0,I'm your music (I am your music and I am you...
...,...,...
1124,1,Stand my ground I won't give in No more de...
1124,1,All I know for sure is that I'm trying I wil...
1124,1,"Stand my ground I won't give in, (I won't gi..."
1124,1,Stand my ground I won't give in No more de...


# Model

In [None]:
# Tokenizer of the same checkpoint ('distilbert-base-uncased') that the
# classification model is loaded from later in this notebook.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
def preprocess(data):
    """Tokenize the `text` field of a batch, truncating to at most 50 tokens.

    No padding is applied here. The `DataCollatorWithPadding` passed to the
    Trainer pads each training batch to its own longest sequence, so padding
    inside `Dataset.map` as well would only store and process extra pad tokens.

    Args:
        data: A batch from `Dataset.map(..., batched=True)` with a `text` field.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(data['text'], truncation=True, max_length=50)

In [None]:
# Convert the labelled verses into a Hugging Face Dataset. The DataFrame index
# is carried along as the `__index_level_0__` feature (see the features printed
# in the output).
dataset = Dataset.from_pandas(stripped_with_id)
# Tokenize in batches; `preprocess` receives a dict of column -> list of values.
tokenized = dataset.map(preprocess, batched=True)

{'label': Value(dtype='int64', id=None), 'text': Value(dtype='string', id=None), '__index_level_0__': Value(dtype='int64', id=None)}


Map:   0%|          | 0/5720 [00:00<?, ? examples/s]

Split the data into train (80%) and test (20%) datasets.

In [None]:
# Hold out 20% of the verses for evaluation. The seed is fixed so the split,
# and therefore the reported accuracy, is the same on every run.
# NOTE(review): rows are verses, so verses of one song can end up in both
# splits; consider splitting by song (`__index_level_0__`) instead - confirm.
split = tokenized.train_test_split(test_size=0.2, seed=42)
split

DatasetDict({
    train: Dataset({
        features: ['label', 'text', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 4576
    })
    test: Dataset({
        features: ['label', 'text', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1144
    })
})

Accuracy metrics

In [None]:
# Load the `accuracy` metric; it is used by `compute_metrics` during evaluation.
accuracy = evaluate.load('accuracy')

In [None]:
def compute_metrics(eval):
    """Score a set of model outputs with the `accuracy` metric.

    Args:
        eval: A pair that unpacks into (per-class scores, reference label ids).

    Returns:
        Whatever `accuracy.compute` returns for the argmax predictions.
    """
    class_scores, references = eval
    # The predicted class is the index of the highest score along axis 1.
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Collator that pads the examples of a batch with the tokenizer's padding
# settings; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# DistilBERT with a fresh classification head. The number of output classes is
# derived from the label mapping rather than hard-coded, so it stays correct if
# the genre pruning ever keeps a different number of genres.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train the model

In [None]:
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases. The install in this notebook is not version-pinned, so use
# whichever name the installed TrainingArguments actually defines.
eval_strategy_arg = (
    "eval_strategy"
    if hasattr(TrainingArguments, "eval_strategy")
    else "evaluation_strategy"
)

# Evaluate and checkpoint once per epoch; the best checkpoint is reloaded when
# training finishes.
# NOTE(review): the evaluation set is the test split, so the "best" checkpoint
# is selected on the same data the accuracy is reported on - confirm intent.
training_args = TrainingArguments(
    output_dir="lyrics-classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    **{eval_strategy_arg: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.245257,0.479021
2,1.238500,1.094535,0.562937
3,1.238500,1.050081,0.59528
4,0.783900,1.029619,0.621503


TrainOutput(global_step=1144, training_loss=0.9586711430049443, metrics={'train_runtime': 166.8234, 'train_samples_per_second': 109.721, 'train_steps_per_second': 6.858, 'total_flos': 236798143296000.0, 'train_loss': 0.9586711430049443, 'epoch': 4.0})

# Save the trained model

In [None]:
!mkdir model

In [None]:
# Output locations for the trained weights (two file names) and the tokenizer.
model_path = 'model/lyrics-classifier.pth'
model_path_bin = 'model/lyrics-classifier.bin'
tokenizer_path = 'model/tokenizer'

# Write the same state dict under both file names.
for weights_file in (model_path, model_path_bin):
    torch.save(model.state_dict(), weights_file)

# Last expression: shows the list of tokenizer files that were written.
tokenizer.save_pretrained(tokenizer_path)

('model/tokenizer/tokenizer_config.json',
 'model/tokenizer/special_tokens_map.json',
 'model/tokenizer/vocab.txt',
 'model/tokenizer/added_tokens.json')

Test inference

In [None]:
# Rebuild the classifier from the saved files to check that they are usable.
test_tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_path)
# Same architecture as in training; the class count comes from the label
# mapping instead of a hard-coded 5.
test_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
# map_location="cpu" lets the weights load on a runtime without a GPU even if
# they were saved from a model that lived on one.
test_model.load_state_dict(torch.load(model_path, map_location="cpu"))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [None]:
# Wrap the reloaded model and tokenizer in a text-classification pipeline so
# raw lyric strings can be classified directly.
classifier = pipeline('text-classification', model=test_model, tokenizer=test_tokenizer)

In [None]:
# Sample verse for a manual sanity check: Imagine Dragons - Follow You
test = """
You know I got your number, number all night
I'm always on your team, I got your back, alright
Taking those, taking those losses if it treats you right
I wanna put you into the spotlight
If the world would only know
"""

In [None]:
# Classify the sample verse; the result is a list with the label and its score.
classifier(test)

[{'label': 'pop', 'score': 0.8231419920921326}]

In [None]:
# Second sample verse for a manual sanity check: AC/DC - Back in black
test2 = """
Back in black, I hit the sack
I've been too long, I'm glad to be back
Yes, I'm let loose from the noose
That's kept me hanging about
I'm just looking at the sky 'cause it's getting me high
"""
# Classify it and display the predicted label with its score.
classifier(test2)

[{'label': 'rock', 'score': 0.4482513964176178}]