# Imports

In [5]:
import pandas as pd
import torch
import torch.nn as nn

import pandas as pd

T = torch.Tensor
M = nn.Module


from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    trainers,
    Tokenizer,
)
import re
from pprint import pprint

# Tokenizing Lyrics

In [2]:
df = pd.read_csv("../data/taylor_lyrics_2.csv")
df.shape

(530, 2)

In [3]:
df.head()

Unnamed: 0,title,lyrics
0,Mine,"[Verse 1]\nYou were in college, working part-t..."
1,Back to December,[Verse 1]\nI'm so glad you made time to see me...
2,Thug Story,"[Intro: T-Pain]\nHey Hey, T-Swizzle (T-Swizzle..."
3,Speak Now,[Verse 1]\nI am not the kind of girl\nWho shou...
4,Haunted,[Verse 1]\nYou and I walk a fragile line\nI ha...


We have about 530 full lyrics of songs from Taylor Swift. some of them are duplicated and some are not exactly lyrics but are transcript of interviews etc. I have attemped to remove some duplicates and non-lyrics.

In [104]:
possible_different_version = df["title"].str.contains("Version")
from_the_vault = df["title"].str.contains("Taylors Version From the Vault")

print(f"Possible different version: {possible_different_version.sum()}")
print(f"From the vault: {from_the_vault.sum()}")
# remove possible different version but keep from the vault
to_remove = possible_different_version & ~from_the_vault
print(f"Removing {to_remove.sum()} rows")
df = df[~to_remove]
df.shape

Possible different version: 79
From the vault: 14
Removing 65 rows


(465, 2)

## Preprocessing

Since the lyrics are usually very long, we can not train the model on the full lyrics. We need to create subparts of the lyrics. One way to do this is to create a sliding window of fixed size and slide it over the lyrics. A better way would be to take out the different sections of the lyrics, like chorus, verse, bridge etc. and train the model on these sections. This way the model will learn the structure of the song and will be able to generate lyrics in the same structure. The dataset is organized in a way that the sections names are given in the lyrics. We can use this information to create the subparts. Here is the lyrics of `willow` from the dataset:

In [13]:
print(df.loc[429, "lyrics"])

[Verse 1]
I'm like the water when your ship rolled in that night
Rough on the surface, but you cut through like a knife
And if it was an open-shut case
I never would've known from that look on your face
Lost in your current like a priceless wine

[Chorus]
The more that you say, the less I know
Wherever you stray, I follow
I'm begging for you to take my hand
Wreck my plans, that's my man

[Verse 2]
Life was a willow and it bent right to your wind
Head on the pillow, I could feel you sneakin' in
As if you were a mythical thing
Like you were a trophy or a champion ring
And there was one prize I'd cheat to win

[Chorus]
The more that you say, the less I know
Wherever you stray, I follow
I'm begging for you to take my hand
Wreck my plans, that's my man
You know that my train could take you home
Anywhere else is hollow
I'm begging for you to take my hand
Wreck my plans, that's my man
[Bridge]
Life was a willow and it bent right to your wind
They count me out time and time again
Life was a wi

The following code cells extracts the section name and the lyrics of the section from the lyrics of the song.

In [105]:
def preprocess_lyric(lyric):
    song_section_name_re = re.compile(r"\[([a-zA-Z\s-]*):?\d*:?.*\]")
    song_section_spans = [i.span() for i in song_section_name_re.finditer(lyric)]
    if len(song_section_spans) == 0:
        print("No sections found. Using default section name 'verse'")
        # use \n\n to split
        sections = lyric.split("\n\n")
        song_sections = ["verse"] * len(sections)
        song_with_sections = {
            "sections": [
                {"section": song_section, "lyrics": section.strip()}
                for song_section, section in zip(song_sections, sections)
            ]
        }
        return song_with_sections["sections"]

    song_sections = song_section_name_re.findall(lyric)
    song_lyric_spans = [
        (song_section_spans[i][1], song_section_spans[i + 1][0])
        for i in range(len(song_section_spans) - 1)
    ]
    song_lyric_spans.append((song_section_spans[-1][1], len(lyric))) # last section

    song_with_sections = {
        "sections": [
            {
                "section":song_section.strip(),
                "lyrics": lyric[song_span[0] : song_span[1]].strip(),
            }
            for song_section, song_span in zip(song_sections, song_lyric_spans)
        ]
    }
    return song_with_sections["sections"]

In [106]:
songs_section_wise = {}
for idx, row in df.iterrows():
    try:
        songs_section_wise[row["title"]] = preprocess_lyric(row["lyrics"])
    except Exception as e:
        print(idx)
        raise

No sections found. Using default section name 'verse'
No sections found. Using default section name 'verse'
No sections found. Using default section name 'verse'
No sections found. Using default section name 'verse'
No sections found. Using default section name 'verse'
No sections found. Using default section name 'verse'
No sections found. Using default section name 'verse'
No sections found. Using default section name 'verse'
No sections found. Using default section name 'verse'
No sections found. Using default section name 'verse'
No sections found. Using default section name 'verse'
No sections found. Using default section name 'verse'
No sections found. Using default section name 'verse'
No sections found. Using default section name 'verse'
No sections found. Using default section name 'verse'
No sections found. Using default section name 'verse'
No sections found. Using default section name 'verse'
No sections found. Using default section name 'verse'
No sections found. Using def

In [107]:
import json
songs_file_path = "../data/songs_section_wise.json"
with open(songs_file_path, "w") as f:
    json.dump(songs_section_wise, f, indent=2)

# Tokenization

Now, we will load the dataset and tokenize the lyrics.

In [165]:
def fetch_all_lyrics(songs_file_path):
    with open(songs_file_path, "r") as f:
        songs = json.load(f)
    lyrics = []
    for _, sections in songs.items():

        for section in sections:
            lyrics.append(section["lyrics"])

    return lyrics


lyrics = fetch_all_lyrics(songs_file_path)
len(lyrics)

3544

In [160]:
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
tokenizer.decoder = decoders.WordPiece()
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)

trainer = trainers.WordPieceTrainer(
    vocab_size=10000, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)

tokenizer.train_from_iterator(lyrics, trainer=trainer)

# now save the tokenizer
tokenizer.save("../data/tokenizer_eng_lyrics.json")






In [161]:
lengths = [len(tokenizer.encode(lyric).ids) for lyric in lyrics]
max_len = max(lengths)
max_len

283

In [164]:
lengths = pd.Series(lengths)
lengths.describe()

count    3544.000000
mean       50.093115
std        31.762309
min         0.000000
25%        28.000000
50%        46.000000
75%        64.000000
max       283.000000
dtype: float64