# Imports

In [2]:
import torch
import torch.nn as nn
import pandas as pd

# Short aliases for torch.Tensor and nn.Module.
T = torch.Tensor
M = nn.Module


from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    trainers,
    Tokenizer,
)
import json

# Tokenization for Lyric Data

We start by loading the json file containing the lyrics data.

In [3]:
# Location of the section-wise lyrics JSON, relative to the notebook's working directory.
songs_file_path = "../data/songs_section_wise.json"

# JSON text is UTF-8; pass the encoding explicitly so decoding does not
# depend on the platform's locale default.
with open(songs_file_path, "r", encoding="utf-8") as f:
    songs = json.load(f)

Next, create a list of all the lyrics in the dataset.

In [5]:
def fetch_all_lyrics(songs_file_path):
    """Collect the lyrics of every section of every song in a JSON file.

    The file must contain a JSON object whose values are lists of section
    objects, each with a "lyrics" entry. The object's keys are ignored.

    Args:
        songs_file_path: Path to the JSON file to read.

    Returns:
        A flat list with one "lyrics" value per section, in file order
        (song by song, then section by section). Empty if there are no
        sections.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a section has no "lyrics" entry.
    """
    # Explicit UTF-8 so non-ASCII lyrics decode the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(songs_file_path, "r", encoding="utf-8") as f:
        songs = json.load(f)

    return [
        section["lyrics"]
        for sections in songs.values()
        for section in sections
    ]

# Flat list of every section's lyrics; passed to the tokenizer trainer as its corpus.
lyrics = fetch_all_lyrics(songs_file_path)

Now tokenization can begin. For this, I use the Hugging Face `tokenizers` library, a standalone library (also used under the hood by Transformers) that provides a variety of tokenization models. Here, I train a `WordPiece` model, the algorithm used by BERT-family models such as BERT, DistilBERT, and ELECTRA. Have a look at this [chapter](https://huggingface.co/learn/nlp-course/en/chapter6/6) from the Hugging Face course to learn more about this.

In [10]:
# Trainer: vocabulary capped at 10,000 entries, with five special tokens reserved.
trainer = trainers.WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=10000,
)

# WordPiece model; out-of-vocabulary pieces map to "[UNK]".
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# Text clean-up applied before tokenization: NFD, lowercase, strip accents.
tokenizer.normalizer = normalizers.Sequence(
    [
        normalizers.NFD(),
        normalizers.Lowercase(),
        normalizers.StripAccents(),
    ]
)
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
tokenizer.decoder = decoders.WordPiece()

# Fit the vocabulary on the lyrics corpus, then persist the result.
tokenizer.train_from_iterator(lyrics, trainer=trainer)
tokenizer.save("../data/tokenizer_eng_lyrics.json")






# Tokenization for English and French Translation Data

The same process can be followed for the English and French translation data.

In [None]:
# English/French translation table; its "eng" and "fr" columns are read by the helper functions below.
df = pd.read_csv("../data/eng_french.csv")


def get_english_training_corpus():
    """Return the "eng" column of the module-level ``df`` as a plain Python list."""
    english_column = df["eng"]
    return english_column.to_list()


def get_french_training_corpus():
    """Return the "fr" column of the module-level ``df`` as a plain Python list."""
    french_column = df["fr"]
    return french_column.to_list()

In [None]:
# Trainer: vocabulary capped at 10,000 entries, with five special tokens reserved.
trainer = trainers.WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=10000,
)

# WordPiece model; out-of-vocabulary pieces map to "[UNK]".
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# Text clean-up applied before tokenization: NFD, lowercase, strip accents.
tokenizer.normalizer = normalizers.Sequence(
    [
        normalizers.NFD(),
        normalizers.Lowercase(),
        normalizers.StripAccents(),
    ]
)
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
tokenizer.decoder = decoders.WordPiece()

# Fit the vocabulary on the English sentences, then persist the result.
tokenizer.train_from_iterator(get_english_training_corpus(), trainer=trainer)

# NOTE(review): this writes under "data/", while the CSV is read from
# "../data/" -- confirm which directory is intended.
tokenizer.save("data/tokenizer_eng.json")

In [None]:
# Trainer: vocabulary capped at 10,000 entries, with five special tokens reserved.
trainer = trainers.WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=10000,
)

# WordPiece model; out-of-vocabulary pieces map to "[UNK]".
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# NOTE(review): StripAccents is applied to the French text as well, so
# diacritics are removed before training -- confirm this is intended.
tokenizer.normalizer = normalizers.Sequence(
    [
        normalizers.NFD(),
        normalizers.Lowercase(),
        normalizers.StripAccents(),
    ]
)
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
tokenizer.decoder = decoders.WordPiece()

# Fit the vocabulary on the French sentences, then persist the result.
tokenizer.train_from_iterator(get_french_training_corpus(), trainer=trainer)

# NOTE(review): this writes under "data/", while the CSV is read from
# "../data/" -- confirm which directory is intended.
tokenizer.save("data/tokenizer_fr.json")

# Loading the Tokenizer

Hugging Face makes it very easy to load a saved tokenizer: use `Tokenizer.from_file` with the path of the JSON file written by `tokenizer.save`.

In [None]:
# Reload the English tokenizer from the same relative path the training cell saved it to.
tokenizer = Tokenizer.from_file("data/tokenizer_eng.json")