In [1]:
import pandas as pd
import numpy as np

In [3]:
# log in to the Hugging Face Hub from inside the notebook
# (the training arguments further down set push_to_hub=True and the trainer pushes the model)
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Loading Dataset

In [4]:
# fetch the "miscjose/genius" dataset; the summary displayed in the next cell
# shows train/validation/test splits, each with 'title' and 'lyrics' columns
from datasets import load_dataset
songs = load_dataset("miscjose/genius")

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/3450 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/27596 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3450 [00:00<?, ? examples/s]

In [5]:
# display the splits with their column names and row counts
songs

DatasetDict({
    test: Dataset({
        features: ['title', 'lyrics'],
        num_rows: 3450
    })
    train: Dataset({
        features: ['title', 'lyrics'],
        num_rows: 27596
    })
    validation: Dataset({
        features: ['title', 'lyrics'],
        num_rows: 3450
    })
})

In [6]:
# peek at the first training example: its title, a blank gap, then its lyrics

print(f"Title: {songs['train'][0]['title']}")
print('\n')
print(f"Lyrics: {songs['train'][0]['lyrics']}")

Title: ring the alarm


Lyrics: Ring the alarm
I've been through this too long
But I'll be damned if I see another chick on your arm
Don't you ring the alarm
I've been through this too long
But I'll be damned if I see another chick on your arm
She gon' be rockin' chinchilla coats
If I let you go
Benz and the house off the coast
If I let you go
She gon' take everything I own
If I let you go
I can't let you go
Damned if I let you go
She gone rock them VVS stones
If I let you go (Coupes)
In the 'Bach carter rolls
If I let you go
She go profit everything I taught
If I let you go
I can't let you go
Damned if I let you go
Tell me how should I feel
When I know what I know
And my female intuition
Telling me you a dog
People told me 'bout the flames
I couldn't see through the smoke
When I need answers, accusations
What you mean you gone choke?
You can't stay, you gotta go
Ain't no other chick spending ya dough
This is taking a toll
The way the story unfolds
Not the picture perfect movie everyon

# Creating Metric Baseline

In [7]:
# load the ROUGE metric; it scores the baseline below and is reused in compute_metrics
import evaluate
rouge_score = evaluate.load("rouge")

In [8]:
# lead-3 baseline (here: the first 3 lines of each song's lyrics, scored against its title)

import nltk
from nltk.tokenize import sent_tokenize
# download the NLTK "punkt" package; sent_tokenize is used later in compute_metrics
nltk.download("punkt")


def get_baseline(dataset, metric, num_lines=3):
    """Score a "lead-N" baseline: the first lines of each lyric against its title.

    Args:
        dataset: Object exposing ``"lyrics"`` and ``"title"`` columns via
            ``dataset[column]``; ``"lyrics"`` must be an iterable of strings.
        metric: Object with a ``compute(predictions=..., references=...)`` method.
        num_lines: How many leading newline-separated lyric lines are kept as
            the pseudo-summary. Defaults to 3 (the classic lead-3 baseline).

    Returns:
        Whatever ``metric.compute`` returns for the truncated lyrics
        (predictions) against the titles (references).
    """
    # lyrics with fewer than ``num_lines`` lines are kept whole by the slice
    lead_summaries = [
        '\n'.join(lyrics.split('\n')[:num_lines]) for lyrics in dataset["lyrics"]
    ]
    return metric.compute(predictions=lead_summaries, references=dataset["title"])

# score the lead-3 baseline on the validation split and report ROUGE as percentages
score = get_baseline(songs["validation"], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = {name: round(score[name] * 100, 3) for name in rouge_names}

rouge_dict

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jose\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'rouge1': 5.896, 'rouge2': 2.412, 'rougeL': 5.819, 'rougeLsum': 5.865}

# Tokenization

In [9]:
# first load tokenizer

from transformers import AutoTokenizer

# the checkpoint name is reused later to load the model and to name the output directory
model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [10]:
# testing out the tokenizer on a short sentence

print('IDs and Attention')
inputs = tokenizer('Wow I love this Song!')
print(inputs)

# show how the sentence was split into sub-word tokens
print('\n')
print('Tokenization Method')
print(tokenizer.convert_ids_to_tokens(inputs.input_ids))
print('\n')
# check max length of input; the value recorded for this checkpoint (~1e30) is not a
# practical limit, which is why explicit limits are chosen in the next cell
print('Max Input Length: {}'.format(tokenizer.model_max_length))

IDs and Attention
{'input_ids': [65801, 336, 3869, 714, 12554, 309, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


Tokenization Method
['▁Wow', '▁I', '▁love', '▁this', '▁Song', '!', '</s>']


Max Input Lenght: 1000000000000000019884624838656


In [11]:
# specify limits for input and output; preprocess_function below passes both to the
# tokenizer as max_length together with truncation=True

max_input_length = 512
max_target_length = 10

# define function to map to train, validation, and test datasets
def preprocess_function(examples):
    """Tokenize a batch of songs and attach the tokenized titles as labels.

    Lyrics are truncated to ``max_input_length`` and titles to
    ``max_target_length``; the title token ids are stored under ``"labels"``
    next to the token ids and attention mask produced for the lyrics.
    """
    # encoder side: the lyrics
    lyric_encoding = tokenizer(
        examples["lyrics"],
        truncation=True,
        max_length=max_input_length,
    )

    # target side: the titles
    title_encoding = tokenizer(
        examples["title"],
        truncation=True,
        max_length=max_target_length,
    )

    # expose the title ids as the extra "labels" feature
    lyric_encoding["labels"] = title_encoding["input_ids"]
    return lyric_encoding

# map to each dataset (train, validation and test); batched=True means
# preprocess_function receives batches of examples rather than single rows
tokenized_data = songs.map(preprocess_function, batched=True)

Map:   0%|          | 0/3450 [00:00<?, ? examples/s]

Map:   0%|          | 0/27596 [00:00<?, ? examples/s]

Map:   0%|          | 0/3450 [00:00<?, ? examples/s]

In [12]:
# compare the train split before and after tokenization (same change on every split)

print(f"Before:\n\t{songs['train']}")
print('\n')
print(f"After:\n\t{tokenized_data['train']}")

# tokenization added 3 columns: input_ids, attention_mask, labels

Before:
	Dataset({
    features: ['title', 'lyrics'],
    num_rows: 27596
})


After:
	Dataset({
    features: ['title', 'lyrics', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 27596
})


# Fine-Tuning

In [13]:
# load the pretrained seq2seq model from the same checkpoint as the tokenizer

from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# display the parameter count of the loaded model
model.num_parameters()

Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

300176768

In [14]:
# define hyperparameters and other arguments

from transformers import Seq2SeqTrainingArguments

batch_size = 32
# NOTE(review): the recorded training output further down reports a single epoch
# (global_step=863, epoch 1.0) — confirm whether this value was changed after that run
num_train_epochs = 5 
# show the training loss with every epoch:
# number of full batches in the train split (assumes a single device — TODO confirm)
logging_steps = len(tokenized_data["train"]) // batch_size
# last path component of the checkpoint name, i.e. "mt5-small"
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-genius",
    evaluation_strategy="epoch",
    learning_rate=4.0e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # number of checkpoints to save
    save_total_limit=1,
    num_train_epochs=num_train_epochs,
    # generate summaries during evaluation
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=True,
)

In [15]:
# define compute metrics function

import evaluate

def compute_metrics(eval_pred):
    """Decode generated and reference token ids and score them with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also arrive as a tuple, in which case its
            first element is taken as the generated ids.

    Returns:
        dict mapping each key returned by ``rouge_score.compute`` to its value
        scaled by 100 and rounded to 3 decimals.
    """
    predictions, labels = eval_pred
    # some setups hand over a tuple of outputs; the generated ids come first
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks padded/ignored positions and cannot be decoded, so replace it
    # with the pad token id in the predictions as well as the labels
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # decode generated and reference summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]

    # compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # report the scores as percentages
    return {key: round(value * 100, 3) for key, value in result.items()}

In [16]:
# load data collator for dynamic padding

from transformers import DataCollatorForSeq2Seq

# built from both the tokenizer and the model; handed to the trainer below
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [17]:
# remove original text columns (we already have them but tokenized)
# NOTE(review): tokenized_data is rebound to the stripped version, so running this
# cell a second time would presumably fail because the columns are already gone — confirm

tokenized_data = tokenized_data.remove_columns(
    songs["train"].column_names
)

In [18]:
# instantiate the trainer

from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    # fine-tune on the train split, evaluate on the validation split
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    # ROUGE scoring of the generated titles at evaluation time
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/miscjose/mt5-small-finetuned-genius into local empty directory.


In [19]:
# train the model on the tokenized train split; the training arguments request
# an evaluation on the validation split every epoch
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,14.3296,7.807046,0.557,0.089,0.557,0.558


Adding files tracked by Git LFS: ['tokenizer.json']. This may take a bit of time if the files are large.


TrainOutput(global_step=863, training_loss=14.325111340537132, metrics={'train_runtime': 167.6341, 'train_samples_per_second': 164.62, 'train_steps_per_second': 5.148, 'total_flos': 284987821670400.0, 'train_loss': 14.325111340537132, 'epoch': 1.0})

In [20]:
# check metrics on the trainer's eval_dataset (the validation split)

trainer.evaluate()

{'eval_loss': 7.807045936584473,
 'eval_rouge1': 0.557,
 'eval_rouge2': 0.089,
 'eval_rougeL': 0.557,
 'eval_rougeLsum': 0.558,
 'eval_runtime': 20.8945,
 'eval_samples_per_second': 165.116,
 'eval_steps_per_second': 5.169,
 'epoch': 1.0}

In [21]:
# push model to hub (the recorded output shows the repo miscjose/mt5-small-finetuned-genius)

trainer.push_to_hub(commit_message="Training complete", tags="summarization")

Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 1.00/1.12G [00:00<?, ?B/s]

To https://huggingface.co/miscjose/mt5-small-finetuned-genius
   c368014..18912c4  main -> main

   c368014..18912c4  main -> main

To https://huggingface.co/miscjose/mt5-small-finetuned-genius
   18912c4..c4f2b55  main -> main

   18912c4..c4f2b55  main -> main



'https://huggingface.co/miscjose/mt5-small-finetuned-genius/commit/18912c4861a4577d06c456799ad9a216fac886bd'

# Using the Fine-Tuned Model

In [23]:
from transformers import pipeline

# load the fine-tuned checkpoint back from the Hub into a summarization pipeline
hub_model_id = "miscjose/mt5-small-finetuned-genius"
summarizer = pipeline("summarization", model=hub_model_id)
# or, reuse the objects already in memory instead of downloading:
# summarizer = pipeline("summarization", tokenizer=tokenizer, config=args, model=model, device='cuda')
# NOTE(review): `args` is the Seq2SeqTrainingArguments object, not a model config —
# passing it as `config=` looks wrong; confirm before uncommenting

Downloading (…)lve/main/config.json:   0%|          | 0.00/773 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/303 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

In [24]:
# run some examples from the dataset (test split by default)
def print_summary(idx, split="test"):
    """Print a song's lyrics, its reference title and the generated summary.

    Args:
        idx: Row index of the song inside ``songs[split]``.
        split: Name of the dataset split to read from; defaults to ``"test"``.
    """
    # fetch the row once and reuse it for both printing and summarization
    song = songs[split][idx]
    lyrics = song["lyrics"]
    title = song["title"]
    summary = summarizer(lyrics)[0]["summary_text"]
    print('Lyrics: {}\n'.format(lyrics))
    print('Title: {}\n'.format(title))
    print('Summary: {}\n'.format(summary))

In [29]:
# check one example: the first song of the test split
print_summary(0)

Lyrics: After all the shit we did
You gon' make me have to tell somebody, make me tell somebody
Plead the fifth
But in my mind I wanna tell somebody, wanna tell somebody
The way I hit it, ohhhh you gon' make me have to surf on that ocean
I might drown in your body let it drip
But you so bad I gotta tell somebody, gotta tell somebody
Oh, oh, oh my my my
You really blowing my mind
Oh, oh, oh my my my
I ain't wasting no time
Tell her hop on that quick
And shade, fuck it up
Do your dance, run it back
We been on one
Fall Back
I will not snitch
Fuck it up, do your dance
Turn around and baby run it back
I wish I could tell somebody
First off, no I ain't the type to go kiss and tell everything
And I'm so drunk that if I try it I misspell everything
And you so drunk that you dance to like everything that come on
We been mixing up these feelings with Hennessy and Patron
Now you feeling the wave and I'm 'bout to jump in the deep-end
I know that you tryna escape, and be my little secret
Oh, oh, oh