In [10]:
import numpy as np

In [None]:
# Log in to the Hugging Face Hub (opens an interactive credential prompt in the notebook).
# The training arguments further down set push_to_hub=True and the trained model is
# pushed to the Hub at the end, so this cell is expected to run before training.
from huggingface_hub import notebook_login

notebook_login()

In [11]:
from datasets import load_dataset

# Location of the cleaned lyrics/title CSV, relative to the notebook's working directory.
songs_csv_path = './data/genius-expertiste_clean/songs.csv'

# The CSV loader returns a DatasetDict; every row ends up in its 'train' split
# (see the displayed output), which the next cell divides further.
songs = load_dataset('csv', data_files=songs_csv_path)
songs

DatasetDict({
    train: Dataset({
        features: ['lyrics', 'title'],
        num_rows: 30681
    })
})

In [12]:
# create train, validation, test splits (80/10/10)

from datasets import DatasetDict

# Fixed seed so the partition is identical on every run and kernel restart.
# Without it each execution draws a different random split, which means a later
# session's songs['test'] can contain songs the already-trained model saw during
# training (the inference cells at the bottom index into songs['test']).
SPLIT_SEED = 42

# Step 1: 80% train / 20% held out.
songs_train_holdout = songs['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
# Step 2: halve the held-out 20% into test and validation (10% each).
songs_validation_test = songs_train_holdout['test'].train_test_split(
    test_size=0.5, seed=SPLIT_SEED
)

# Rebind `songs` to the three-way split; downstream cells use these split names.
songs = DatasetDict({
    'train': songs_train_holdout['train'],
    'test': songs_validation_test['train'],
    'validation': songs_validation_test['test'],
})

songs

DatasetDict({
    train: Dataset({
        features: ['lyrics', 'title'],
        num_rows: 24544
    })
    test: Dataset({
        features: ['lyrics', 'title'],
        num_rows: 3068
    })
    validation: Dataset({
        features: ['lyrics', 'title'],
        num_rows: 3069
    })
})

In [15]:
# looking at a training sample

# Fetch the row once and show both of its text fields.
sample = songs['train'][3]

print(f"Lyrics: {sample['lyrics']}")
print('\n')
print(f"Title: {sample['title']}")


Lyrics:   [Intro] You know, sometimes I think God is playing a little game with me Looking down from heaven, laughing, and trying to see how much I can take Because the way things go, it's like a joke Nobody's had more shots at the moon, and missed, than me  [Verse 1] It's like I got my life stuck, stuck on rewind Trying to make something of myself, life's got something else in mind I'm fighting a losing game and I'm biding my time You won't be my man, do I understand? No, stop I don't want to hear another word about your 'Why not's' I bought your bullshit all before, now you trying to sell me more, babe?  [Chorus] Man who makes a beast out of himself got nothing to lose Sold my soul long ago, nothing left to choose I'm tired, tired of singing the blues I'm tired  [Verse 2] I'm walking down the streets, you wouldn't know what I was thinking It's just another white girl day, hey ribbons in my hair, and I am sinking down A double life, a sordid past, and I am drinking now I want to be ba

# Tokenization

In [7]:
# first load tokenizer
from transformers import AutoTokenizer

# Checkpoint identifier; reused later to load the model weights and to derive
# the name of the training output directory.
model_checkpoint = "google/mt5-small"
# Tokenizer matching the checkpoint; used for both the lyrics (inputs) and the titles (labels).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [8]:
# testing out the tokenizer on a short sentence
inputs = tokenizer('Wow I love this Song!')
print('IDs and Attention')
print(inputs)
print('\n')

# check tokenizer: map the ids back to the tokens they stand for
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"])
print('Tokenization Method')
print(tokens)
print('\n')

# check max length of input
# (the value printed below is huge, so explicit limits are set in the preprocessing cell)
print(f'Max Input Lenght: {tokenizer.model_max_length}')

IDs and Attention
{'input_ids': [65801, 336, 3869, 714, 12554, 309, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


Tokenization Method
['▁Wow', '▁I', '▁love', '▁this', '▁Song', '!', '</s>']


Max Input Lenght: 1000000000000000019884624838656


In [9]:
# specify limit for input and output (in tokens)

max_input_length = 300
max_target_length = 10


# define function to map to train,validation, and test datasets
def preprocess_function(examples):
    """Tokenize a batch of songs for sequence-to-sequence training.

    Args:
        examples: a batch from the dataset with "lyrics" and "title" columns.

    Returns:
        The tokenizer output for the lyrics (truncated to ``max_input_length``),
        with an extra "labels" entry holding the token ids of the titles
        (truncated to ``max_target_length``).
    """
    # Source side: lyrics -> token ids + attention mask.
    encoded_lyrics = tokenizer(
        examples["lyrics"],
        truncation=True,
        max_length=max_input_length,
    )

    # Target side: titles -> token ids; only the ids are kept.
    encoded_titles = tokenizer(
        examples["title"],
        truncation=True,
        max_length=max_target_length,
    )

    encoded_lyrics["labels"] = encoded_titles["input_ids"]
    return encoded_lyrics


# map to each dataset (batched, so the function receives lists of examples)
tokenized_data = songs.map(preprocess_function, batched=True)

Map:   0%|          | 0/24544 [00:00<?, ? examples/s]

Map:   0%|          | 0/3068 [00:00<?, ? examples/s]

Map:   0%|          | 0/3069 [00:00<?, ? examples/s]

In [10]:
# example of change applied to all datasets: compare the train split
# before and after tokenization

print(f"Before:\n\t{songs['train']}")
print('\n')
print(f"After:\n\t{tokenized_data['train']}")

# 3 extra columns: input_ids, attention_mask, labels

Before:
	Dataset({
    features: ['lyrics', 'title'],
    num_rows: 24544
})


After:
	Dataset({
    features: ['lyrics', 'title', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 24544
})


# Fine-Tuning

In [12]:
# load model

from transformers import AutoModelForSeq2SeqLM

# Load the pretrained seq2seq weights for the same checkpoint the tokenizer came from.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# Display the parameter count as the cell output.
model.num_parameters()

In [15]:
# define hyperparameters and other arguments

from transformers import Seq2SeqTrainingArguments

batch_size = 32
num_train_epochs = 5

# last path component of the checkpoint id, used to name the output directory
model_name = model_checkpoint.rsplit("/", 1)[-1]

# number of batches in one pass over the training split, so that the
# training loss is shown with every epoch
logging_steps = len(tokenized_data["train"]) // batch_size

args = Seq2SeqTrainingArguments(
    # where checkpoints go, and whether they are pushed to the Hub
    output_dir=f"{model_name}-finetuned-genius",
    push_to_hub=True,
    # number of checkpoints to save
    save_total_limit=1,
    # optimisation
    num_train_epochs=num_train_epochs,
    learning_rate=5.6e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluation and logging
    evaluation_strategy="epoch",
    # generate summaries during evaluation
    predict_with_generate=True,
    logging_steps=logging_steps,
)

In [16]:
# define compute metrics function

import evaluate
# `sent_tokenize` was called below without ever being imported in this notebook.
# NOTE(review): NLTK's sentence tokenizer is assumed here; it needs the Punkt data
# to be available locally (e.g. via nltk.download("punkt")) -- confirm in your environment.
from nltk.tokenize import sent_tokenize

# ROUGE metric object. It was previously referenced without being defined, so
# the cell only worked when the name happened to be left over in the kernel.
rouge_score = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Compute ROUGE scores for a batch of generated titles.

    Args:
        eval_pred: a ``(predictions, labels)`` pair of token-id arrays, as
            handed over by the trainer during evaluation.

    Returns:
        dict mapping each score name returned by ``rouge_score.compute`` to its
        value scaled to a percentage and rounded to 3 decimals.
    """
    predictions, labels = eval_pred
    # decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]

    # compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # scale the scores to percentages and round for display
    return {key: round(value * 100, 3) for key, value in result.items()}

In [17]:
# load data collator for dynamic padding
# (examples were tokenized without padding, so batches are padded at collation time)

from transformers import DataCollatorForSeq2Seq

# The collator is given both the tokenizer and the model being fine-tuned.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [18]:
# remove original text columns (we already have them but tokenized)

# Only drop the raw-text columns that are still present, so re-running this
# cell is a no-op instead of raising because the columns are already gone.
text_columns = [
    column
    for column in songs["train"].column_names
    if column in tokenized_data["train"].column_names
]

if text_columns:
    tokenized_data = tokenized_data.remove_columns(text_columns)

In [20]:
# instantiate the trainer

from transformers import Seq2SeqTrainer

# Everything is passed by keyword: model and hyperparameters first, then the
# data (train split for fitting, validation split for per-epoch evaluation),
# then the helpers used for batching, decoding and scoring.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

C:\Users\Jose\Desktop\genius_summarization\mt5-small-finetuned-genius is already a clone of https://huggingface.co/miscjose/mt5-small-finetuned-genius. Make sure you pull the latest changes with `repo.git_pull()`.


In [21]:
# train the model
# Fine-tunes on the train split; with evaluation_strategy="epoch" the validation
# loss and ROUGE scores are reported after every epoch (see the table below).
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,7.505,3.590725,13.8778,6.4731,13.8166,13.8748
2,3.9126,3.195448,22.4064,12.403,22.3315,22.4012
3,3.5275,3.077275,23.4504,13.3184,23.3889,23.4432
4,3.3868,3.058105,24.8367,14.3953,24.7172,24.7912
5,3.326,3.041686,24.7853,14.5203,24.696,24.7312


TrainOutput(global_step=3835, training_loss=4.331565504807692, metrics={'train_runtime': 9309.4485, 'train_samples_per_second': 13.182, 'train_steps_per_second': 0.412, 'total_flos': 3.802040745984e+16, 'train_loss': 4.331565504807692, 'epoch': 5.0})

In [22]:
# check metrics
# Runs evaluation on the trainer's eval_dataset (the validation split) and
# displays the resulting loss and ROUGE scores.

trainer.evaluate()

{'eval_loss': 3.0416862964630127,
 'eval_rouge1': 24.7853,
 'eval_rouge2': 14.5203,
 'eval_rougeL': 24.696,
 'eval_rougeLsum': 24.7312,
 'eval_runtime': 46.9993,
 'eval_samples_per_second': 65.299,
 'eval_steps_per_second': 2.043,
 'epoch': 5.0}

In [23]:
# push model to hub
# Uploads the trained model with the given commit message.
# NOTE(review): `tags` is presumably forwarded to the generated model card -- confirm
# against the installed transformers version.

trainer.push_to_hub(commit_message="Training complete", tags="summarization")

Upload file pytorch_model.bin:   0%|          | 1.00/1.12G [00:00<?, ?B/s]

To https://huggingface.co/miscjose/mt5-small-finetuned-genius
   26a7cec..b9acf17  main -> main

   26a7cec..b9acf17  main -> main

To https://huggingface.co/miscjose/mt5-small-finetuned-genius
   b9acf17..fa4b49c  main -> main

   b9acf17..fa4b49c  main -> main



'https://huggingface.co/miscjose/mt5-small-finetuned-genius/commit/b9acf172c0a2940bac201ac03c0c03c2a2d7cadb'

# Using the Fine-Tuned Model

In [16]:
from transformers import pipeline

# Load the fine-tuned model from the Hub into a summarization pipeline.
hub_model_id = "miscjose/mt5-small-finetuned-genius"
summarizer = pipeline("summarization", model=hub_model_id)
# or, in the same session as training, reuse the in-memory model instead of downloading it.
# NOTE(review): `args` is the Seq2SeqTrainingArguments object defined earlier, not a model
# config, so passing it as `config=` looks wrong -- verify or drop it before uncommenting.
# summarizer = pipeline("summarization", tokenizer=tokenizer, config=args, model=model, device='cuda')

Downloading (…)lve/main/config.json:   0%|          | 0.00/773 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/303 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

In [17]:
# run some examples with the test set
def print_summary(idx):
    """Print the lyrics, the reference title and the generated summary for one test song.

    Args:
        idx: row index into ``songs["test"]``.
    """
    lyrics = songs["test"][idx]["lyrics"]
    title = songs["test"][idx]["title"]

    # The pipeline returns a list with one dict per input; take its summary text.
    generated = summarizer(songs["test"][idx]["lyrics"])
    summary = generated[0]["summary_text"]

    print(f'Lyrics: {lyrics}\n')
    print(f'Title: {title}\n')
    print(f'Summary: {summary}\n')

In [24]:
# check one example: lyrics, reference title and generated summary for test row 10
print_summary(10)

Lyrics:   [Verse 1: Teyana Taylor] Shawty got potential, but he don't need a sponsor You should see his goons, more niggas than a concert Body like Teyana, stomach lookin' proper Eyes half closed cause he smokin' on that ganja Hol' up, hol' up Hard denims and cardigans, they all rugby He my lil bad boy, Sean Puffy Givin' me stacks, a rack T take that Smokin on that James Brown, this the payback I be his hood girl, I put that grind in him So inked up I can write my rhymes with 'em He give me all of his, but let me roll with mine And the shoes spiked up like a porcupine He love my Harlem ass, the way my swag pop A real bad bitch, never needed ass shots Two door coupe, all-white, whole thang And when I see him I'm like honey in that cocaine  [Chorus] Bad boy, real when I need a wrap And his only competition is the IRS Bad boy, a real one, I need that And his only competition is the IRS Make money money, make money money money (Make money money, make money money money) Now everybody say Ta