## Step 2: Decoding for downstream generation tasks

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import pandas as pd

In [2]:
data_frame = []

tokenizer = AutoTokenizer.from_pretrained("gniemiec/t5-small-finetuned-xsum")
model = AutoModelForSeq2SeqLM.from_pretrained("gniemiec/t5-small-finetuned-xsum")

dataset = load_dataset("xsum")
test_dataset = dataset['test']
test_dataset

Using custom data configuration default
Found cached dataset xsum (C:/Users/harsh/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71)


  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 11334
})

In [3]:
for index, item in enumerate(test_dataset):
    if index%10==0:
        print(index)
    if index == 100:
        break
    document = item["document"]
    input_ids = tokenizer(document, return_tensors="pt").input_ids
    
    outputs_greedy = model.generate(input_ids, max_length=30, output_scores=True, return_dict_in_generate=True)
    outputs_beam = model.generate(input_ids, max_length=30, num_beams=3, early_stopping=True, output_scores=True, return_dict_in_generate=True)
    outputs_topk = model.generate(input_ids, do_sample=True, max_length=30, top_k=20, output_scores=True, return_dict_in_generate=True)
    outputs_topp = model.generate(input_ids, do_sample=True, max_length=30, top_p=0.7, top_k=0, output_scores=True, return_dict_in_generate=True)
    
    generated_sequence_greedy = tokenizer.batch_decode(outputs_greedy.sequences, skip_special_tokens=True)[0]
    generated_sequence_beam = tokenizer.batch_decode(outputs_beam.sequences, skip_special_tokens=True)[0]
    generated_sequence_topk = tokenizer.batch_decode(outputs_topk.sequences, skip_special_tokens=True)[0]
    generated_sequence_topp = tokenizer.batch_decode(outputs_topp.sequences, skip_special_tokens=True)[0]
    
    data_frame.append([item["summary"], generated_sequence_greedy, generated_sequence_beam, generated_sequence_topk, generated_sequence_topp])

Token indices sequence length is longer than the specified maximum sequence length for this model (775 > 512). Running this sequence through the model will result in indexing errors


0
10
20
30
40
50
60
70
80
90
100


In [6]:
data_frame = pd.DataFrame(data_frame, columns=['summary', "generated_sequence_greedy", "generated_sequence_beam", "generated_sequence_topk", "generated_sequence_topp"])

In [8]:
data_frame.head(20)

Unnamed: 0,summary,generated_sequence_greedy,generated_sequence_beam,generated_sequence_topk,generated_sequence_topp
0,"There is a ""chronic"" need for more housing for...",",,,,,,,,, who has been in prison for 20 years,...","in Wales, a Welsh charity has urged people to ...","at Emmaus South Wales, who has built more than...","has been turned into a residential, temporary ..."
1,"A man has appeared in court after firearms, am...","Police have recovered three firearms, ammuniti...","Police have recovered three firearms, ammuniti...",is charged with murdering a man on suspicion o...,to collect a five-figure sum of money.
2,Four people accused of kidnapping and torturin...,",,, have been charged with hate crimes and agg...",",,, have been charged with hate crimes, aggrav...","and Tesfaye Cooper, all 18, allegedly used a s...","and Riley Jones, all 18, and Jason Hill, all 1..."
3,West Brom have appointed Nicky Hammond as tech...,keeper David West Brom has been appointed a yo...,'s former Arsenal goalkeeper has played for th...,"City, Arsenal, Arsenal and Manchester United s...",Manchester United goalkeeper Carlos Nenteithov...
4,The pancreas can be triggered to regenerate it...,",,,,,,,,,,,,, and the ability",". ""I was glad to have something to eat,"" he sa...","., with the weight of about 7kg (3.5lbs) from ...","and and went on a fasting diet. ""This has pote..."
5,Since their impending merger was announced in ...,",,,,,,,,, and,,, and a merger",",,,,,,,,,, and,,.,",and will make a significant contribution to th...,has a wide range of opportunities for at Snell...
6,"A ""medal at any cost"" approach created a ""cult...",",,,,, has apologised for the sexist behaviour ...","has apologised to British Cycling for a ""horri...",is unsure why she should take on a medal at th...,"of the Olympics, has asked BBC Sport if she ha..."
7,Have you heard the one about the computer prog...,",,,,,,, and a new generation of comedians,,,,",",,,,,,s are becoming more and more popular in ...",has seen a booming rise in US comedy clubs as ...,TV. We're already hoping for a.s. who lives on...
8,The reaction from BT's investors told us much ...,",,, and a new board to run Openreach.,,,,, BT","BT's giant pension scheme, which has a deficit...","to BT's giant Pension scheme, Openreach, is a ...",- an English-language government pension schem...
9,Manager Brendan Rodgers is sure Celtic can exp...,has been a key player in the Scottish Cup and ...,is looking forward to the Scottish Cup semi-fi...,- it's being a tough one for Celtic and has be...,is keen to give him a bit of momentum in the S...
