In [1]:
import pandas as pd 
import torch  
import nltk
import gc
import os

from datasets import Dataset
from transformers import BertTokenizer, BertModel, BertLMHeadModel, EncoderDecoderModel, Seq2SeqTrainingArguments, Trainer, DataCollatorForSeq2Seq
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

In [3]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [4]:
train_ds = pd.read_csv("../Dataset/train.csv")
train_ds.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [5]:
len(train_ds)

287113

In [6]:
sampled_df = train_ds.sample(n=10000, random_state=42)

In [7]:
train_df, val_df = train_test_split(sampled_df, test_size=0.2, random_state=42)

len(train_df), len(val_df)

(8000, 2000)

In [8]:
train_dataset = Dataset.from_pandas(train_df[['article', 'highlights']]) 
val_dataset = Dataset.from_pandas(val_df[['article', 'highlights']])

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [10]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def preprocessing(dataset):
    inputs = ['summarize: ' + sum for sum in dataset['article']]
    model_inputs = tokenizer(inputs, max_length=384, truncation=True, padding="max_length", return_tensors='pt')

    labels = tokenizer(text_target=dataset['highlights'], max_length=64, truncation=True, padding="max_length", return_tensors='pt')

    model_inputs['labels'] = labels['input_ids']
    return model_inputs
    

tokenized_train_df = train_dataset.map(preprocessing, batched=True, batch_size=32)
tokenized_val_df = val_dataset.map(preprocessing, batched=True, batch_size=32)

Map: 100%|██████████████████████████████████████████████████████████████████| 8000/8000 [05:03<00:00, 26.40 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 2000/2000 [01:13<00:00, 27.17 examples/s]


In [11]:
encoder = BertModel.from_pretrained('bert-base-uncased')

decoder_config = BertLMHeadModel.from_pretrained('bert-base-uncased').config
decoder_config.is_decoder = True
decoder_config.add_cross_attention = True  
decoder = BertLMHeadModel.from_pretrained('bert-base-uncased', config=decoder_config)

model = EncoderDecoderModel(encoder=encoder, decoder=decoder)

model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = tokenizer.vocab_size
model.config.max_length = 64
model.config.min_length = 10
model.config.length_penalty = 2.0
model.config.num_beams = 4

model.to(device)
print(f"Model loaded and moved to {device}")

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.outpu

Model loaded and moved to cuda


In [12]:
gc.collect()
torch.cuda.empty_cache()

In [13]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=10,
    predict_with_generate=True,
    generation_max_length=64,
    generation_num_beams=4,
    save_total_limit=2,
    learning_rate=3e-5,
    weight_decay=0.01,
    push_to_hub=False,
    fp16=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_df,
    eval_dataset=tokenized_val_df,
    data_collator=data_collator,
)

trainer.train()

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch,Training Loss,Validation Loss
1,34.3355,4.136777
2,31.9709,3.920922
3,30.2426,3.866306




TrainOutput(global_step=750, training_loss=34.90716735839844, metrics={'train_runtime': 3150.0062, 'train_samples_per_second': 7.619, 'train_steps_per_second': 0.238, 'total_flos': 1.104217380864e+16, 'train_loss': 34.90716735839844, 'epoch': 3.0})

In [14]:
def generate_summary(article, max_length=64, min_length=10, num_beams=4, model=model):
    inputs = tokenizer(
        "summarize: " + article,
        max_length=384,
        truncation=True,
        padding="max_length",
        return_tensors='pt'
    )

    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        decoder_start_token_id=tokenizer.cls_token_id,  
        eos_token_id=tokenizer.sep_token_id,            
        pad_token_id=tokenizer.pad_token_id,         
        max_length=max_length,
        min_length=min_length,
        num_beams=num_beams,
        length_penalty=2.0,
        early_stopping=True
    )
    
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [15]:
test_article='''The CNN/Daily Mail dataset is a popular dataset for text summarization tasks. It contains news 
articles from CNN and Daily Mail, along with human-written bullet point summaries. The dataset 
was originally created for question answering, but has been widely used for abstractive and 
extractive summarization. The articles are typically several hundred words long, while the 
summaries are usually around 3-4 bullet points, each containing a single sentence.'''

summary = generate_summary(test_article)

In [16]:
summary

"the u. s. government says it will be able to use the internet. it's the first time it uses the internet. it's the first time it used the internet. it's the first time it uses the internet."