In [1]:
from datasets import load_dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Keep only the leading rows of each split: (split name, number of rows kept).
train, val, test = [
    dataset[split_name].select(range(row_count))
    for split_name, row_count in (("train", 5000), ("validation", 1000), ("test", 500))
]

print(train[0])


  from .autonotebook import tqdm as notebook_tqdm


{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office char

In [2]:
# Header (which itself ends in a newline) followed by the raw first record.
print("Full first training example:\n", train[0], sep="\n")

Full first training example:

{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one m

In [3]:
# First 500 characters of the article, then its reference highlights.
print(train[0]['article'][:500], train[0]['highlights'], sep="\n")

LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as s
Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .


In [4]:
import re
from bs4 import BeautifulSoup

def clean_text(text):
    """Return `text` with markup removed and whitespace normalized.

    The string is parsed with BeautifulSoup's "html.parser" and reduced to
    its text content; every run of whitespace is then collapsed to a single
    space and leading/trailing whitespace is dropped.
    """
    plain = BeautifulSoup(text, "html.parser").get_text()
    collapsed = re.sub(r'\s+', ' ', plain)
    return collapsed.strip()


def preprocess_example(example):
    """Clean the 'article' and 'highlights' fields of one example in place.

    Returns the same `example` mapping so it can be used with `Dataset.map`.
    """
    for field in ('article', 'highlights'):
        example[field] = clean_text(example[field])
    return example

# Run the cleaning step over every split, in train / val / test order.
train, val, test = [
    split.map(preprocess_example) for split in (train, val, test)
]

# Preview the cleaned first example.
print(train[0]['article'][:500], train[0]['highlights'], sep="\n")


LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as s
Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday . Young actor says he has no plans to fritter his cash away . Radcliffe's earnings from first five Potter films have been held in trust fund .


In [5]:
from transformers import BartTokenizer

# Tokenizer matching the 'facebook/bart-large-cnn' checkpoint; used below for
# encoding articles/highlights and for decoding generated summaries.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

def tokenize_function(example):
    """Tokenize articles and highlights into model inputs and labels.

    Works on a single example (string fields) or on a batch as passed by
    `Dataset.map(..., batched=True)` (list-of-string fields).

    Articles are truncated/padded to 1024 tokens and highlights to 128.
    Padding positions in the labels are set to -100 so the loss ignores
    them; without this the model is also trained to predict pad tokens.

    Returns a dict with 'input_ids', 'attention_mask', 'labels' and the
    unmodified 'id' field.
    """
    inputs = tokenizer(example['article'], max_length=1024, truncation=True, padding='max_length')
    targets = tokenizer(example['highlights'], max_length=128, truncation=True, padding='max_length')

    def _mask_padding(token_ids, mask):
        # Keep real tokens (mask == 1); replace padding (mask == 0) with -100.
        return [tok if keep else -100 for tok, keep in zip(token_ids, mask)]

    if isinstance(example['highlights'], str):
        # Single example: the tokenizer returns flat lists.
        labels = _mask_padding(targets['input_ids'], targets['attention_mask'])
    else:
        # Batched call: one list of token ids per example.
        labels = [
            _mask_padding(token_ids, mask)
            for token_ids, mask in zip(targets['input_ids'], targets['attention_mask'])
        ]

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': labels,
        'id': example['id']  # keep id here, unmodified
    }

# Apply to dataset

In [6]:
# Tokenize every split in batches, dropping the original columns; only the
# fields returned by tokenize_function remain.
tokenized_train, tokenized_val, tokenized_test = [
    split.map(tokenize_function, batched=True, remove_columns=split.column_names)
    for split in (train, val, test)
]


In [7]:
# Sanity check: available columns, then input and label sequence lengths.
print(tokenized_train[0].keys())
print(*(len(tokenized_train[0][column]) for column in ('input_ids', 'labels')))


dict_keys(['id', 'input_ids', 'attention_mask', 'labels'])
1024 128


In [8]:
from transformers import BartForConditionalGeneration, TrainingArguments, Trainer
import torch

# Pre-trained BART checkpoint that was fine-tuned for CNN/DailyMail summarization
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

# Prefer the GPU when one is visible to torch, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model.to(device)
print("Using device:", device)


Using device: cpu


In [9]:
import evaluate
# ROUGE metric object; compute_metrics below calls rouge.compute(...) on it.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores (scaled to 0-100) for an evaluation batch.

    Args:
        eval_pred: A `(predictions, labels)` pair. `predictions` may be
            token ids of shape (batch, seq), raw logits of shape
            (batch, seq, vocab), or a tuple whose first element is one of
            those. `labels` holds token ids with -100 at ignored positions.

    Returns:
        dict mapping each ROUGE key to its score multiplied by 100.
    """
    predictions, labels = eval_pred

    # A plain Trainer hands over every model output; the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # Raw logits -> greedy token ids. NOTE(review): these are teacher-forced
    # predictions, not free-running generations, so the scores are only a proxy.
    if getattr(predictions, "ndim", 2) == 3:
        predictions = predictions.argmax(axis=-1)

    pad_id = tokenizer.pad_token_id
    # -100 is not a valid token id; map it back to the pad token before decoding.
    predictions = [[(p if p != -100 else pad_id) for p in pred] for pred in predictions]
    labels = [[(l if l != -100 else pad_id) for l in label] for label in labels]

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # `evaluate`'s ROUGE returns plain floats; older metric objects expose
    # aggregate scores with `.mid.fmeasure`. Support both.
    return {
        key: (value.mid.fmeasure if hasattr(value, "mid") else value) * 100
        for key, value in result.items()
    }


In [17]:
import torch
from transformers import TrainingArguments

# Hyperparameters for fine-tuning. Checkpoints go to ./results and logs to
# ./logs; evaluation runs once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",        # <-- use eval_strategy instead of evaluation_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=2,
    # Mixed precision is only enabled when a CUDA device is present; the
    # notebook also runs on CPU-only machines, where fp16 must stay off.
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=500,
)


In [18]:
def _logits_to_token_ids(logits, labels):
    """Reduce model outputs to greedy token ids before they are accumulated.

    compute_metrics decodes token ids, and keeping full-vocabulary logits for
    the whole evaluation set would use far more memory than the ids do.
    """
    # The model may return a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


# Trainer wiring: fine-tune `model` on the tokenized training split and
# evaluate on the tokenized validation split with ROUGE via compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=_logits_to_token_ids,
)


  trainer = Trainer(


In [19]:
# Run fine-tuning with the settings in training_args (evaluates on
# tokenized_val according to eval_strategy).
trainer.train()




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Score the model on the held-out test split and show the metric dict.
metrics = trainer.evaluate(eval_dataset=tokenized_test)
print(metrics)


In [None]:
def summarize(text):
    """Generate a summary of `text` with the current `model`.

    The text is truncated to 1024 tokens and decoded with 4-beam search,
    producing at most 128 tokens. Special tokens are stripped from the
    returned string.
    """
    # Disable dropout: the model is left in training mode after trainer.train().
    model.eval()
    # Place inputs on whatever device the model currently lives on, so this
    # still works after the model has been reloaded or moved.
    inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors='pt').to(model.device)
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        num_beams=4,
        max_length=128,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example: summarize a short placeholder string and print the result
print(summarize("NASA's new telescope has discovered ..."))


In [None]:
# Persist the fine-tuned weights (and config) to ./bart_summarizer.
trainer.save_model("./bart_summarizer")
# To reload later:
# from_pretrained returns the model on CPU, so move it back to the device
# selected earlier to keep it usable alongside inputs placed on `device`.
model = BartForConditionalGeneration.from_pretrained("./bart_summarizer").to(device)
