In [1]:
!pip install datasets transformers evaluate sacrebleu bert_score rouge-score gradio

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio
  Downloading gradio-5.24.0-py3-none-any.whl.metadata (16 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-

In [4]:
import torch
import gradio as gr
from datasets import load_dataset, DatasetDict
from transformers import BartTokenizer
from transformers import BartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from evaluate import load

In [3]:
# Load dataset
dataset = load_dataset("multi_news")

# Train/validation split
split = dataset["train"].train_test_split(test_size=0.1, seed=42)
dataset = DatasetDict({
    "train": split["train"],
    "validation": split["test"],
    "test": dataset["validation"]
})

# Load tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

# Preprocessing function
def preprocess(example):
    model_input = tokenizer(example["document"], max_length=1024, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(example["summary"], max_length=256, truncation=True, padding="max_length")
    model_input["labels"] = labels["input_ids"]
    return model_input

# Tokenize
tokenized = dataset.map(preprocess, batched=True, remove_columns=["document", "summary"])
tokenized.save_to_disk("tokenized_multinews")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

multi_news.py:   0%|          | 0.00/3.83k [00:00<?, ?B/s]

The repository for multi_news contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/multi_news.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


train.src.cleaned:   0%|          | 0.00/548M [00:00<?, ?B/s]

train.tgt:   0%|          | 0.00/58.8M [00:00<?, ?B/s]

val.src.cleaned:   0%|          | 0.00/66.9M [00:00<?, ?B/s]

val.tgt:   0%|          | 0.00/7.30M [00:00<?, ?B/s]

test.src.cleaned:   0%|          | 0.00/69.0M [00:00<?, ?B/s]

test.tgt:   0%|          | 0.00/7.31M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/44972 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5622 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5622 [00:00<?, ? examples/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Map:   0%|          | 0/40474 [00:00<?, ? examples/s]



Map:   0%|          | 0/4498 [00:00<?, ? examples/s]

Map:   0%|          | 0/5622 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/40474 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4498 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5622 [00:00<?, ? examples/s]

In [7]:
# Load model
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base").to("cuda")

rouge = load("rouge")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    labels = [[tokenizer.decode(l, skip_special_tokens=True)] for l in labels]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=[l[0] for l in labels])
    return {k: round(v*100, 2) for k, v in result.items()}

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model)

# Training arguments
args = Seq2SeqTrainingArguments(
    output_dir="./results_multinews",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    logging_dir="./logs_multinews",
    report_to="none"
)


# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"].select(range(1000)),
    eval_dataset=tokenized["validation"].select(range(100)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,3.0597,2.401961,9.8,3.19,7.38,7.41
2,2.5109,2.368339,9.77,2.99,7.32,7.33
3,2.3864,2.358691,10.04,3.26,7.48,7.5




TrainOutput(global_step=1500, training_loss=2.6523271484375, metrics={'train_runtime': 348.6573, 'train_samples_per_second': 8.604, 'train_steps_per_second': 4.302, 'total_flos': 1829209374720000.0, 'train_loss': 2.6523271484375, 'epoch': 3.0})

In [8]:
# Metrics
bleu = load("sacrebleu")
bert = load("bertscore")

test_set = tokenized["test"].select(range(100))
labels = []
preds = []

model.eval()
for sample in test_set:
    inputs = torch.tensor(sample["input_ids"]).unsqueeze(0).to("cuda")
    mask = torch.tensor(sample["attention_mask"]).unsqueeze(0).to("cuda")
    output = model.generate(inputs, attention_mask=mask, max_length=256)
    labels.append(tokenizer.decode(sample["labels"], skip_special_tokens=True))
    preds.append(tokenizer.decode(output[0], skip_special_tokens=True))



rouge_scores = rouge.compute(predictions=preds, references=labels)
bleu_score = bleu.compute(predictions=preds, references=[[r] for r in labels])
bert_score = bert.compute(predictions=preds, references=labels, lang="en")

print("ROUGE:", {k: round(v * 100, 2) for k, v in rouge_scores.items()})
print("BLEU Score:", round(bleu_score["score"], 2))
print("BERTScore F1:", round(sum(bert_score["f1"]) / len(bert_score["f1"]) * 100, 2))


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ROUGE: {'rouge1': np.float64(40.11), 'rouge2': np.float64(13.4), 'rougeL': np.float64(20.79), 'rougeLsum': np.float64(20.86)}
BLEU Score: 11.24
BERTScore F1: 85.76


In [9]:
def summarize_news(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=1024).to("cuda")
    output = model.generate(**inputs, max_length=256)
    return tokenizer.decode(output[0], skip_special_tokens=True)

gr.Interface(fn=summarize_news, inputs="textbox", outputs="textbox", title="📰 Multi-News Summarizer").launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://500f930e782c3d9f5d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




**References:**
* https://huggingface.co/datasets/multi_news
* https://huggingface.co/facebook/bart-base
* https://huggingface.co/docs/transformers/model_doc/bart
* https://huggingface.co/docs/evaluate/index
* https://www.gradio.app


**UB box link to Video Recording:**

https://buffalo.box.com/s/qxc3ain1f6ic16nwaydd136381z056db