In [5]:
pip install transformers datasets peft accelerate bitsandbytes huggingface_hub




Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install transformers --upgrade

Note: you may need to restart the kernel to use updated packages.


In [7]:
pip show transformers


Name: transformers
Version: 4.52.0.dev0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /mnt/Data/sarmistha/.miniconda3/envs/finbot/lib/python3.10/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: bert-score, kvpress, peft, sentence-transformers, trl, unsloth_zoo
Note: you may need to restart the kernel to use updated packages.


In [10]:
from huggingface_hub import login

login(token="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") 

In [11]:
import torch
from datasets import load_dataset
from transformers import LEDTokenizer, LEDForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq

# Configuration
model_id = "allenai/led-base-16384"
dataset_id = "ishani29/mahakumbh-news-summarization"
split = "train"
max_input_length = 4096  # LED supports up to 16,384
max_target_length = 512
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer and model
tokenizer = LEDTokenizer.from_pretrained(model_id)
model = LEDForConditionalGeneration.from_pretrained(model_id).to(device)

# Load dataset
dataset = load_dataset(dataset_id, split=split)

# Preprocessing function
def preprocess(example):
    input_text = f"{example['text']}"
    target_text = example['summary']

    model_input = tokenizer(
        input_text,
        truncation=True,
        max_length=max_input_length,
        padding="max_length"
    )

    with tokenizer.as_target_tokenizer():
        label = tokenizer(
            target_text,
            truncation=True,
            max_length=max_target_length,
            padding="max_length"
        )

    model_input["labels"] = label["input_ids"]

    # Set global attention on <s> token (first token)
    model_input["global_attention_mask"] = [1] + [0] * (len(model_input["input_ids"]) - 1)

    return model_input

# Tokenize dataset
tokenized = dataset.map(preprocess, remove_columns=dataset.column_names)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training arguments
training_args = TrainingArguments(
    output_dir="./led-news-summarizer",
    per_device_train_batch_size=1,  # larger input size → smaller batch
    num_train_epochs=3,
    learning_rate=3e-5,
    fp16=True,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=20,
    report_to="none",
    push_to_hub=False
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train
trainer.train()


  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
20,9.4527
40,4.5748
60,2.9578
80,1.9162
100,1.0512
120,0.6218
140,0.4208
160,0.3685
180,0.3287
200,0.3266




TrainOutput(global_step=1278, training_loss=0.5099634579462997, metrics={'train_runtime': 842.8294, 'train_samples_per_second': 3.029, 'train_steps_per_second': 1.516, 'total_flos': 6893622943285248.0, 'train_loss': 0.5099634579462997, 'epoch': 3.0})

In [12]:
model.push_to_hub("ishani29/led-mahakumbh")
tokenizer.push_to_hub("ishani29/led-mahakumbh")


model.safetensors:   0%|          | 0.00/648M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ishani29/led-mahakumbh/commit/fd53c540324139f68d3b2a89b0f7de8c45fec3fd', commit_message='Upload tokenizer', commit_description='', oid='fd53c540324139f68d3b2a89b0f7de8c45fec3fd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ishani29/led-mahakumbh', endpoint='https://huggingface.co', repo_type='model', repo_id='ishani29/led-mahakumbh'), pr_revision=None, pr_num=None)

In [9]:
pip install evaluate bert-score




Note: you may need to restart the kernel to use updated packages.


In [13]:
import torch
from datasets import load_dataset
from transformers import LEDTokenizer, LEDForConditionalGeneration
from evaluate import load as load_metric
from tqdm import tqdm
import csv

# Paths
model_path = "ishani29/led-mahakumbh"
dataset_id = "ishani29/mahakumbh-news-summarization"
split = "test"
output_file = "generated_summaries_led.csv"

# Load model & tokenizer
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = LEDTokenizer.from_pretrained(model_path)
model = LEDForConditionalGeneration.from_pretrained(model_path).to(device)
model.eval()

# Load test dataset
dataset = load_dataset(dataset_id, split=split)

# Load metrics
rouge = load_metric("rouge")
bleu = load_metric("bleu")
bertscore = load_metric("bertscore")

# Generate summaries
generated_summaries = []
reference_summaries = []

for example in tqdm(dataset, desc="Evaluating"):
    input_text = example["text"]
    reference = example["summary"]

    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=4096  # LED supports long sequences
    ).to(device)

    # Set global attention on <s> token
    global_attention_mask = torch.zeros_like(inputs["input_ids"])
    global_attention_mask[:, 0] = 1  # global attention on first token

    with torch.no_grad():
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            global_attention_mask=global_attention_mask,
            max_length=150,
            num_beams=4,
            early_stopping=True
        )

    decoded_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    generated_summaries.append(decoded_summary)
    reference_summaries.append(reference)

# Save to CSV
with open(output_file, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Generated Summary", "Reference Summary"])
    for gen_summary, ref_summary in zip(generated_summaries, reference_summaries):
        writer.writerow([gen_summary, ref_summary])

# Evaluate ROUGE
rouge_result = rouge.compute(predictions=generated_summaries, references=reference_summaries)
print("\n🔸 ROUGE Scores:")
for key in ["rouge1", "rouge2", "rougeL"]:
    print(f"{key}: {rouge_result[key]:.4f}")

# Evaluate BLEU
tokenized_preds = [' '.join(pred.split()) for pred in generated_summaries]
tokenized_refs = [[' '.join(ref.split())] for ref in reference_summaries]
bleu_result = bleu.compute(predictions=tokenized_preds, references=tokenized_refs)
print(f"\n🔸 BLEU Score: {bleu_result['bleu']:.4f}")

# Evaluate BERTScore
bertscore_result = bertscore.compute(predictions=generated_summaries,
                                     references=reference_summaries,
                                     lang="en")
bert_f1 = sum(bertscore_result["f1"]) / len(bertscore_result["f1"])
print(f"\n🔸 BERTScore (F1): {bert_f1:.4f}")


tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/648M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Evaluating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 151/151 [02:25<00:00,  1.04it/s]



🔸 ROUGE Scores:
rouge1: 0.5698
rouge2: 0.3485
rougeL: 0.4466

🔸 BLEU Score: 0.3173


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🔸 BERTScore (F1): 0.9189


In [14]:
import pandas as pd
import evaluate
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# ----- CONFIG -----
csv_input_path = "generated_summaries_led.csv"  # Change if your CSV has a different name
eval_output_path = "generated_summaries_led_metrics.csv"
model_id = "ishani29/led-mahakumbh"  # Replace with your Gemma model checkpoint ID if calculating perplexity

# ----- LOAD DATA -----
df = pd.read_csv(csv_input_path)
references = df["Reference Summary"].tolist()
predictions = df["Generated Summary"].tolist()
# articles = df["article"].tolist()

# ----- EVALUATION METRICS -----
bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")

print("🔍 Computing BLEU...")
bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in references])

print("🔍 Computing ROUGE...")
rouge_score = rouge.compute(predictions=predictions, references=references)

print("🔍 Computing METEOR...")
meteor_score = meteor.compute(predictions=predictions, references=references)

print("🔍 Computing BERTScore...")
bert_score = bertscore.compute(predictions=predictions, references=references, lang="en")

# # ----- (OPTIONAL) PERPLEXITY -----
# try:
#     print("🔍 Calculating Perplexity (optional)...")
#     tokenizer = AutoTokenizer.from_pretrained(model_id)
#     model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to("cuda" if torch.cuda.is_available() else "cpu")
#     model.eval()

#     perplexities = []
#     for article, reference in tqdm(zip(articles, references), total=len(articles), desc="Calculating Perplexity"):
#         inputs = tokenizer(article, return_tensors="pt", truncation=True, max_length=512).to(model.device)
#         labels = tokenizer(reference, return_tensors="pt", truncation=True, max_length=150).input_ids.to(model.device)

#         with torch.no_grad():
#             loss = model(input_ids=inputs.input_ids, labels=labels).loss
#             perplexities.append(torch.exp(loss).item())

#     df["perplexity"] = perplexities
#     avg_perplexity = sum(perplexities) / len(perplexities)
# except Exception as e:
#     print(f"⚠️ Perplexity skipped due to error: {e}")
#     avg_perplexity = None

# ----- SAVE METRICS -----
metrics = {
    "BLEU": bleu_score["score"] / 100,
    "ROUGE-1": rouge_score["rouge1"],
    "ROUGE-2": rouge_score["rouge2"],
    "ROUGE-L": rouge_score["rougeL"],
    "METEOR": meteor_score["meteor"],
    "BERTScore_F1": sum(bert_score["f1"]) / len(bert_score["f1"]),
#     "Avg Perplexity": avg_perplexity if avg_perplexity is not None else "N/A"
}

pd.DataFrame([metrics]).to_csv(eval_output_path, index=False)

# ----- PRINT METRICS -----
print("\n📊 Evaluation Summary:")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")
print(f"\n✅ Evaluation metrics saved to: '{eval_output_path}'")


[nltk_data] Downloading package wordnet to
[nltk_data]     /mnt/Data/sarmistha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /mnt/Data/sarmistha/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /mnt/Data/sarmistha/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


🔍 Computing BLEU...
🔍 Computing ROUGE...
🔍 Computing METEOR...
🔍 Computing BERTScore...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



📊 Evaluation Summary:
BLEU: 0.3173
ROUGE-1: 0.5698
ROUGE-2: 0.3485
ROUGE-L: 0.4466
METEOR: 0.5045
BERTScore_F1: 0.9189

✅ Evaluation metrics saved to: 'generated_summaries_led_metrics.csv'
