In [4]:
pip install transformers datasets peft accelerate bitsandbytes huggingface_hub




Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
from datasets import Dataset

# Load CSV
df = pd.read_csv("dataset.csv")

# Standardize column names
df = df.rename(columns={"Full Article": "text", "Summary": "summary"})

# Drop rows with missing values
df.dropna(subset=["text", "summary"], inplace=True)

# Drop rows with error messages in summaries
error_message = (
    "I'm sorry, but I cannot provide a summary of the news article as the content seems to be missing or unavailable due to a \"404 - Not Found\" error. "
    "Please try accessing the article again or provide a different link for me to summarize."
)

df = df[df["summary"] != error_message]

# Reset index
df.reset_index(drop=True, inplace=True)

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Optional: train-test split
dataset = dataset.train_test_split(test_size=0.15)

# Save locally (optional)
dataset.save_to_disk("mahakumbh_dataset_cleaned_hf")


Saving the dataset (0/1 shards):   0%|          | 0/851 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/151 [00:00<?, ? examples/s]

In [8]:
from huggingface_hub import login

login(token="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")  # Get from https://huggingface.co/settings/tokens

dataset.push_to_hub("ishani29/mahakumbh-news-summarization")


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/496 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/ishani29/mahakumbh-news-summarization/commit/ce3b5092132d94ecd77e72ea51765ff4f83caea6', commit_message='Upload dataset', commit_description='', oid='ce3b5092132d94ecd77e72ea51765ff4f83caea6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/ishani29/mahakumbh-news-summarization', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ishani29/mahakumbh-news-summarization'), pr_revision=None, pr_num=None)

In [21]:
from transformers import AutoTokenizer
from datasets import load_dataset
split = "train"
dataset_id = "ishani29/mahakumbh-news-summarization"
dataset = load_dataset(dataset_id)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + text for text in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/151 [00:00<?, ? examples/s]



In [22]:
from peft import get_peft_model, LoraConfig, TaskType
import torch
from transformers import AutoModelForSeq2SeqLM
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(device)

# peft_config = LoraConfig(
#     task_type=TaskType.SEQ_2_SEQ_LM,
#     r=8,
#     lora_alpha=32,
#     lora_dropout=0.1,
#     bias="none"
# )

# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()


In [23]:
tokenized_datasets["train"]

Dataset({
    features: ['Title', 'Link', 'text', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 851
})

In [24]:
tokenized_datasets["test"]

Dataset({
    features: ['Title', 'Link', 'text', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 151
})

In [21]:
# train_dataset = dataset["train"]
# eval_dataset = dataset["test"]

In [25]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_strategy="epoch",
    save_strategy="epoch",
    eval_strategy="epoch",  # This is valid
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
    hub_model_id="ishani29/mahakumbh-flan-t5",
    report_to="none",  # Optional: disables wandb
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()


  trainer = Seq2SeqTrainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,
3,0.0,


TrainOutput(global_step=321, training_loss=0.0, metrics={'train_runtime': 87.0351, 'train_samples_per_second': 29.333, 'train_steps_per_second': 3.688, 'total_flos': 1733029914710016.0, 'train_loss': 0.0, 'epoch': 3.0})

In [None]:
pip install rouge_score

In [None]:
pip install transformers --upgrade

In [None]:
pip show transformers


In [23]:
trainer.push_to_hub()


CommitInfo(commit_url='https://huggingface.co/ishani29/mahakumbh-flan-t5/commit/75562cdc4bb2b6358b7a05bda384f2473664147f', commit_message='End of training', commit_description='', oid='75562cdc4bb2b6358b7a05bda384f2473664147f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ishani29/mahakumbh-flan-t5', endpoint='https://huggingface.co', repo_type='model', repo_id='ishani29/mahakumbh-flan-t5'), pr_revision=None, pr_num=None)

In [14]:
model.save_pretrained("mahakumbh-t5-lora")
tokenizer.save_pretrained("mahakumbh-t5-lora")


('mahakumbh-t5-lora/tokenizer_config.json',
 'mahakumbh-t5-lora/special_tokens_map.json',
 'mahakumbh-t5-lora/spiece.model',
 'mahakumbh-t5-lora/added_tokens.json',
 'mahakumbh-t5-lora/tokenizer.json')

In [None]:
#Inference

In [19]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import evaluate
import pandas as pd
import nltk
from tqdm import tqdm
import csv
nltk.download("punkt")

# ----- CONFIG -----
model_id = "ishani29/mahakumbh-news-summarization"
dataset_id = "ishani29/mahakumbh-news-summarization"
split = "test"
max_input_length = 512
max_target_length = 150
device = "cuda" if torch.cuda.is_available() else "cpu"
csv_output_path = "mahakumbh_test_predictions.csv"

# ----- LOAD MODEL -----
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)

# ----- LOAD DATASET -----
dataset = load_dataset(dataset_id, split=split)

# Optional: shrink for speed/debug
# dataset = dataset.select(range(100))

# ----- INITIALIZE CSV -----
with open(csv_output_path, mode="w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["article", "reference_summary", "generated_summary"])  # no perplexity yet

# ----- INFERENCE & SAVE -----
for example in tqdm(dataset, desc="Generating Summaries"):
    article = example["text"]
    reference = example["summary"]

    inputs = tokenizer(article, return_tensors="pt", truncation=True, max_length=max_input_length).to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_target_length)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    with open(csv_output_path, mode="a", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([article, reference, summary])

# ----- LOAD RESULTS FOR EVALUATION -----
df = pd.read_csv(csv_output_path)
references = df["reference_summary"].tolist()
predictions = df["generated_summary"].tolist()
articles = df["article"].tolist()

# ----- EVALUATION METRICS -----
bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")

bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
rouge_score = rouge.compute(predictions=predictions, references=references)
meteor_score = meteor.compute(predictions=predictions, references=references)
bert_score = bertscore.compute(predictions=predictions, references=references, lang="en")

# ----- PERPLEXITY -----
perplexities = []
for article, reference in tqdm(zip(articles, references), total=len(articles), desc="Calculating Perplexity"):
    inputs = tokenizer(article, return_tensors="pt", truncation=True, max_length=max_input_length).to(device)
    labels = tokenizer(reference, return_tensors="pt", truncation=True, max_length=max_target_length).input_ids.to(device)
    with torch.no_grad():
        loss = model(input_ids=inputs.input_ids, labels=labels).loss
        perplexities.append(torch.exp(loss).item())

df["perplexity"] = perplexities
df.to_csv(csv_output_path, index=False)

avg_perplexity = sum(perplexities) / len(perplexities)

# ----- SAVE METRICS -----
metrics = {
    "BLEU": bleu_score["score"],
    "ROUGE-1": rouge_score["rouge1"],
    "ROUGE-2": rouge_score["rouge2"],
    "ROUGE-L": rouge_score["rougeL"],
    "METEOR": meteor_score["meteor"],
    "BERTScore_F1": sum(bert_score["f1"]) / len(bert_score["f1"]),
    "Avg Perplexity": avg_perplexity
}
pd.DataFrame([metrics]).to_csv("mahakumbh_eval_metrics.csv", index=False)

# ----- PRINT METRICS -----
print("\n📊 Evaluation Summary:")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")
print("\n✅ CSVs saved: 'mahakumbh_test_predictions.csv', 'mahakumbh_eval_metrics.csv'")


[nltk_data] Downloading package punkt to
[nltk_data]     /mnt/Data/sarmistha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Generating Summaries: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 151/151 [01:38<00:00,  1.53it/s]


Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to
[nltk_data]     /mnt/Data/sarmistha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /mnt/Data/sarmistha/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /mnt/Data/sarmistha/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Calculating Perplexity: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 151/151 [00:06<00:00, 23.45it/s]


📊 Evaluation Summary:
BLEU: 0.4938
ROUGE-1: 0.1865
ROUGE-2: 0.0960
ROUGE-L: 0.1540
METEOR: 0.1013
BERTScore_F1: 0.8495
Avg Perplexity: 8.5157

✅ CSVs saved: 'mahakumbh_test_predictions.csv', 'mahakumbh_eval_metrics.csv'





In [24]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import evaluate
import pandas as pd
import nltk
from tqdm import tqdm
import csv
nltk.download("punkt")

# ----- CONFIG -----
model_id = "ishani29/mahakumbh-flan-t5"
dataset_id = "ishani29/mahakumbh-news-summarization"
split = "test"
max_input_length = 512
max_target_length = 150
min_target_length = 80  # to avoid too short summaries
num_beams = 4           # for better results than greedy decoding
device = "cuda" if torch.cuda.is_available() else "cpu"
csv_output_path = "mahakumbh_test_predictions_flan-t5.csv"

# ----- LOAD MODEL -----
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)

# ----- LOAD DATASET -----
dataset = load_dataset(dataset_id, split=split)

# ----- INITIALIZE CSV -----
with open(csv_output_path, mode="w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["article", "reference_summary", "generated_summary"])

# ----- INFERENCE & SAVE -----
for example in tqdm(dataset, desc="Generating Summaries"):
    article = example["text"]
    reference = example["summary"]

    inputs = tokenizer(article, return_tensors="pt", truncation=True, max_length=max_input_length).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_target_length,
            min_length=min_target_length,
            num_beams=num_beams,
            early_stopping=True,
            no_repeat_ngram_size=3,
            repetition_penalty=1.2,
            length_penalty=1.0,  # 1.0 = neutral; >1 discourages long outputs
            output_attentions=False,  # set True if you want to visualize attention
        )
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    with open(csv_output_path, mode="a", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([article, reference, summary])

# ----- LOAD RESULTS FOR EVALUATION -----
df = pd.read_csv(csv_output_path)
references = df["reference_summary"].tolist()
predictions = df["generated_summary"].tolist()
articles = df["article"].tolist()

# ----- EVALUATION METRICS -----
bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")

bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
rouge_score = rouge.compute(predictions=predictions, references=references)
meteor_score = meteor.compute(predictions=predictions, references=references)
bert_score = bertscore.compute(predictions=predictions, references=references, lang="en")

# ----- PERPLEXITY -----
perplexities = []
for article, reference in tqdm(zip(articles, references), total=len(articles), desc="Calculating Perplexity"):
    inputs = tokenizer(article, return_tensors="pt", truncation=True, max_length=max_input_length).to(device)
    labels = tokenizer(reference, return_tensors="pt", truncation=True, max_length=max_target_length).input_ids.to(device)
    with torch.no_grad():
        loss = model(input_ids=inputs.input_ids, labels=labels).loss
        perplexities.append(torch.exp(loss).item())

df["perplexity"] = perplexities
df.to_csv(csv_output_path, index=False)

avg_perplexity = sum(perplexities) / len(perplexities)

# ----- SAVE METRICS -----
metrics = {
    "BLEU": bleu_score["score"],
    "ROUGE-1": rouge_score["rouge1"],
    "ROUGE-2": rouge_score["rouge2"],
    "ROUGE-L": rouge_score["rougeL"],
    "METEOR": meteor_score["meteor"],
    "BERTScore_F1": sum(bert_score["f1"]) / len(bert_score["f1"]),
    "Avg Perplexity": avg_perplexity
}
pd.DataFrame([metrics]).to_csv("mahakumbh_eval_metrics.csv", index=False)

# ----- PRINT METRICS -----
print("\n📊 Evaluation Summary:")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")
print("\n✅ CSVs saved: 'mahakumbh_test_predictions.csv', 'mahakumbh_eval_metrics.csv'")


[nltk_data] Downloading package punkt to
[nltk_data]     /mnt/Data/sarmistha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/496 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/881k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/206k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/851 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/151 [00:00<?, ? examples/s]

Generating Summaries: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 151/151 [06:20<00:00,  2.52s/it]
[nltk_data] Downloading package wordnet to
[nltk_data]     /mnt/Data/sarmistha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /mnt/Data/sarmistha/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /mnt/Data/sarmistha/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Calculating Perplexity: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 


📊 Evaluation Summary:
BLEU: 12.7642
ROUGE-1: 0.3784
ROUGE-2: 0.1805
ROUGE-L: 0.2742
METEOR: 0.2732
BERTScore_F1: 0.8744
Avg Perplexity: 8.2018

✅ CSVs saved: 'mahakumbh_test_predictions.csv', 'mahakumbh_eval_metrics.csv'





In [15]:
pip install transformers datasets evaluate bert_score nltk sacrebleu


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting tabulate>=0.8.9 (from sacrebleu)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)


Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: tabulate, portalocker, colorama, sacrebleu, evaluate
Successfully installed colorama-0.4.6 evaluate-0.4.3 portalocker-3.1.1 sacrebleu-2.5.1 tabulate-0.9.0
Note: you may need to restart the kernel to use updated packages.
