In [5]:
pip install transformers datasets peft accelerate bitsandbytes huggingface_hub




Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install transformers --upgrade

Note: you may need to restart the kernel to use updated packages.


In [7]:
pip show transformers


Name: transformers
Version: 4.52.0.dev0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /mnt/Data/sarmistha/.miniconda3/envs/finbot/lib/python3.10/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: bert-score, kvpress, peft, sentence-transformers, trl, unsloth_zoo
Note: you may need to restart the kernel to use updated packages.


In [None]:
#Fine tuning gemma

In [4]:
from huggingface_hub import login

login(token="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") 

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType
from transformers import DataCollatorForLanguageModeling

# CONFIG
model_id = "google/gemma-3-1b-it"
dataset_id = "ishani29/mahakumbh-news-summarization"
split = "train"
max_input_length = 512
max_target_length = 150
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",       # 🚀 Handles device placement correctly
    torch_dtype="auto"       # or torch.float16 if you want faster inference
)

# Apply PEFT LoRA
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # target attention modules (adjust for Gemma)
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Load dataset
dataset = load_dataset(dataset_id, split=split)

# Preprocess data
# def preprocess_function(example):
#     prompt = f"Summarize the following news article:\n{example['text']}\nSummary:"
#     inputs = tokenizer(prompt, max_length=max_input_length, truncation=True)
#     targets = tokenizer(example["summary"], max_length=max_target_length, truncation=True)
#     inputs["labels"] = targets["input_ids"]
#     return inputs
def preprocess_function(example):
    prompt = prompt = (
    "You are a helpful assistant trained to summarize Indian news articles concisely in less than or equal to 100 words.\n\n"
    "Article:\n{example['text']}\n\n"
    "Write a clear, factual, and concise summary and ensure that no noisy statements are added:"
)

    
    # Tokenize inputs and targets with padding
    model_inputs = tokenizer(
        prompt,
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example["summary"],
            max_length=max_target_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

    model_inputs["labels"] = labels["input_ids"]
    return {k: v.squeeze(0) for k, v in model_inputs.items()}


tokenized_dataset = dataset.map(preprocess_function)

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir="./gemma-lora-summary",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    learning_rate=5e-5,
    fp16=True,
    logging_dir="./logs",
    save_strategy="epoch",
    report_to="none",  # disable wandb
    logging_steps=10,
    push_to_hub=True,
    hub_model_id="ishani29/gemma-summary-lora",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train
trainer.train()


trainable params: 745,472 || all params: 1,000,631,424 || trainable%: 0.0745


Map:   0%|          | 0/851 [00:00<?, ? examples/s]

  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


Step,Training Loss
10,4.6748
20,4.3295
30,3.844
40,3.3455
50,2.8639
60,2.4104
70,1.9327
80,1.4327
90,0.9877
100,0.68


TrainOutput(global_step=1278, training_loss=0.4797283780406898, metrics={'train_runtime': 361.343, 'train_samples_per_second': 7.065, 'train_steps_per_second': 3.537, 'total_flos': 5479317016805376.0, 'train_loss': 0.4797283780406898, 'epoch': 3.0})

In [6]:
trainer.push_to_hub()


No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/ishani29/gemma-summary-lora/commit/13b21d7ae414175a3dc393625f0a716044e2801f', commit_message='End of training', commit_description='', oid='13b21d7ae414175a3dc393625f0a716044e2801f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ishani29/gemma-summary-lora', endpoint='https://huggingface.co', repo_type='model', repo_id='ishani29/gemma-summary-lora'), pr_revision=None, pr_num=None)

In [None]:
#Inference

In [26]:
import pandas as pd
import evaluate
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# ----- CONFIG -----
csv_input_path = "mahakumbh_test_predictions_gemmaaaa.csv"  # Change if your CSV has a different name
eval_output_path = "mahakumbh_eval_metrics_gemmaaaa.csv"
model_id = "ishani29/gemma-summary-lora"  # Replace with your Gemma model checkpoint ID if calculating perplexity

# ----- LOAD DATA -----
df = pd.read_csv(csv_input_path)
references = df["reference_summary"].tolist()
predictions = df["generated_summary"].tolist()
articles = df["article"].tolist()

# ----- EVALUATION METRICS -----
bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")

print("🔍 Computing BLEU...")
bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in references])

print("🔍 Computing ROUGE...")
rouge_score = rouge.compute(predictions=predictions, references=references)

print("🔍 Computing METEOR...")
meteor_score = meteor.compute(predictions=predictions, references=references)

print("🔍 Computing BERTScore...")
# bert_score = bertscore.compute(predictions=predictions, references=references, lang="en")

# # ----- (OPTIONAL) PERPLEXITY -----
# try:
#     print("🔍 Calculating Perplexity (optional)...")
#     tokenizer = AutoTokenizer.from_pretrained(model_id)
#     model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to("cuda" if torch.cuda.is_available() else "cpu")
#     model.eval()

#     perplexities = []
#     for article, reference in tqdm(zip(articles, references), total=len(articles), desc="Calculating Perplexity"):
#         inputs = tokenizer(article, return_tensors="pt", truncation=True, max_length=512).to(model.device)
#         labels = tokenizer(reference, return_tensors="pt", truncation=True, max_length=150).input_ids.to(model.device)

#         with torch.no_grad():
#             loss = model(input_ids=inputs.input_ids, labels=labels).loss
#             perplexities.append(torch.exp(loss).item())

#     df["perplexity"] = perplexities
#     avg_perplexity = sum(perplexities) / len(perplexities)
# except Exception as e:
#     print(f"⚠️ Perplexity skipped due to error: {e}")
#     avg_perplexity = None

# ----- SAVE METRICS -----
metrics = {
    "BLEU": bleu_score["score"] / 100,
    "ROUGE-1": rouge_score["rouge1"],
    "ROUGE-2": rouge_score["rouge2"],
    "ROUGE-L": rouge_score["rougeL"],
    "METEOR": meteor_score["meteor"],
#     "BERTScore_F1": sum(bert_score["f1"]) / len(bert_score["f1"]),
#     "Avg Perplexity": avg_perplexity if avg_perplexity is not None else "N/A"
}

pd.DataFrame([metrics]).to_csv(eval_output_path, index=False)

# ----- PRINT METRICS -----
print("\n📊 Evaluation Summary:")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")
print(f"\n✅ Evaluation metrics saved to: '{eval_output_path}'")


[nltk_data] Downloading package wordnet to
[nltk_data]     /mnt/Data/sarmistha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /mnt/Data/sarmistha/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /mnt/Data/sarmistha/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


🔍 Computing BLEU...
🔍 Computing ROUGE...
🔍 Computing METEOR...
🔍 Computing BERTScore...

📊 Evaluation Summary:
BLEU: 0.0545
ROUGE-1: 0.2844
ROUGE-2: 0.0944
ROUGE-L: 0.1818
METEOR: 0.2701

✅ Evaluation metrics saved to: 'mahakumbh_eval_metrics_gemmaaaa.csv'


In [15]:
pip install transformers datasets evaluate bert_score nltk sacrebleu


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting tabulate>=0.8.9 (from sacrebleu)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)


Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: tabulate, portalocker, colorama, sacrebleu, evaluate
Successfully installed colorama-0.4.6 evaluate-0.4.3 portalocker-3.1.1 sacrebleu-2.5.1 tabulate-0.9.0
Note: you may need to restart the kernel to use updated packages.
