In [1]:
!pip install -U langchain langchain-community langchain-huggingface transformers accelerate
!pip install -q datasets sentencepiece gradio torch peft huggingface_hub
!pip install transformers datasets sentencepiece torch accelerate gradio --quiet


Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Collecting transformers
  Downloading transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-

In [2]:
!pip install transformers gradio torch --quiet

In [3]:
from transformers import pipeline
import gradio as gr


In [None]:
import os, json, random
import torch
from pathlib import Path
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, pipeline
)
from datasets import load_dataset

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


Using device: cuda


In [None]:
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# Base examples
base_pairs = [
    {"question": "'राम' शब्दस्य प्रथमा एकवचनं किम्?", "answer": "रामः"},
    {"question": "'रामम्' इति का विभक्तिः?", "answer": "द्वितीया विभक्तिः"},
    {"question": "सीता का लिङ्गः?", "answer": "स्त्रीलिङ्गः"},
    {"question": "What is 'vibhakti' in Sanskrit grammar?", "answer": "विभक्तिः = case inflection (7 cases)."},
    {"question": "गजस्य बहुवचन प्रथमा किम्?", "answer": "गजाः"},
    {"question": "What is sandhi (सन्धि)?", "answer": "सन्धिः = euphonic combination."},
]

# Add a few more grammar exercises
grammar_exercises = [
    {"question": "रामस्य सम्बोधन किम्?", "answer": "हे राम!"},
    {"question": "Translate: 'The boy goes to school' in Sanskrit.", "answer": "बालकः विद्यालयं गच्छति।"},
    {"question": "What is a dhātu (धातु)?", "answer": "धातुः = verb root, e.g. √गम् = to go."},
    {"question": "What is समास?", "answer": "समासः = compound word formation, e.g. राजपुत्रः."},
]

augmented = base_pairs + grammar_exercises

# Expand to ~200 examples by random sampling
while len(augmented) < 200:
    augmented.append(random.choice(augmented))

random.shuffle(augmented)

# Split train/valid
split_idx = int(0.85 * len(augmented))
train_data, valid_data = augmented[:split_idx], augmented[split_idx:]

def save_jsonl(data_list, filepath):
    with open(filepath, "w", encoding="utf-8") as f:
        for item in data_list:
            json_line = {"input": item["question"], "output": item["answer"]}
            f.write(json.dumps(json_line, ensure_ascii=False) + "\n")

save_jsonl(train_data, DATA_DIR / "train.jsonl")
save_jsonl(valid_data, DATA_DIR / "valid.jsonl")

print(f"Saved {len(train_data)} training and {len(valid_data)} validation examples")


Saved 170 training and 30 validation examples


In [None]:
data_files = {"train": "data/train.jsonl", "validation": "data/valid.jsonl"}
datasets = load_dataset("json", data_files=data_files)
print(datasets)


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 170
    })
    validation: Dataset({
        features: ['input', 'output'],
        num_rows: 30
    })
})


In [None]:
model_name = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

max_input_length = 64
max_target_length = 64

def preprocess(batch):
    model_inputs = tokenizer(batch["input"], max_length=max_input_length,
                             truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch["output"], max_length=max_target_length,
                           truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = datasets.map(preprocess, batched=True, remove_columns=["input", "output"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/170 [00:00<?, ? examples/s]



Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="mt5-sanskrit-chatbot",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=5,   # Increased epochs for better training
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=20,
    push_to_hub=False
)

In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


  trainer = Seq2SeqTrainer(


In [None]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,50.4064,42.484966
2,47.6567,36.942089
3,45.5123,34.811604
4,45.3301,33.778103
5,44.4227,33.41243


TrainOutput(global_step=110, training_loss=46.427593716708095, metrics={'train_runtime': 92.1085, 'train_samples_per_second': 9.228, 'train_steps_per_second': 1.194, 'total_flos': 56179654656000.0, 'train_loss': 46.427593716708095, 'epoch': 5.0})

In [None]:
trainer.save_model("sanskrit_finetuned_mt5")
tokenizer.save_pretrained("sanskrit_finetuned_mt5")

('sanskrit_finetuned_mt5/tokenizer_config.json',
 'sanskrit_finetuned_mt5/special_tokens_map.json',
 'sanskrit_finetuned_mt5/spiece.model',
 'sanskrit_finetuned_mt5/added_tokens.json',
 'sanskrit_finetuned_mt5/tokenizer.json')

In [None]:
# Cell 11: Inference

from transformers import pipeline

# Local model path (make sure this folder exists after training)
model_path = "./sanskrit_finetuned_mt5"

# Load pipeline from local directory
generator = pipeline(
    "text2text-generation",
    model=model_path,
    tokenizer=model_path,
    device=0  # use GPU
)

# Example query
query = "राम शब्दस्य प्रथमा एकवचनं किम्?"

# Generate answer
output = generator(
    query,
    max_new_tokens=50,
    num_beams=4,
    early_stopping=True
)

print("Input:", query)
print("Output:", output[0]["generated_text"])

Device set to use cuda:0


Input: राम शब्दस्य प्रथमा एकवचनं किम्?
Output: <extra_id_0>


In [4]:
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("# 📖 Sanskrit Grammar Chatbot (Fine-tuned)")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask me in Sanskrit or English...")
    clear = gr.Button("Clear")

    def respond(message, chat_history):
        bot_reply = ask_bot(message)
        chat_history.append((message, bot_reply))
        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch()
