In [1]:
pip install datasets transformers sentencePiece

Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import Dataset

import json

# Load your Thirukkural dataset
dataset_path = "/teamspace/studios/this_studio/dataset/thirukkural.json"
with open(dataset_path, "r", encoding="utf-8") as file:
    thirukkural_data = json.load(file)

print(f"Loaded {len(thirukkural_data)} Thirukkural entries.")

# Prepare data for fine-tuning
hf_data = Dataset.from_list([
    {"instruction": entry["instruction"], "response": json.dumps(entry["response"])}
    for entry in thirukkural_data
])


Loaded 1330 Thirukkural entries.


In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Load T5 small model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize data
def preprocess_data(example):
    inputs = tokenizer(
        example["instruction"], truncation=True, padding="max_length", max_length=512
    )
    labels = tokenizer(
        example["response"], truncation=True, padding="max_length", max_length=512
    )
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = hf_data.map(preprocess_data)

# Fine-tuning 
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
)

# InitTrainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Train model
trainer.train()




You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/1330 [00:00<?, ? examples/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,0.565177
2,0.878500,0.526704
3,0.878500,0.512214


TrainOutput(global_step=999, training_loss=0.7392757137019832, metrics={'train_runtime': 242.4368, 'train_samples_per_second': 16.458, 'train_steps_per_second': 4.121, 'total_flos': 540013787873280.0, 'train_loss': 0.7392757137019832, 'epoch': 3.0})

In [4]:
# Save ft model
model.save_pretrained("./t5_finetuned")
tokenizer.save_pretrained("./t5_finetuned")


('./t5_finetuned/tokenizer_config.json',
 './t5_finetuned/special_tokens_map.json',
 './t5_finetuned/spiece.model',
 './t5_finetuned/added_tokens.json')

In [11]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the fine-tuned model
model = T5ForConditionalGeneration.from_pretrained("./t5_finetuned")
tokenizer = T5Tokenizer.from_pretrained("./t5_finetuned")

# Generate a response
def generate_response(instruction):
    inputs = tokenizer(instruction, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(inputs["input_ids"], max_length=512, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example question
instruction = "Provide all details about Kural number 3."
response = generate_response(instruction)

print("Generated Response:")
print(response)


Generated Response:
"Number": "3", "kural": "u0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcd u0ba4u0bc1u0baeu0bcdu0ba9u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcd u0ba4u0bc1u0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcdu0ba4u0bcd


In [1]:
# necessary packages 
!pip install sentence-transformers indic-nlp-library

# libraries
from sentence_transformers import SentenceTransformer, util
import json
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

# Load the Tamil Sentence Transformer model (Indic-BERT)
model = SentenceTransformer("ai4bharat/indic-bert")

# Load the Thirukkural dataset
dataset_path = "/teamspace/studios/this_studio/dataset/thirukkural.json"
with open(dataset_path, "r", encoding="utf-8") as file:
    thirukkural_data = json.load(file)

# Preprocess the dataset to extract instructions and responses
instructions = [entry["instruction"] for entry in thirukkural_data]
responses = [entry["response"] for entry in thirukkural_data]

# Encode the instructions into embeddings
instruction_embeddings = model.encode(instructions, convert_to_tensor=True)

# normalize Tamil text
normalizer = IndicNormalizerFactory().get_normalizer("ta")

def normalize_input(text):
    """
    Normalize Tamil text to improve matching accuracy.
    """
    return normalizer.normalize(text)

# Define the retrieval function to fetch the best match
def retrieve_best_match(question):
    """
    Given a question, retrieve the best matching Kural from the dataset.
    """
    # Normalize the question
    normalized_question = normalize_input(question)
    
    # Encode the question to get its embedding
    question_embedding = model.encode(normalized_question, convert_to_tensor=True)
    
    # Compute cosine similarities between the question and all instructions
    similarities = util.cos_sim(question_embedding, instruction_embeddings)
    
    # Find the best match (highest similarity score)
    best_match_idx = similarities.argmax().item()
    return responses[best_match_idx]

# Test the retrieval system
question = "What is the meaning of Kural number 1?"
best_response = retrieve_best_match(question)

# Display the retrieved response
print("Retrieved Response:")
print(f"Number: {best_response['Number']}")
print(f"Kural: {best_response['kural']}")
print(f"Meaning (MK): {best_response['mk']}")
print(f"Explanation: {best_response['explanation']}")
print(f"Adikaram: {best_response['adikaram_name']}")
print(f"Iyal: {best_response['iyal_name']}")

# Optional - Retrieve top N matches for ranking
def retrieve_top_matches(question, top_n=3):
    """
    Retrieve the top N best matching Kurals for a given question.
    """
    normalized_question = normalize_input(question)
    question_embedding = model.encode(normalized_question, convert_to_tensor=True)
    similarities = util.cos_sim(question_embedding, instruction_embeddings)
    
    # Get top N matches
    top_indices = similarities.topk(top_n).indices.tolist()
    return [responses[idx] for idx in top_indices]

# Example matches
top_matches = retrieve_top_matches(question, top_n=1)
for idx, match in enumerate(top_matches, 1):
    print(f"Match {idx}:")
    print(f"Number: {match['Number']}")
    print(f"Kural: {match['kural']}")
    print(f"Explanation: {match['explanation']}")
    print("------")


Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting sphinx>=5.1.0 (from sphinx-argparse->indic-nlp-library)
  Downloading sphinx-8.1.3-py3-none-any.whl.metadata (6.4 kB)
Collecting docutils>=0.19 (from sphinx-argparse->indic-nlp-library)
  Downloading docutils-0.21.2-py3-none-any.whl.metadata (2.8 kB)
Collecting sphinxcontrib-jquery<5,>=4 (from sphinx-rtd-theme->indic-nlp-library)
  Downloading sphinxcontrib_jquery-4.1-py2.py3-none-any.whl.metadata (2.6 kB)
Col

No sentence-transformers model found with name ai4bharat/indic-bert. Creating a new one with mean pooling.


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

Retrieved Response:
Number: 5
Kural: இருள்சேர் இருவினையும் சேரா இறைவன் பொருள்சேர் புகழ்புரிந்தார் மாட்டு.
Meaning (MK): இறைவன் என்பதற்குரிய பொருளைப் புரிந்து கொண்டு புகழ் பெற விரும்புகிறவர்கள், நன்மை தீமைகளை ஒரே அளவில் எதிர் கொள்வார்கள்
Explanation: The two-fold deeds that spring from darkness shall not adhere to those who delight in the true praise of God
Adikaram: கடவுள் வாழ்த்து
Iyal: பாயிரவியல்


TypeError: list indices must be integers or slices, not list

In [6]:
# encoding when saving the dataset
import json

with open("/teamspace/studios/this_studio/dataset/thirukkural.json", "w", encoding="utf-8") as file:
    json.dump(thirukkural_data, file, ensure_ascii=False, indent=4)


In [7]:
text = "அகர முதல எழுத்தெல்லாம் ஆதி பகவன் முதற்றே உலகு."
tokenized = tokenizer.encode(text, return_tensors="pt")
decoded = tokenizer.decode(tokenized[0])

print("Original Text:", text)
print("Tokenized:", tokenized)
print("Decoded:", decoded)


Original Text: அகர முதல எழுத்தெல்லாம் ஆதி பகவன் முதற்றே உலகு.
Tokenized: tensor([[3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 5, 1]])
Decoded: <unk> <unk> <unk> <unk> <unk> <unk> <unk>.</s>


In [None]:
outputs = model.generate(
    inputs["input_ids"],
    max_length=512,
    num_beams=4,
    early_stopping=True,
    repetition_penalty=1.2,  # Penalize repeated tokens
    length_penalty=1.0,      # Prefer natural-length sentences
    no_repeat_ngram_size=2   # Avoid repetitive n-grams
)
