In [1]:
!pip install transformers torch transformers datasets pandas numpy sentencepiece accelerate



In [3]:
import pandas as pd
import json
from typing import List, Dict

def convert_csv_to_training_format(csv_file: str) -> List[Dict]:
    df = pd.read_csv("/teamspace/studios/this_studio/Dataset/thirukkural.csv")
    training_data = []
    
    # Create different question patterns
    question_patterns = [
        "What is Thirukkural {number}?",
        "Tell me about the Thirukkural from {adikaram_name}",
        "What does Thirukkural say in {paul_name}?",
        "Explain the meaning of Thirukkural {number}",
        "What is the explanation of Thirukkural about {adikaram_name}?"
    ]
    
    for _, row in df.iterrows():
        response = {
            "Number": str(row['Number']),
            "kural": row['kural'],
            "mk": row['mk'],
            "explanation": row['explanation'],
            "adikaram_name": row['adikaram_name'],
            "iyal_name": row['iyal_name'],
            "paul_translation": row['paul_translation']
        }
        
        for pattern in question_patterns:
            sample = {
                "instruction": pattern.format(
                    number=row['Number'],
                    adikaram_name=row['adikaram_name'],
                    paul_name=row['paul_name']
                ),
                "response": response
            }
            training_data.append(sample)
    
    return training_data

In [4]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
import torch

class ThirukkuralDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        
        text = (
            f"### Instruction: {item['instruction']}\n"
            f"### Response: {json.dumps(item['response'], ensure_ascii=False)}\n"
        )
        
        # Tokenizing
        encodings = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encodings['input_ids'][0],
            'attention_mask': encodings['attention_mask'][0],
            'labels': encodings['input_ids'][0].clone()
        }

In [29]:
def setup_training(training_data: List[Dict]):
    # Initialize tokenizer and model
    model_name = "gpt2"  # Changed to GPT-2 which is compatible with AutoModelForCausalLM
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True
    )
    
    # Add special tokens for Tamil
    special_tokens = {
        "pad_token": "[PAD]",
        "sep_token": "[SEP]",
        "additional_special_tokens": ["[TAMIL]"]  # Added special token for Tamil text
    }
    tokenizer.add_special_tokens(special_tokens)
    model.resize_token_embeddings(len(tokenizer))
    
    # Split data into train and validation sets
    train_data, val_data = train_test_split(training_data, test_size=0.1, random_state=42)
    
    # Create datasets with smaller max_length
    train_dataset = ThirukkuralDataset(train_data, tokenizer, max_length=128)
    val_dataset = ThirukkuralDataset(val_data, tokenizer, max_length=128)
    
    # Training arguments optimized for memory efficiency
    training_args = TrainingArguments(
        output_dir="./thirukkural_qa_model",
        num_train_epochs=5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        learning_rate=5e-5,
        fp16=True,
        gradient_checkpointing=True,
        logging_steps=10,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        optim="adamw_torch"
    )
    
    return model, tokenizer, train_dataset, val_dataset, training_args

In [30]:

def main():
    # Set memory efficient settings
    torch.cuda.empty_cache()  # Clear GPU cache
    
    # Environment variable for memory management
    import os
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb=512'
    
    # Load and convert data
    training_data = convert_csv_to_training_format('thirukkural.csv')
    
    # Setup training with train and validation datasets
    model, tokenizer, train_dataset, val_dataset, training_args = setup_training(training_data)
    
    # Initialize trainer with both datasets
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
    )
    
    # Train
    trainer.train()
    
    # Save model and tokenizer
    trainer.save_model("./final_thirukkural_model")
    tokenizer.save_pretrained("./final_thirukkural_model")

In [31]:
def generate_answer(question: str, model, tokenizer):
    prompt = f"### Instruction: {question}\n### Response:"
    
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=256
    )
    
    outputs = model.generate(
        inputs.input_ids,
        max_length=300,
        temperature=0.7,
        num_beams=4,
        no_repeat_ngram_size=2,
        do_sample=True
    )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Parse and format the response
    try:
        response_data = json.loads(response.split("### Response: ")[-1].strip())
        return format_response(response_data)
    except:
        return response

def format_response(response_data):
    return f"""
Kural: {response_data['kural']}
Meaning: {response_data['explanation']}
Chapter: {response_data['adikaram_name']}
Detailed Explanation: {response_data['mk']}
"""

In [33]:
def setup_training(training_data: List[Dict]):
    # Initialize tokenizer and model
    model_name = "gpt2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        low_cpu_mem_usage=True
    )
    
    # Add special tokens for Tamil
    special_tokens = {
        "pad_token": "[PAD]",
        "sep_token": "[SEP]",
        "additional_special_tokens": ["[TAMIL]"]
    }
    tokenizer.add_special_tokens(special_tokens)
    model.resize_token_embeddings(len(tokenizer))
    
    # Split data into train and validation sets
    train_data, val_data = train_test_split(training_data, test_size=0.1, random_state=42)
    
    # Create datasets
    train_dataset = ThirukkuralDataset(train_data, tokenizer, max_length=128)
    val_dataset = ThirukkuralDataset(val_data, tokenizer, max_length=128)
    
    # Modified training arguments to handle FP16 properly
    training_args = TrainingArguments(
        output_dir="./thirukkural_qa_model",
        num_train_epochs=5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        learning_rate=5e-5,
        fp16=False,  # Disabled FP16 training
        gradient_checkpointing=True,
        logging_steps=10,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        optim="adamw_torch",
        ddp_find_unused_parameters=False
    )
    
    return model, tokenizer, train_dataset, val_dataset, training_args

def main():
    # Clear GPU cache
    torch.cuda.empty_cache()
    
    # Load and convert data
    training_data = convert_csv_to_training_format('thirukkural.csv')
    
    # Setup training
    model, tokenizer, train_dataset, val_dataset, training_args = setup_training(training_data)
    
    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
    )
    
    # Train
    trainer.train()
    
    # Save model and tokenizer
    trainer.save_model("./final_thirukkural_model")
    tokenizer.save_pretrained("./final_thirukkural_model")


In [34]:
if __name__ == "__main__":
    # Train the model
    main()
    
    # Load model for inference
    model_path = "./final_thirukkural_model"
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    # Example question
    question = "What is Thirukkural 1?"
    answer = generate_answer(question, model, tokenizer)
    print(answer)



Epoch,Training Loss,Validation Loss
1,2.3542,0.547887
2,1.9253,0.489656
3,1.8779,0.457259
4,1.8209,0.428936


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


### Instruction: What is Thirukkural 1?
### Response: {"Number": "1" "கு��������������પ�ম�্ �����ཱྀ�ు ༮��೿൯��ँ༯� ��ക����������� لۮ� �� � ८�९�म� ا��� ڮ� ��� য� � ��ी ��� �� ��� �� �� ���ক ���ಾ�� �: ��� ��� ��ᯀ���������� ���ମ�་
######### Question: Explain the meaning of Thiruvananthapuram Thirtieth Instruction #1256
## Response "1056", "Kural": "<iframe src="http://www.youtube.com/watch?feature=player_id=1179�1311ॿ?lang=en࿪����&lang�=��


**XML ROBERTA**

In [58]:
def setup_training(training_data: List[Dict]):
    # Option 1: XLM-RoBERTa (good for multilingual tasks)
    model_name = "xlm-roberta-base"
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        low_cpu_mem_usage=True
    )

    class ThirukkuralDataset(torch.utils.data.Dataset):
        def __init__(self, data, tokenizer, max_length=128):
            self.data = data
            self.tokenizer = tokenizer
            self.max_length = max_length
        
        def __getitem__(self, idx):
            item = self.data[idx]
            text = (
                f"[Q] {item['instruction']}\n"
                f"[K] {item['response']['kural']}\n"
                f"[A] {item['response']['explanation']}\n"
                f"[C] {item['response']['adikaram_name']}\n"
                f"[D] {item['response']['mk']}"
            )
            
            encodings = self.tokenizer(
                text,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            
            return {
                'input_ids': encodings['input_ids'].squeeze(),
                'attention_mask': encodings['attention_mask'].squeeze(),
                'labels': encodings['input_ids'].squeeze()
            }
        
        def __len__(self):
            return len(self.data)

    class CustomDataCollator:
        def __init__(self, tokenizer):
            self.tokenizer = tokenizer

        def __call__(self, features):
            batch = {
                "input_ids": torch.stack([f["input_ids"] for f in features]),
                "attention_mask": torch.stack([f["attention_mask"] for f in features]),
                "labels": torch.stack([f["labels"] for f in features])
            }
            return batch

    # Split data and create datasets
    train_data, val_data = train_test_split(training_data, test_size=0.1, random_state=42)
    train_dataset = ThirukkuralDataset(training_data, tokenizer, max_length=128)
    val_dataset = ThirukkuralDataset(val_data, tokenizer, max_length=128)
    
    training_args = TrainingArguments(
        output_dir="./kural_qa_model",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=2,
        warmup_steps=100,
        learning_rate=3e-5,
        fp16=True,
        gradient_checkpointing=True,
        logging_steps=100,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        optim="adamw_torch",
    )
    
    return model, tokenizer, train_dataset, val_dataset, training_args, CustomDataCollator(tokenizer)

    trainer.train()
    trainer.save_model("./xml_thirukkural_model")
    tokenizer.save_pretrained("./xml_thirukkural_model")

In [60]:
if __name__ == "__main__":
    # Train the model
    main()
    
    # Load model for inference
    model_path = "./teamspace/studios/this_studio/030125/thirukkural_qa_model/checkpoint-1248"
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    # Example question
    question = "What is Thirukkural 1?"
    answer = generate_answer(question, model, tokenizer)
    print(answer)

If you want to use `XLMRobertaLMHeadModel` as a standalone, add `is_decoder=True.`


Epoch,Training Loss,Validation Loss
1,0.0254,0.000508
2,0.0062,4.3e-05
3,0.0036,2e-05


There were missing keys in the checkpoint model loaded: ['lm_head.decoder.weight', 'lm_head.decoder.bias'].


OSError: Incorrect path_or_model_id: './teamspace/studios/this_studio/030125/thirukkural_qa_model/checkpoint-1248'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [64]:
training_data = convert_csv_to_training_format('thirukkural.csv')


In [70]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from torch.utils.data import Dataset, DataLoader

def setup_training(training_data: List[Dict]):
    # Using BERT-based model fine-tuned for question answering
    model_name = "bert-base-multilingual-cased"  # Good for Tamil language support
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    
    class ThirukkuralQADataset(Dataset):
        def __init__(self, data, tokenizer, max_length=384):
            self.data = data
            self.tokenizer = tokenizer
            self.max_length = max_length
        
        def __getitem__(self, idx):
            item = self.data[idx]
            
            # Format context to include kural and its details
            context = (
                f"{item['response']['kural']}\n"
                f"{item['response']['explanation']}\n"
                f"Chapter: {item['response']['adikaram_name']}\n"
                f"Details: {item['response']['mk']}"
            )
            
            question = item['instruction']
            
            # Tokenize inputs
            encodings = self.tokenizer(
                question,
                context,
                max_length=self.max_length,
                truncation=True,
                padding='max_length',
                return_tensors='pt'
            )
            
            
            answer_start = context.find(item['response']['explanation'])
            answer_end = answer_start + len(item['response']['explanation'])
            
            # Convert character positions to token positions
            tokens = self.tokenizer.encode(context)
            start_token = len(self.tokenizer.encode(context[:answer_start]))
            end_token = len(self.tokenizer.encode(context[:answer_end])) - 1
            
            return {
                'input_ids': encodings['input_ids'].squeeze(),
                'attention_mask': encodings['attention_mask'].squeeze(),
                'start_positions': torch.tensor(start_token, dtype=torch.long),
                'end_positions': torch.tensor(end_token, dtype=torch.long)
            }
        
        def __len__(self):
            return len(self.data)

    # Split data and create datasets
    train_data, val_data = train_test_split(training_data, test_size=0.1, random_state=42)
    train_dataset = ThirukkuralQADataset(train_data, tokenizer)
    val_dataset = ThirukkuralQADataset(val_data, tokenizer)
    
    training_args = TrainingArguments(
        output_dir="./thirukkural_qa_model",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        learning_rate=3e-5,
        fp16=True,
        gradient_checkpointing=True,
        logging_steps=50,
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
        load_best_model_at_end=True,
    )
    
    return model, tokenizer, train_dataset, val_dataset, training_args

def get_answer(question: str, context: str, model, tokenizer):
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        max_length=384,
        truncation=True,
        padding='max_length'
    )
    
    outputs = model(**inputs)
    
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits)
    
    answer = tokenizer.decode(inputs['input_ids'][0][answer_start:answer_end+1])
    return answer

In [71]:
# Training
model, tokenizer, train_dataset, val_dataset, training_args = setup_training(training_data)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_data,
    eval_dataset=val_dataset
)
trainer.train()

# Inference
question = "What is the meaning of Thirukkural 1?"
context = "அகர முதல எழுத்தெல்லாம் ஆதி பகவன் முதற்றே உலகு"
answer = get_answer(question, context, model, tokenizer)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




ValueError: The batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: input_ids,attention_mask,token_type_ids,position_ids,head_mask,inputs_embeds,start_positions,end_positions,output_attentions,output_hidden_states,return_dict,start_positions,label_ids,label,end_positions.

In [76]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

def setup_training(training_data: List[Dict]):
    model_name = "bert-base-multilingual-cased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    
    class ThirukkuralQADataset(Dataset):
        def __init__(self, data, tokenizer, max_length=384):
            self.data = data
            self.tokenizer = tokenizer
            self.max_length = max_length
            self.processed_data = self._preprocess_data()
        
        def _preprocess_data(self):
            processed = []
            for item in self.data:
                # Create context from Thirukkural and its explanation
                context = (
                    f"{item['response']['kural']} "
                    f"{item['response']['explanation']} "
                    f"{item['response']['adikaram_name']} "
                    f"{item['response']['mk']}"
                )
                
                question = item['instruction']
                answer = item['response']['explanation']  
                
                answer_start = context.find(answer)
                if answer_start == -1:  
                    continue
                
                # Tokenize
                encoded = self.tokenizer(
                    question,
                    context,
                    max_length=self.max_length,
                    truncation=True,
                    stride=128,
                    return_overflowing_tokens=True,
                    return_offsets_mapping=True,
                    padding='max_length',
                    return_tensors='pt'
                )
                
                offset_mapping = encoded.pop('offset_mapping')[0]
                
                
                start_token = None
                end_token = None
                
                for idx, (start, end) in enumerate(offset_mapping):
                    if start <= answer_start < end:
                        start_token = idx
                    if start < answer_start + len(answer) <= end:
                        end_token = idx
                        break
                
                if start_token is not None and end_token is not None:
                    processed.append({
                        'input_ids': encoded['input_ids'][0],
                        'attention_mask': encoded['attention_mask'][0],
                        'start_positions': torch.tensor(start_token),
                        'end_positions': torch.tensor(end_token)
                    })
            
            return processed
        
        def __getitem__(self, idx):
            return self.processed_data[idx]
        
        def __len__(self):
            return len(self.processed_data)

    
    train_data, val_data = train_test_split(training_data, test_size=0.1, random_state=42)
    train_dataset = ThirukkuralQADataset(training_data, tokenizer)
    val_dataset = ThirukkuralQADataset(val_data, tokenizer)
    
    training_args = TrainingArguments(
        output_dir="./thirukkural_qa_model",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        learning_rate=5e-5,
        fp16=False,
        logging_steps=50,
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
        load_best_model_at_end=True,
    )
    
    return model, tokenizer, train_dataset, val_dataset, training_args

def main():
    training_data = convert_csv_to_training_format('thirukkural.csv')
    model, tokenizer, train_dataset, val_dataset, training_args = setup_training(training_data)
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )
    
    trainer.train()
    trainer.save_model("./final_thirukkural_qa_model")
    tokenizer.save_pretrained("./final_thirukkural_qa_model")

In [77]:
if __name__ == "__main__":
    # Train the model
    main()
    
    # Load model for inference
    model_path = "./final_thirukkural_qa_model"
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    # Example question
    question = "What is Thirukkural 1?"
    answer = generate_answer(question, model, tokenizer)
    print(answer)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
100,2.7457,0.532353
200,0.0051,0.000226
300,0.0056,0.003053
400,0.0008,6.6e-05
500,0.0006,4.7e-05
600,0.0001,2.6e-05
700,0.0009,0.000753
800,0.0001,2.5e-05
900,0.0083,0.000527
1000,0.0001,2.3e-05


If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of BertLMHeadModel were not initialized from the model checkpoint at ./final_thirukkural_qa_model and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[ QUESTION ] What is Thirukkural 1? [ ANSWER ] Dem 癬TC compressionTCdromMEME CSSWh DVB ★ DVB gravimose SomME ■Worksdden DVBdrom MPOжит봄MEzomzomcelleME DVBWhMEcellecelleWorks DVB Steam ★Wh deem DVBzomitetdromdromzomdromzimdrom gravizom ang DVBME TERzomMEmosezom siderzomymezomgioszom MPO 툴 graviME angdromomodrom 툴dromgiosmtdrom angMEkam MPO DVB fertilezomsolaME essacellemosekur deemzomffincellezommosecelleymeomeMEWhmose शीतzom αυτάME unincorporatedmose 3iyme anggiosMEalt gravidromжитzom importanzazomvdzom DVB ang ■ DVBddenzimzom siezom varieMEallzom graviallMEderME LLWh fertileyme DVB DVB Som DVBWorks gravi DVBiterymedrommosedrom Selimzom modulzomiterVEcelle DVBmoseWhWhzom 3idden 3izomitatkurMEdromcelle angzomzimmose ang ETMEkurzomkamMEitervd ■ymeddenME servit graviyme शीतжит illzom mamzom 툴 diagramMEsoladromymME graviitetymemosemoseMEgiosdrom 3imoseomedromiter graviossenymeMEomezom LLME modulMEयातWhffenvdcelledrom sieossenME kamugiosvd DVBvd البعضzomddendrom herME her DVB CSSzom शीतosse

In [91]:
def setup_training(training_data: List[Dict]):
    model_name = "deepset/roberta-base-squad2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    
    class ThirukkuralQADataset(Dataset):
        def __init__(self, data, tokenizer, max_length=384):
            self.examples = []
            
            for item in data:
                try:
                    
                    context = (
                        f"{item['response']['kural']} "
                        f"{item['response']['explanation']}"
                    )
                    
                    
                    answer = item['response']['explanation'].strip()
                    question = item['instruction'].strip()
                    
                    
                    if not question or not answer:
                        continue
                    
                    # Tokenize
                    encoding = tokenizer(
                        question,
                        context,
                        max_length=max_length,
                        truncation='only_second',
                        padding='max_length',
                        return_tensors='pt',
                        return_offsets_mapping=True
                    )
                    
                    
                    answer_start = context.lower().find(answer.lower())
                    
                    if answer_start != -1:
                        
                        offset_mapping = encoding.pop('offset_mapping')[0]
                        
                        
                        start_token = None
                        end_token = None
                        
                        for idx, (start, end) in enumerate(offset_mapping):
                            if start <= answer_start < end:
                                start_token = idx
                            if start < answer_start + len(answer) <= end:
                                end_token = idx
                                break
                        
                        if start_token is not None and end_token is not None:
                            self.examples.append({
                                'input_ids': encoding['input_ids'][0],
                                'attention_mask': encoding['attention_mask'][0],
                                'start_positions': torch.tensor(start_token),
                                'end_positions': torch.tensor(end_token)
                            })
                
                except Exception as e:
                    print(f"Error processing item: {e}")
                    continue
            
            print(f"Created dataset with {len(self.examples)} examples")
        
        def __getitem__(self, idx):
            return self.examples[idx]
        
        def __len__(self):
            return len(self.examples)

    
    print(f"Sample training data item: {training_data[0]}")
    
    
    train_data, val_data = train_test_split(training_data, test_size=0.1, random_state=42)
    
    print(f"Training data size: {len(train_data)}")
    print(f"Validation data size: {len(val_data)}")
    
    train_dataset = ThirukkuralQADataset(training_data, tokenizer)
    val_dataset = ThirukkuralQADataset(val_data, tokenizer)
    
    training_args = TrainingArguments(
        output_dir="./roberta_qa_model",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_ratio=0.1,
        learning_rate=5e-5,
        fp16=False,
        logging_steps=100,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        report_to="none"
    )
    
    return model, tokenizer, train_dataset, val_dataset, training_args

def main():
    # Load and print sample of training data
    training_data = convert_csv_to_training_format('thirukkural.csv')
    print(f"Total data items: {len(training_data)}")
    
    model, tokenizer, train_dataset, val_dataset, training_args = setup_training(training_data)
    
    if len(train_dataset) == 0:
        print("Dataset processing failed. Checking data format...")
        print(f"Sample data structure: {training_data[0] if training_data else 'No data'}")
        raise ValueError("Training dataset is empty. Check data preprocessing.")
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )
    
    
    trainer.train()
    trainer.save_model("./roberta_qa_model")
    tokenizer.save_pretrained("./roberta_qa_model")

In [86]:
if __name__ == "__main__":
    # Train the model
    main()
    
    # Load model for inference
    model_path = "./roberta_qa_model"
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    # Example question
    question = "What is Thirukkural 1?"
    answer = generate_answer(question, model, tokenizer)
    print(answer)

Total data items: 6650
Sample training data item: {'instruction': 'What is Thirukkural 1?', 'response': {'Number': '1', 'kural': 'அகர முதல எழுத்தெல்லாம் ஆதி பகவன் முதற்றே உலகு.', 'mk': 'அகரம் எழுத்துக்களுக்கு முதன்மை; ஆதிபகவன், உலகில் வாழும் உயிர்களுக்கு முதன்மை', 'explanation': 'As the letter A is the first of all letters, so the eternal God is first in the world', 'adikaram_name': 'கடவுள் வாழ்த்து', 'iyal_name': 'பாயிரவியல்', 'paul_translation': 'Virtue'}}
Training data size: 5985
Validation data size: 665
Created dataset with 5985 examples
Created dataset with 665 examples




Epoch,Training Loss,Validation Loss
1,0.0141,0.002845
2,0.0039,0.00053
3,0.0004,5e-06


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of RobertaForCausalLM were not initialized from the model checkpoint at ./roberta_qa_model and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vre boostersarovnaire Burnett Burnettogle Cobbnaireogle BALL Burnettnaire Patel tob Hoover electorsbanksBALL ZuckerbergnaireBALL Guerreronaireachusnaire BALLogle Gardnernaire Gardner gluc Burnett WelchAfeeNumbersnaire IDsogleBALL boosters boosters gluc Welch Cobb Cobb KR KRBALL Patelnaire blown IDsbanks BALL Patel IDsAfee BALLBloodnaire boosters IDs IDs messenger CobbBoost blown Gardner IDs methamphetamine IDs Gardner CobbVOL Cobb IDs EEGogle IDs gluc BALLCOL BALL gluc Cobb tacos conjectureogle radius boostersogle boosters EEG IDs nervesAfeearov radius IDsarov BALLQuestionsyu radius messenger IDsabis Cobb Burnettabisemakeremaker BALL boosters Burnettemaker radius EEG Burnett Mayer KernarovBALL radius buffalo Burnett BALLVOLabis IDs Burnett Cobbabis BALL Jindal BurnettBALL BALL gunshots boosters BALL Cobb buffalo Gardner Burnettarov Gardner boosters GardnerAfeeabisabis radiusabisachusachus Gardner Gardnerewski egobanksabis messenger boostersabis Falcon Cobb BALL formulationsogleabis buf

In [96]:


from transformers import EvalPrediction
import numpy as np
import evaluate
import torch

def compute_metrics(eval_pred: EvalPrediction):
    metric = evaluate("squad")
    
    predictions, labels = eval_pred
    
    
    start_logits, end_logits = predictions
    
    
    predicted_answers = []
    for start_logit, end_logit in zip(start_logits, end_logits):
        start_idx = np.argmax(start_logit)
        end_idx = np.argmax(end_logit)
        
        
        if end_idx >= start_idx:
            predicted_answers.append({
                'prediction_text': '',  
                'start_index': start_idx,
                'end_index': end_idx
            })
        else:
            predicted_answers.append({
                'prediction_text': '',
                'start_index': start_idx,
                'end_index': start_idx
            })
    
    
    references = [
        {'answers': {'answer_start': [label[0]], 'text': ['']}  
        } for label in labels
    ]
    
    return metric.compute(predictions=predicted_answers, references=references)

class QATrainerWithMetrics(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.metric = evaluate.load("squad")
    
    def evaluate(self, eval_dataset=None, ignore_keys=None):
        eval_dataloader = self.get_eval_dataloader(eval_dataset)
        
        total_exact_match = 0
        total_f1 = 0
        total_samples = 0
        
        for batch in eval_dataloader:
            with torch.no_grad():
                outputs = self.model(**batch)
            
            start_logits = outputs.start_logits.cpu().numpy()
            end_logits = outputs.end_logits.cpu().numpy()
            
            for i in range(len(start_logits)):
                pred_start = np.argmax(start_logits[i])
                pred_end = np.argmax(end_logits[i])
                
                true_start = batch['start_positions'][i].item()
                true_end = batch['end_positions'][i].item()
                
                # Calculate exact match
                exact_match = (pred_start == true_start) and (pred_end == true_end)
                total_exact_match += int(exact_match)
                
                # Calculate F1 score
                pred_tokens = set(range(pred_start, pred_end + 1))
                true_tokens = set(range(true_start, true_end + 1))
                
                common_tokens = pred_tokens.intersection(true_tokens)
                if len(pred_tokens) == 0 or len(true_tokens) == 0:
                    f1 = 0
                else:
                    precision = len(common_tokens) / len(pred_tokens)
                    recall = len(common_tokens) / len(true_tokens)
                    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
                
                total_f1 += f1
                total_samples += 1
        
        metrics = {
            'exact_match': total_exact_match / total_samples * 100,
            'f1': total_f1 / total_samples * 100
        }
        
        return metrics

# main function with metrics
def main():
    training_data = convert_csv_to_training_format('thirukkural.csv')
    model, tokenizer, train_dataset, val_dataset, training_args = setup_training(training_data)
    
    trainer = QATrainerWithMetrics(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )
    
    # Train and evaluate
    trainer.train()
    
    #evaluation metrics
    eval_metrics = trainer.evaluate()
    
    print("\nEvaluation Metrics:")
    print(f"Exact Match: {eval_metrics['exact_match']:.2f}%")
    print(f"F1 Score: {eval_metrics['f1']:.2f}%")
    
    # Save model and metrics
    trainer.save_model("./final_thirukkural_qa_model")
    tokenizer.save_pretrained("./final_thirukkural_qa_model")
    
    # Save metrics to file
    with open("./final_thirukkural_qa_model/metrics.txt", "w") as f:
        f.write(f"Exact Match: {eval_metrics['exact_match']:.2f}%\n")
        f.write(f"F1 Score: {eval_metrics['f1']:.2f}%\n")

In [100]:
def compute_metrics(eval_pred: EvalPrediction):
    metric = evaluate.load("squad")
    predictions, labels = eval_pred

    
    start_logits, end_logits = predictions

    
    predicted_answers = []
    for i in range(len(start_logits)):
        start_idx = np.argmax(start_logits[i])
        end_idx = np.argmax(end_logits[i])
        predicted_answers.append({
            "id": str(i),  
            "prediction_text": "",  
        })

    # Create references
    references = []
    for i, label in enumerate(labels):
        references.append({
            "id": str(i), 
            "answers": {
                "text": [""],  
                "answer_start": [0], 
            }
        })

    # Ensure predictions and references have the same length
    assert len(predicted_answers) == len(references), \
        f"Number of predictions ({len(predicted_answers)}) and references ({len(references)}) must match!"

    return metric.compute(predictions=predicted_answers, references=references)


In [101]:
if __name__ == "__main__":
    main()

Sample training data item: {'instruction': 'What is Thirukkural 1?', 'response': {'Number': '1', 'kural': 'அகர முதல எழுத்தெல்லாம் ஆதி பகவன் முதற்றே உலகு.', 'mk': 'அகரம் எழுத்துக்களுக்கு முதன்மை; ஆதிபகவன், உலகில் வாழும் உயிர்களுக்கு முதன்மை', 'explanation': 'As the letter A is the first of all letters, so the eternal God is first in the world', 'adikaram_name': 'கடவுள் வாழ்த்து', 'iyal_name': 'பாயிரவியல்', 'paul_translation': 'Virtue'}}
Training data size: 5985
Validation data size: 665
Created dataset with 6650 examples
Created dataset with 665 examples




Epoch,Training Loss,Validation Loss


AssertionError: Number of predictions (665) and references (2) must match!