In [2]:
!pip install sentence-transformers

  pid, fd = os.forkpty()


Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.3.1


In [3]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments
import torch
import os
import torch
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForQuestionAnswering, 
    TrainingArguments, 
    Trainer
)

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [4]:
def load_and_split_data(
    input_csv, 
    train_ratio=0.7, 
    test_ratio=0.15, 
    random_seed=42
):
   
    df = pd.read_csv(input_csv)
    df = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)
    total_samples = len(df)
    train_end = int(total_samples * train_ratio)
    val_end = train_end + int(total_samples * test_ratio)
    
    train_df = df.iloc[:train_end]
    val_df = df.iloc[train_end:val_end]
    test_df = df.iloc[val_end:]
    
    def transform_subset(subset_df):
        records = []
        for _, row in tqdm(subset_df.iterrows(), total=len(subset_df), 
                            desc="Transforming data"):
            answer_start = row['context'].find(row['answer']) \
                if not row['is_impossible'] else -1
            
            record = {
                "id": row['id'],
                "title": row['title'],
                "context": row['context'],
                "question": row['question'],
                "answers": {
                    "text": [row['answer']] if not row['is_impossible'] else [],
                    "answer_start": [answer_start] if not row['is_impossible'] else []
                },
                "is_impossible": row['is_impossible'],
                "original_answer": row['answer']
            }
            records.append(record)
        return records
    
    train_records = transform_subset(train_df)
    val_records = transform_subset(val_df)
    test_records = transform_subset(test_df)
    
    train_dataset = Dataset.from_pandas(pd.DataFrame(train_records))
    val_dataset = Dataset.from_pandas(pd.DataFrame(val_records))
    test_dataset = Dataset.from_pandas(pd.DataFrame(test_records))
    
    return DatasetDict({
        "train": train_dataset, 
        "validation": val_dataset, 
        "test": test_dataset
    })

In [5]:
from transformers import AutoTokenizer

def preprocess_function(tokenizer, examples):
    
    tokenized = tokenizer(
        examples["question"], 
        examples["context"], 
        truncation=True, 
        padding="max_length", 
        max_length=512
    )
    
    start_positions = []
    end_positions = []
    for answers in examples["answers"]:
        if answers["text"]:
            start = answers["answer_start"][0]
            end = start + len(answers["text"][0])
        else:
            start = 0
            end = 0
        start_positions.append(start)
        end_positions.append(end)
    
    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    
    return tokenized

In [14]:
import torch
import shutil
import os
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments, Trainer

def train_qa_model(
    data, 
    model_name="roberta-base", 
    num_epochs=1,
    learning_rate=2e-5,
    batch_size=32
):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    processed_data = data.map(
        lambda x: preprocess_function(tokenizer, x), 
        batched=True, 
        remove_columns=data["train"].column_names
    )
    
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,  
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        save_steps=500,
        save_total_limit=2,
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=processed_data["train"],
        eval_dataset=processed_data["validation"],
        tokenizer=tokenizer,
    )
    
    trainer.train()
    model.save_pretrained("./fine_tuned_model")
    tokenizer.save_pretrained("./fine_tuned_model")
    zip_path = "/kaggle/working/fine_tuned_model.zip"
    shutil.make_archive("/kaggle/working/fine_tuned_model", 'zip', "./fine_tuned_model")
    print(f"Model saved and zipped to {zip_path}")
    return trainer, tokenizer, model

In [15]:
input_csv = "/kaggle/input/sindhi/SQuAD_Translated_Sindhi.csv"
data = load_and_split_data(input_csv)
trainer, tokenizer, model = train_qa_model(data)

Transforming data: 100%|██████████| 3500/3500 [00:00<00:00, 15580.95it/s]
Transforming data: 100%|██████████| 750/750 [00:00<00:00, 15055.87it/s]
Transforming data: 100%|██████████| 750/750 [00:00<00:00, 15144.15it/s]
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,3.0636,3.170593


Model saved and zipped to /kaggle/working/fine_tuned_model.zip


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def evaluate_qa_model(
    model, 
    tokenizer, 
    test_dataset, 
    device=None
):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model = model.to(device)
    model.eval()
    
    exact_match_scores = []
    cosine_sim_scores = []
    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
    
    with torch.no_grad():
        for example in tqdm(test_dataset, desc="Evaluating"):
            inputs = tokenizer(
                example['question'], 
                example['context'], 
                return_tensors='pt', 
                max_length=512, 
                truncation=True
            ).to(device)
            
            outputs = model(**inputs)
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits
            
            start_idx = torch.argmax(start_logits).item()
            end_idx = torch.argmax(end_logits).item()
            input_ids = inputs['input_ids'][0]
            predicted_answer_tokens = input_ids[start_idx:end_idx+1]
            predicted_answer = tokenizer.decode(predicted_answer_tokens).strip()
            
            ground_truth = example['original_answer']
            exact_match_scores.append(int(predicted_answer.lower() == ground_truth.lower()))
            
            pred_emb = sentence_model.encode([predicted_answer])
            gt_emb = sentence_model.encode([ground_truth])
            cosine_sim_scores.append(cosine_similarity(pred_emb, gt_emb)[0][0])
    
    metrics = {
        "exact_match_rate": np.mean(exact_match_scores),
        "avg_cosine_similarity": np.mean(cosine_sim_scores),
        "cosine_sim_std": np.std(cosine_sim_scores)
    }
    return metrics

In [None]:
test_metrics = evaluate_qa_model(model, tokenizer, data["test"])
print("Evaluation Metrics:")
for key, value in test_metrics.items():
    print(f"{key}: {value}")

In [21]:
print(data)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'is_impossible', 'original_answer'],
        num_rows: 3500
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'is_impossible', 'original_answer'],
        num_rows: 750
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'is_impossible', 'original_answer'],
        num_rows: 750
    })
})


In [22]:
from transformers import pipeline
import numpy as np

def display_results_qa_model( tokenizer, model, dataset, num_examples=5):
    """
    Evaluate the fine-tuned model on a few examples.
    """
    # Load fine-tuned model into a QA pipeline
    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

    # Select the first `num_examples` from the validation dataset
    examples = dataset["validation"].select(range(num_examples))
    
    print("Evaluating on 5 examples...\n")
    
    for i, example in enumerate(examples):
        context = example['context']
        question = example['question']
        actual_answer = example['original_answer']  # First answer in the list
        
        # Get the model's prediction
        prediction = qa_pipeline(question=question, context=context)
        predicted_answer = prediction['answer']
        
        # Print the details
        print(f"Example {i+1}:")
        print(f"Context: {context}\n")
        print(f"Question: {question}")
        print(f"Predicted Answer: {predicted_answer}")
        print(f"Actual Answer: {actual_answer}")
        print(f"Score: {prediction['score']:.4f}")
        print("-" * 80)

# Example usage
# trainer, tokenizer, model = train_qa_model(data)
display_results_qa_model( tokenizer, model, data, num_examples=5)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Evaluating on 5 examples...

Example 1:
Context: 1973ع جي چونڊن کان پوءِ سوزلينڊ جو آئين بادشاهه سوڀوزا II معطل ڪيو، جنهن بعد ۾ 1982ع ۾ پنهنجي وفات تائين فرمان ذريعي ملڪ تي حڪومت ڪئي. ان وقت سوڀوزا II 61 سالن تائين سوزيلينڊ تي حڪومت ڪئي، جنهن کيس تاريخ جو سڀ کان ڊگهو حڪمران بڻائي ڇڏيو. هن جي موت جي پٺيان هڪ ريجنسي آئي، جنهن ۾ ”راڻي ريجنٽ ڊيزيلي شونگوي“ 1984ع تائين رياست جي سربراهه رهي، جڏهن هوءَ ليڪوڪو طرفان هٽائي وئي ۽ ان جي جاءِ تي راڻي ماءُ نٽفومبي ٽفوالا مقرر ڪئي وئي. Mswati III، Ntfombi جو پٽ، 25 اپريل 1986 تي سوازيلينڊ جي بادشاهه ۽ انگونياما جي حيثيت سان تاج ڪيو ويو.

Question: سوزيلينڊ ۾ 61 سالن تائين ڪهڙي رينٽ حڪومت ڪئي؟
Predicted Answer: .
Actual Answer: راڻي ريجنٽ ڊيزيلي شونگوي
Score: 0.0000
--------------------------------------------------------------------------------
Example 2:
Context: اسلام ڏکڻ اوڀر ايشيا ۾ سڀ کان وڏي پيماني تي استعمال ٿيندڙ مذهب آهي، جنهن ۾ تقريبن 240 ملين پيروڪار آهن جيڪي سڄي آبادي جو 40 سيڪڙو تائين ترجمو ڪن ٿا، انڊونيشيا، برونائي، ملائيشيا ۽ ڏاکڻي فل