In [None]:
# pip install datasets transformers torch
!pip install sentence-transformers

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments
import torch
import os
import torch
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForQuestionAnswering, 
    TrainingArguments, 
    Trainer
)

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer


**Data Loader**

In [None]:
def _row_to_record(row):
    """Convert one CSV row into a SQuAD-style record dictionary."""
    raw_answer = row['answer']
    # An empty answer cell is read by pandas as NaN; normalise it to "" so the
    # record always carries a string.
    answer_text = "" if pd.isna(raw_answer) else str(raw_answer)

    answer_start = -1
    if not row['is_impossible'] and answer_text:
        answer_start = row['context'].find(answer_text)
    # str.find returns -1 when the answer is not a substring of the context.
    # Such a row cannot provide a valid span, so it gets empty answer lists
    # instead of a bogus -1 offset.
    has_span = answer_start >= 0

    return {
        "id": row['id'],
        "title": row['title'],
        "context": row['context'],
        "question": row['question'],
        "answers": {
            "text": [answer_text] if has_span else [],
            "answer_start": [answer_start] if has_span else []
        },
        "is_impossible": row['is_impossible'],
        "original_answer": answer_text
    }


def load_and_split_data(
    input_csv, 
    train_ratio=0.7, 
    test_ratio=0.15, 
    random_seed=42
):
    """Load a QA CSV, shuffle it, and split it into train/validation/test sets.

    The CSV must contain the columns read here: ``id``, ``title``,
    ``context``, ``question``, ``answer`` and ``is_impossible``.

    Args:
        input_csv: Path of the CSV file to read.
        train_ratio: Fraction of rows assigned to the training split.
        test_ratio: Fraction of rows assigned to the test split. The rows left
            over after the train and test splits form the validation split.
        random_seed: Seed for the row shuffle, so the split is reproducible.

    Returns:
        A ``DatasetDict`` with the keys ``"train"``, ``"validation"`` and
        ``"test"``. Each record holds ``id``, ``title``, ``context``,
        ``question``, ``answers`` (``text`` / ``answer_start`` lists, both
        empty when no answer span is available), ``is_impossible`` and
        ``original_answer``.

    Raises:
        ValueError: If a ratio is outside [0, 1] or the two ratios sum to
            more than 1.
    """
    if not (0 <= train_ratio <= 1 and 0 <= test_ratio <= 1):
        raise ValueError("train_ratio and test_ratio must each be within [0, 1]")
    # Small tolerance so that e.g. 0.9 + 0.1 is not rejected by float rounding.
    if train_ratio + test_ratio > 1 + 1e-9:
        raise ValueError("train_ratio + test_ratio must not exceed 1")

    df = pd.read_csv(input_csv)
    df = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

    total_samples = len(df)
    train_end = int(total_samples * train_ratio)
    # Size the test split from test_ratio itself and take it from the tail;
    # the validation split is whatever lies between train and test.
    test_size = int(total_samples * test_ratio)
    test_start = max(train_end, total_samples - test_size)

    train_df = df.iloc[:train_end]
    val_df = df.iloc[train_end:test_start]
    test_df = df.iloc[test_start:]

    def transform_subset(subset_df):
        return [
            _row_to_record(row)
            for _, row in tqdm(subset_df.iterrows(), total=len(subset_df),
                               desc="Transforming data")
        ]

    train_dataset = Dataset.from_pandas(pd.DataFrame(transform_subset(train_df)))
    val_dataset = Dataset.from_pandas(pd.DataFrame(transform_subset(val_df)))
    test_dataset = Dataset.from_pandas(pd.DataFrame(transform_subset(test_df)))

    return DatasetDict({
        "train": train_dataset, 
        "validation": val_dataset, 
        "test": test_dataset
    })


**Preprocessing**

In [None]:
from transformers import AutoTokenizer

def _find_answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Map a character span of the context onto token indices.

    Args:
        offsets: Per-token ``(char_start, char_end)`` pairs for one example.
        sequence_ids: Per-token sequence id for the same example; context
            tokens are the ones marked ``1``.
        start_char: Inclusive start of the answer within the context string.
        end_char: Exclusive end of the answer within the context string.

    Returns:
        ``(start_token, end_token)`` (both inclusive), or ``None`` when the
        context tokens do not fully cover the answer (e.g. it was truncated).
    """
    context_tokens = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]
    if not context_tokens:
        return None
    first, last = context_tokens[0], context_tokens[-1]
    if offsets[first][0] > start_char or offsets[last][1] < end_char:
        return None

    # Last context token that starts at or before the answer's first character.
    start_token = first
    while start_token < last and offsets[start_token + 1][0] <= start_char:
        start_token += 1
    # First context token that ends at or after the answer's last character.
    end_token = last
    while end_token > first and offsets[end_token - 1][1] >= end_char:
        end_token -= 1
    return start_token, end_token


def preprocess_function(tokenizer, examples):
    """Tokenize a batch of question/context pairs and attach answer labels.

    ``start_positions`` / ``end_positions`` are *token* indices into the
    tokenized input, which is what a question-answering head is trained on.
    The character offsets stored in ``answers["answer_start"]`` are converted
    through the tokenizer's offset mapping. Examples with no answer, a
    negative ``answer_start``, or an answer that falls outside the tokens kept
    after truncation are labelled ``(0, 0)``, i.e. the first token.

    Args:
        tokenizer: A tokenizer that supports ``return_offsets_mapping=True``
            and whose output exposes ``sequence_ids(i)``.
        examples: A batch (dict of lists) with ``question``, ``context`` and
            ``answers`` entries; each ``answers`` item has ``text`` and
            ``answer_start`` lists.

    Returns:
        The tokenizer output with ``start_positions`` and ``end_positions``
        added and ``offset_mapping`` removed.
    """
    tokenized = tokenizer(
        examples["question"], 
        examples["context"], 
        truncation=True, 
        padding="max_length", 
        max_length=512,
        return_offsets_mapping=True
    )
    # The offsets are only needed to build the labels; drop them from the
    # features handed on for training.
    offset_mapping = tokenized.pop("offset_mapping")

    start_positions = []
    end_positions = []
    for i, answers in enumerate(examples["answers"]):
        start_token, end_token = 0, 0
        has_answer = (
            answers["text"]
            and answers["answer_start"]
            and answers["answer_start"][0] >= 0
        )
        if has_answer:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            span = _find_answer_token_span(
                offset_mapping[i], tokenized.sequence_ids(i), start_char, end_char
            )
            if span is not None:
                start_token, end_token = span
        start_positions.append(start_token)
        end_positions.append(end_token)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions

    return tokenized

**Training**

In [None]:
import torch
import shutil
import os
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments, Trainer

def train_qa_model(
    data, 
    model_name="roberta-base", 
    num_epochs=50,
    learning_rate=2e-5,
    batch_size=32,
    model_dir="./fine_tuned_model",
    archive_base="/kaggle/working/fine_tuned_model"
):
    """Fine-tune a question-answering model, then save and zip it.

    Args:
        data: ``DatasetDict`` with ``"train"`` and ``"validation"`` splits
            whose columns are accepted by ``preprocess_function``.
        model_name: Checkpoint name passed to ``from_pretrained`` for both the
            tokenizer and the model.
        num_epochs: Number of training epochs.
        learning_rate: Learning rate passed to ``TrainingArguments``.
        batch_size: Per-device batch size for both training and evaluation.
        model_dir: Directory the fine-tuned model and tokenizer are saved to.
        archive_base: Path (without the ``.zip`` suffix) of the zip archive
            created from ``model_dir``.

    Returns:
        ``(trainer, tokenizer, model)``.
    """
    import inspect

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    processed_data = data.map(
        lambda batch: preprocess_function(tokenizer, batch), 
        batched=True, 
        remove_columns=data["train"].column_names
    )

    # Newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy`; pick whichever keyword the installed version accepts.
    args_params = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = "eval_strategy" if "eval_strategy" in args_params else "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,  
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        save_steps=500,
        save_total_limit=2,
        **{eval_key: "epoch"}
    )

    # Likewise, newer releases take the tokenizer as `processing_class`.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=processed_data["train"],
        eval_dataset=processed_data["validation"],
        **{tokenizer_key: tokenizer}
    )

    trainer.train()
    model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)

    # The model is already on disk at this point; a failure while zipping
    # (e.g. an unwritable archive location outside Kaggle) must not discard
    # the trained objects, so report it and still return them.
    try:
        zip_path = shutil.make_archive(archive_base, 'zip', model_dir)
        print(f"Model saved and zipped to {zip_path}")
    except OSError as exc:
        print(f"Model saved to {model_dir}, but zipping to {archive_base}.zip failed: {exc}")
    return trainer, tokenizer, model

In [None]:
# Location of the QA dataset CSV under the Kaggle input directory.
input_csv = "/kaggle/input/smallpashto/smallpashto.csv"
# Shuffle the rows and build the train/validation/test DatasetDict.
data = load_and_split_data(input_csv)
# Fine-tune with the default hyperparameters; `data`, `tokenizer` and `model`
# are reused by the evaluation cell further down.
trainer, tokenizer, model = train_qa_model(data)

**Evaluation**

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def evaluate_qa_model(
    model, 
    tokenizer, 
    test_dataset, 
    device=None
):
    """Score a QA model on a dataset by exact match and embedding similarity.

    For each example the most likely start and end tokens are taken
    independently (argmax of the start and end logits), the tokens between
    them are decoded as the predicted answer, and that text is compared with
    ``example['original_answer']``.

    Args:
        model: Question-answering model whose output exposes ``start_logits``
            and ``end_logits``.
        tokenizer: Tokenizer matching ``model``.
        test_dataset: Iterable of examples with ``question``, ``context`` and
            ``original_answer`` fields.
        device: Torch device to run on; defaults to CUDA when available,
            otherwise CPU.

    Returns:
        Dict with ``exact_match_rate`` (case-insensitive string equality),
        ``avg_cosine_similarity`` and ``cosine_sim_std`` (cosine similarity of
        sentence embeddings of prediction and reference).
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = model.to(device)
    model.eval()

    exact_match_scores = []
    cosine_sim_scores = []
    # NOTE(review): confirm this embedding model is meaningful for the
    # language of the dataset being evaluated.
    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

    with torch.no_grad():
        for example in tqdm(test_dataset, desc="Evaluating"):
            inputs = tokenizer(
                example['question'], 
                example['context'], 
                return_tensors='pt', 
                max_length=512, 
                truncation=True
            ).to(device)

            outputs = model(**inputs)
            start_idx = torch.argmax(outputs.start_logits).item()
            end_idx = torch.argmax(outputs.end_logits).item()

            # If end_idx < start_idx the slice is empty and the prediction is "".
            answer_ids = inputs['input_ids'][0][start_idx:end_idx + 1]
            # Skip special tokens so that a "no answer" prediction pointing at
            # the first token decodes to "" rather than the literal
            # special-token string, which could never match a reference.
            predicted_answer = tokenizer.decode(
                answer_ids, skip_special_tokens=True
            ).strip()

            ground_truth = example['original_answer']
            # A missing reference answer arrives as a non-string (e.g. None);
            # treat it as the empty answer instead of crashing on .lower().
            if not isinstance(ground_truth, str):
                ground_truth = ""

            exact_match_scores.append(int(predicted_answer.lower() == ground_truth.lower()))

            pred_emb = sentence_model.encode([predicted_answer])
            gt_emb = sentence_model.encode([ground_truth])
            cosine_sim_scores.append(cosine_similarity(pred_emb, gt_emb)[0][0])

    metrics = {
        "exact_match_rate": np.mean(exact_match_scores),
        "avg_cosine_similarity": np.mean(cosine_sim_scores),
        "cosine_sim_std": np.std(cosine_sim_scores)
    }
    return metrics


In [None]:
# Evaluate the fine-tuned model on the held-out test split.
test_metrics = evaluate_qa_model(model, tokenizer, data["test"])
# Print each metric on its own line as "name: value".
print("Evaluation Metrics:")
for key, value in test_metrics.items():
    print(f"{key}: {value}")