In [6]:
# pip install datasets transformers torch
!pip install sentence-transformers

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.3.1


**Libraries**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

Data = "/kaggle/input/cleaned-small-pashto/cleaned_SQuAD_Pashto.csv"
data = pd.read_csv(Data)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=40)
train_data.to_csv("/kaggle/working/train_data.csv", index=False)
test_data.to_csv("/kaggle/working/test_data.csv", index=False)
print("Dataset split complete. Train and test datasets saved.")


Dataset split complete. Train and test datasets saved.


In [23]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments
import torch
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForQuestionAnswering, 
    TrainingArguments, 
    Trainer
)

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import shutil


**Data Loader**

In [None]:
def load_and_split_data(
    input_csv, 
    train_ratio=0.7, 
    test_ratio=0.15, 
    random_seed=42
):
    df = pd.read_csv(input_csv)
    df = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)
    total_samples = len(df)
    train_end = int(total_samples * train_ratio)
    val_end = train_end + int(total_samples * test_ratio)
    
    train_df = df.iloc[:train_end]
    val_df = df.iloc[train_end:val_end]
    test_df = df.iloc[val_end:]
    
    def transform_subset(subset_df):
        records = []
        for _, row in tqdm(subset_df.iterrows(), total=len(subset_df), 
                            desc="Transforming data"):
            context = ' '.join(row['context']) if isinstance(row['context'], list) else row['context']
            
            answer_start = context.find(row['answer']) \
                if not row['is_impossible'] else -1
            
            record = {
                "id": row.get('id', ''),
                "title": row.get('title', ''),
                "context": context,
                "question": row['question'],
                "answer": row['answer'],
                "answer_start": answer_start,
                "is_impossible": row['is_impossible']
            }
            records.append(record)
        return records
    
    train_records = transform_subset(train_df)
    val_records = transform_subset(val_df)
    test_records = transform_subset(test_df)
    
    train_dataset = Dataset.from_pandas(pd.DataFrame(train_records))
    val_dataset = Dataset.from_pandas(pd.DataFrame(val_records))
    test_dataset = Dataset.from_pandas(pd.DataFrame(test_records))
    
    return DatasetDict({
        "train": train_dataset, 
        "validation": val_dataset, 
        "test": test_dataset
    })

**Pre Processing**

In [None]:
def preprocess_function(tokenizer, examples, max_length=384, stride=128):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answer"]
    answer_starts = examples["answer_start"]
    start_positions = []
    end_positions = []
    
    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer_starts[sample_idx]
        end_char = answer_starts[sample_idx] + len(answer)
        sequence_ids = inputs.sequence_ids(i)
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1
        
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)
            
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)
    
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    
    return inputs

**Train**

In [2]:
def train_qa_model(
    data, 
    model_name="roberta-base", 
    num_epochs=8,
    learning_rate=2e-5,
    batch_size=32
):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    processed_data = data.map(
        lambda x: preprocess_function(tokenizer, x), 
        batched=True, 
        remove_columns=data["train"].column_names
    )
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,  
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        save_steps=500,
        save_total_limit=2,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=processed_data["train"],
        eval_dataset=processed_data["validation"],
        tokenizer=tokenizer,
    )
    trainer.train()
    model.save_pretrained("./fine_tuned_model")
    tokenizer.save_pretrained("./fine_tuned_model")
    try:
        shutil.make_archive("./fine_tuned_model_archive", 'zip', "./fine_tuned_model")
        print(f"Model saved and zipped to ./fine_tuned_model_archive.zip")
    except Exception as e:
        print(f"Error creating zip archive: {e}")
    
    return trainer, tokenizer, model


Transforming data: 100%|██████████| 142/142 [00:00<00:00, 11153.81it/s]
Transforming data: 100%|██████████| 30/30 [00:00<00:00, 7260.77it/s]
Transforming data: 100%|██████████| 32/32 [00:00<00:00, 8540.74it/s]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/142 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,4.2064,2.315806
2,1.2592,1.874241
3,0.9969,1.541009
4,0.9113,1.170716
5,0.7823,0.918929
6,0.6384,0.809763
7,0.5162,0.829848
8,0.5123,0.795238


Model saved and zipped to ./fine_tuned_model_archive.zip


In [None]:
input_csv = "/kaggle/working/train_data.csv"
dataset = load_and_split_data(input_csv)
trainer, tokenizer, model = train_qa_model(dataset)

**Inference**

In [64]:
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

def calculate_exact_match(predicted_answer, actual_answer):
    if isinstance(predicted_answer, list):
        predicted_answer = " ".join(predicted_answer)  
    if isinstance(actual_answer, list):
        actual_answer = " ".join(actual_answer)  
    return 1 if predicted_answer.strip().lower() == actual_answer.strip().lower() else 0

def calculate_f1_score(predicted_answer, actual_answer):
    if isinstance(predicted_answer, list):
        predicted_answer = " ".join(predicted_answer)
    if isinstance(actual_answer, list):
        actual_answer = " ".join(actual_answer)
    
    pred_tokens = set(predicted_answer.strip().lower().split())
    actual_tokens = set(actual_answer.strip().lower().split())
    precision = len(pred_tokens & actual_tokens) / len(pred_tokens) if pred_tokens else 0
    recall = len(pred_tokens & actual_tokens) / len(actual_tokens) if actual_tokens else 0
    return 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

def calculate_cosine_similarity(predicted_answer, actual_answer):
    if isinstance(predicted_answer, list):
        predicted_answer = " ".join(predicted_answer)
    if isinstance(actual_answer, list):
        actual_answer = " ".join(actual_answer)
    
    vectorizer = CountVectorizer().fit_transform([predicted_answer, actual_answer])
    cos_sim = cosine_similarity(vectorizer[0:1], vectorizer[1:2])
    return cos_sim[0][0]

def display_results_qa_model(tokenizer, model, dataset, num_examples=5):
    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
    examples = dataset["validation"].select(range(num_examples))
    
    total_exact_match = 0
    total_f1_score = 0
    total_cosine_sim = 0
    
    print("Evaluating on 5 examples...\n")
    
    for i, example in enumerate(examples):
        context = example['context']
        question = example['question']
        actual_answer = example['answer'] 
        prediction = qa_pipeline(question=question, context=context)
        predicted_answer = prediction['answer']
        predicted_answer = predicted_answer.replace("••", "")

        exact_match = calculate_exact_match(predicted_answer, actual_answer)
        f1_score = calculate_f1_score(predicted_answer, actual_answer)
        cosine_sim = calculate_cosine_similarity(predicted_answer, actual_answer)
        
        total_exact_match += exact_match
        total_f1_score += f1_score
        total_cosine_sim += cosine_sim
        
        print(f"Example {i+1}:")
        print(f"Context: {context}\n")
        print(f"Question: {question}")
        print(f"Predicted Answer: {predicted_answer}")
        print(f"Actual Answer: {actual_answer}")
        print(f"Score: {prediction['score']:.4f}")
        print(f"Exact Match: {exact_match}")
        print(f"F1 Score: {f1_score:.4f}")
        print(f"Cosine Similarity: {cosine_sim:.4f}")
        print("-" * 80)

    avg_exact_match = total_exact_match / num_examples
    avg_f1_score = total_f1_score / num_examples
    avg_cosine_sim = total_cosine_sim / num_examples
    
    print("Overall Evaluation Results:")
    print(f"Average Exact Match: {avg_exact_match:.4f}")
    print(f"Average F1 Score: {avg_f1_score:.4f}")
    print(f"Average Cosine Similarity: {avg_cosine_sim:.4f}")



Transforming data: 100%|██████████| 142/142 [00:00<00:00, 6837.70it/s]
Transforming data: 100%|██████████| 30/30 [00:00<00:00, 6420.51it/s]
Transforming data: 100%|██████████| 32/32 [00:00<00:00, 6681.82it/s]
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Evaluating on 5 examples...

Example 1:
Context: ['د نیویارک د لویو اوسیدونکو ولسوالیو ځانګړتیا اکثرا د ښکلي ••نسوري سټون قطارونو•• او ښارګوټو او شګو ټاټوبو لخوا تعریف شوي چې د 1870 څخه تر 1930 پورې د ګړندۍ پراختیا په جریان کې رامینځته شوي. په مقابل کې ، د نیویارک ښار هم داسې ګاونډیان لري چې لږ کثافت لري. نفوس لرونکی او وړیا استوګنځایونه. په ګاونډیو کې لکه ریورډیل (برونکس کې) ، ډیټماس پارک (په بروکلین کې) ، او ډګلاسټن (کوینز کې) ، لوی واحد کورنۍ کورونه په مختلف معماري سټایلونو کې عام دي لکه د ټوډور بیا ژوندی کول او ویکټورین.']

Question: کوم ډول د کور جوړښت د NYC ډیری لوی استوګنې ولسوالۍ جوړوي؟
Predicted Answer: نسوري
Actual Answer: ['د نسواري ډبرو قطارونه']
Score: 0.0004
Exact Match: 0
F1 Score: 0.0000
Cosine Similarity: 0.0000
--------------------------------------------------------------------------------
Example 2:
Context: ['د لومړنیو متنونو شواهد ښیي چې سدھارتا ګوتم په یوه ټولنه کې زیږیدلی و چې په جغرافیایي او کلتوري لحاظ د هند شمال ختیځ نیمه وچه کې په پنځمه پیړۍ کې زیږیدلی و. دا

In [None]:
model_path = "./fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
input_csv2 = "/kaggle/working/test_data.csv"
dataset = load_and_split_data(input_csv2)  
display_results_qa_model(tokenizer, model, dataset, num_examples=5)