In [10]:
pip install Levenshtein

  pid, fd = os.forkpty()


Collecting Levenshtein
  Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein)
  Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.26.1 rapidfuzz-3.11.0
Note: you may need to restart the kernel to use updated packages.


English Evaluation


In [56]:
import pandas as pd
import re
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from Levenshtein import ratio as levenshtein_similarity
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

def clean_text(text):
    cleaned_text = re.sub(r'[^\x00-\x7F]+', ' ', text)  
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  
    cleaned_text = cleaned_text.strip() 
    return cleaned_text

data_path = '/kaggle/input/squad-english/perturbed_dataset_english.csv'
data = pd.read_csv(data_path, encoding='utf-8').dropna()

data['original_context'] = data['original_context'].apply(clean_text)
data['original_question'] = data['original_question'].apply(clean_text)
data['answer'] = data['answer'].apply(clean_text)

data['altered_context'] = data['altered_context'].apply(clean_text)
data['altered_question'] = data['altered_question'].apply(clean_text)

original_data = data[['original_context', 'original_question', 'answer']]
altered_data = data[['altered_context', 'altered_question', 'answer']]

model_name = "deepset/xlm-roberta-base-squad2"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name).to("cuda")

qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=0)

def evaluate_qa(dataset, context_column, question_column, answer_column):
    results = []
    correct_predictions = 0
    total_questions = len(dataset)
    total_f1 = 0.0
    total_levenshtein = 0.0

    f1_scores = []
    levenshtein_scores = []
    
    true_answers = []
    predicted_answers = []

    for idx, row in dataset.iterrows():
        context = row[context_column]
        question = row[question_column]
        true_answer = row[answer_column]
        
        prediction = qa_pipeline({"context": context, "question": question})
        predicted_answer = prediction.get("answer", "")

        if true_answer.lower().strip() == predicted_answer.lower().strip():
            correct_predictions += 1

        true_answers.append(true_answer.lower().strip())
        predicted_answers.append(predicted_answer.lower().strip())

        levenshtein_sim = levenshtein_similarity(predicted_answer, true_answer)
        total_levenshtein += levenshtein_sim
        levenshtein_scores.append(levenshtein_sim)

        results.append({
            "context": context,
            "question": question,
            "true_answer": true_answer,
            "predicted_answer": predicted_answer,
            "levenshtein_similarity": levenshtein_sim
        })

    f1 = f1_score(true_answers, predicted_answers, average='weighted')

    accuracy = correct_predictions / total_questions if total_questions > 0 else 0
    average_levenshtein = total_levenshtein / total_questions if total_questions > 0 else 0

    print(f"Accuracy: {accuracy * 100:.2f}% ({correct_predictions}/{total_questions})")
    print(f"Average F1 Score (Sklearn): {f1:.2f}")
    print(f"Average Levenshtein Similarity: {average_levenshtein:.2f}")
    
    return f1_scores, levenshtein_scores, results

print("Evaluating original dataset...")
original_f1_scores, original_levenshtein_scores, original_results = evaluate_qa(original_data, 'original_context', 'original_question', 'answer')

print("Evaluating altered dataset...")
altered_f1_scores, altered_levenshtein_scores, altered_results = evaluate_qa(altered_data, 'altered_context', 'altered_question', 'answer')

original_results_df = pd.DataFrame(original_results)
altered_results_df = pd.DataFrame(altered_results)

original_results_df.to_csv("original_results.csv", index=False, encoding='utf-8')
altered_results_df.to_csv("altered_results.csv", index=False, encoding='utf-8')

print("Evaluation complete. Results saved to original_results.csv and altered_results.csv.")


Evaluating original dataset...
Accuracy: 68.12% (688/1010)
Average F1 Score (Sklearn): 0.68
Average Levenshtein Similarity: 0.82
Evaluating altered dataset...
Accuracy: 46.83% (473/1010)
Average F1 Score (Sklearn): 0.47
Average Levenshtein Similarity: 0.72
Evaluation complete. Results saved to original_results.csv and altered_results.csv.


Urdu Evaluation

In [57]:

data_path = '/kaggle/input/uqa-urdu/pertubated_urdu_translated.csv'
data = pd.read_csv(data_path).dropna()

altered_data = data.iloc[:, [2, 3, 4]]  

def clean_text(text):
    cleaned_text = re.sub(r'[^\u0600-\u06FFa-zA-Z0-9\s,؟۔!"\'()-]', '', text)
    return cleaned_text.strip()

altered_data.iloc[:, 0] = altered_data.iloc[:, 0].apply(clean_text)  
altered_data.iloc[:, 1] = altered_data.iloc[:, 1].apply(clean_text)  
altered_data.iloc[:, 2] = altered_data.iloc[:, 2].apply(clean_text)  

model_name = "uqa/xlm-roberta-base-UQA-1.0" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name).to("cuda")

qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=0)

urdu_smallset_path = '/kaggle/input/urdusmall/urdu_smallset.csv'
urdu_smallset_data = pd.read_csv(urdu_smallset_path).dropna()

original_data = urdu_smallset_data.iloc[:, [2, 3, 5]]  

original_data.iloc[:, 0] = original_data.iloc[:, 0].apply(clean_text)  
original_data.iloc[:, 1] = original_data.iloc[:, 1].apply(clean_text)  
original_data.iloc[:, 2] = original_data.iloc[:, 2]  

print("Evaluating original dataset...")
original_f1_scores, original_levenshtein_scores, original_results = evaluate_qa(original_data, 0, 1, 2)

print("Evaluating altered dataset...")
altered_f1_scores, altered_levenshtein_scores, altered_results = evaluate_qa(altered_data, 0,1, 2)

original_results_df = pd.DataFrame(original_results)
altered_results_df = pd.DataFrame(altered_results)

original_results_df.to_csv("original_results.csv", index=False, encoding='utf-8')
altered_results_df.to_csv("altered_results.csv", index=False, encoding='utf-8')



Evaluating original dataset...
Accuracy: 71.74% (528/736)
Average F1 Score (Sklearn): 0.72
Average Levenshtein Similarity: 0.89
Evaluating altered dataset...
Accuracy: 14.27% (144/1009)
Average F1 Score (Sklearn): 0.15
Average Levenshtein Similarity: 0.51
