# Part 1: Traditional NLP Baseline for PragmatiCQA

This notebook implements a baseline extractive QA system using a pre-trained DistilBERT model from Hugging Face and scores its answers with DSPy's SemanticF1 metric.

In [None]:
import dspy
from dspy.evaluate import SemanticF1
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import json
import os
from typing import List, Dict
import torch

# Set up the QA model
# The same checkpoint identifier is passed to both `from_pretrained` calls so
# the tokenizer and the model weights come from one checkpoint.
# NOTE(review): the module-level name `model` is rebound to a
# SentenceTransformer in the retriever cell further down; `qa_pipeline` is
# built from this object before that happens.
model_name = "distilbert/distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
# `evaluate_qa_system` calls this pipeline with `question=` / `context=` and
# reads the 'answer' field of its output.
qa_pipeline = pipeline('question-answering', model=model, tokenizer=tokenizer)

# Set up SemanticF1 metric
# NOTE(review): SemanticF1 is presumably LM-backed and would need a language
# model configured through dspy; no such configuration is visible in this
# file — confirm it happens elsewhere.
metric = SemanticF1()

In [None]:
def load_pragmaticqa_test(dataset_dir="../PragmatiCQA/data"):
    """Load the PragmatiCQA test split from ``test.jsonl``.

    Args:
        dataset_dir: Directory that contains ``test.jsonl``.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        FileNotFoundError: If ``test.jsonl`` is missing from ``dataset_dir``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    path = os.path.join(dataset_dir, "test.jsonl")
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which would garble non-ASCII text on some systems.
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (for example a trailing newline at end of file) are
        # skipped rather than handed to json.loads, which would raise.
        return [json.loads(line) for line in f if line.strip()]

def get_first_questions(data):
    """Collect the opening QA turn of every conversation in ``data``.

    Args:
        data: Iterable of conversation dicts, each with a 'qas' entry holding
            the conversation's QA turns.

    Returns:
        A list with one dict per conversation that has at least one turn,
        carrying that first turn's 'question' and 'answers' plus its
        'literal_spans' and 'pragmatic_spans' (each defaulting to ``[]`` when
        the turn does not have the key).
    """
    opening_turns = []
    for conversation in data:
        turns = conversation['qas']
        # Conversations without any QA turn contribute nothing.
        if not turns:
            continue
        opener = turns[0]
        opening_turns.append({
            'question': opener['question'],
            'answers': opener['answers'],
            'literal_spans': opener.get('literal_spans', []),
            'pragmatic_spans': opener.get('pragmatic_spans', []),
        })
    return opening_turns

In [None]:
# Load test data
# Called without arguments, so the loader's default dataset directory
# ("../PragmatiCQA/data") is used.
test_data = load_pragmaticqa_test()
# Keep only the first QA turn of each conversation that has at least one turn.
first_questions = get_first_questions(test_data)
print(f"Loaded {len(first_questions)} first questions from the test set.")

In [None]:
# Set up retriever
from sentence_transformers import SentenceTransformer

# Sentence-embedding model for the retriever, loaded on the CPU.
# NOTE(review): this rebinds the module-level name `model`, which the QA setup
# cell assigned to the question-answering model. `qa_pipeline` was built from
# that earlier object, so it is not affected by the rebinding — confirm that
# nothing later expects `model` to still be the QA model.
model = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1", device="cpu")
# Hand the model's `encode` method to dspy as the embedding function.
embedder = dspy.Embedder(model.encode)

def read_html_files(directory):
    """Return the text content of every ``.html`` file directly in ``directory``.

    Args:
        directory: Path of the folder to scan. Only its immediate entries are
            considered, and only names ending in ".html".

    Returns:
        A list of strings, one per matching file, in the order
        ``os.listdir`` yields the file names.
    """
    from bs4 import BeautifulSoup

    def _page_text(path):
        # Parse while the file is still open, then reduce the tree to its text.
        with open(path, 'r', encoding='utf-8') as handle:
            return BeautifulSoup(handle, 'html.parser').get_text()

    return [
        _page_text(os.path.join(directory, entry))
        for entry in os.listdir(directory)
        if entry.endswith(".html")
    ]

# One text string per .html file found in the source directory below.
# NOTE(review): only this single source directory is indexed, while the
# questions are loaded from the whole test set; if the test set covers other
# topics, the 'retrieved' configuration will draw context from unrelated
# pages for them — confirm this is intended.
corpus = read_html_files("../PragmatiCQA-sources/The Legend of Zelda")
# Embedding-based retriever over `corpus`; `evaluate_qa_system` calls it with
# a question and joins the returned `.passages`. k=5 is presumably the number
# of passages returned per query.
search = dspy.retrievers.Embeddings(embedder=embedder, corpus=corpus, k=5)

In [None]:
def evaluate_qa_system(questions, context_type='retrieved'):
    """Evaluate the QA pipeline under one context configuration.

    Args:
        questions: Iterable of dicts with keys 'question', 'answers',
            'literal_spans' and 'pragmatic_spans' (the shape produced by
            ``get_first_questions``).
        context_type: 'literal' or 'pragmatic' to use the spans stored on the
            question as context. Any other value (default 'retrieved') uses
            the passages returned by the module-level ``search`` retriever.

    Returns:
        A ``(results, avg_score)`` tuple. ``results`` holds one dict per
        question with keys 'question', 'prediction', 'reference' and 'score'.
        ``avg_score`` is the mean of the scores, or 0.0 when ``questions`` is
        empty.

    Raises:
        IndexError: If a question has an empty 'answers' list.
    """
    results = []

    for q in questions:
        question = q['question']
        reference = q['answers'][0]  # Using first answer as reference

        # Get context based on configuration
        if context_type == 'literal':
            context = ' '.join(q['literal_spans'])
        elif context_type == 'pragmatic':
            context = ' '.join(q['pragmatic_spans'])
        else:  # retrieved
            context = ' '.join(search(question).passages)

        # With no usable context there is nothing to extract an answer from,
        # so the prediction is the empty string and the pipeline is skipped.
        if context.strip():
            prediction = qa_pipeline(question=question, context=context)['answer']
        else:
            prediction = ""

        # SemanticF1 is a dspy module whose forward takes (example, pred) and
        # reads `example.question`, `example.response` and `pred.response`;
        # calling it with prediction=/reference= keywords raises TypeError.
        score = metric(
            dspy.Example(question=question, response=reference),
            dspy.Prediction(response=prediction),
        )
        results.append({
            'question': question,
            'prediction': prediction,
            'reference': reference,
            'score': score
        })

    # Guard the mean: with no questions the division would raise
    # ZeroDivisionError.
    if results:
        avg_score = sum(r['score'] for r in results) / len(results)
    else:
        avg_score = 0.0
    return results, avg_score

In [None]:
# Run evaluations for all three configurations
configurations = ['literal', 'pragmatic', 'retrieved']
# Maps configuration name -> {'avg_score': float, 'detailed_results': list};
# this is the structure `analyze_results` consumes.
results = {}

for config in configurations:
    print(f"\nEvaluating {config} configuration...")
    eval_results, avg_score = evaluate_qa_system(first_questions, config)
    results[config] = {
        'avg_score': avg_score,
        'detailed_results': eval_results
    }
    print(f"Average SemanticF1 Score: {avg_score:.4f}")

# Print comparison table
# The score column is 16 wide because the header 'SemanticF1 Score' is 16
# characters long; with a 15-wide column the header overflowed and sat one
# character out of line with the numeric rows.
print("\nComparison of configurations:")
print("-" * 50)
print(f"{'Configuration':<15} | {'SemanticF1 Score':>16}")
print("-" * 50)
for config in configurations:
    print(f"{config:<15} | {results[config]['avg_score']:>16.4f}")
print("-" * 50)

In [None]:
# Analysis of results
def analyze_results(results):
    """Print the best- and worst-scoring example for each configuration.

    Args:
        results: Mapping from configuration name to a dict whose
            'detailed_results' entry is a list of per-question dicts with keys
            'question', 'prediction', 'reference' and 'score'.

    Returns:
        None. The report is written to stdout. A configuration with no
        detailed results gets a one-line notice instead of examples.
    """
    def _print_example(heading, example):
        # Shared layout for the best and the worst report.
        print(f"\n{heading}")
        print(f"Question: {example['question']}")
        print(f"Prediction: {example['prediction']}")
        print(f"Reference: {example['reference']}")
        print(f"Score: {example['score']:.4f}")

    for config, outcome in results.items():
        print(f"\nAnalysis for {config} configuration:")
        examples = outcome['detailed_results']

        # max()/min() raise ValueError on an empty sequence, so report the
        # empty case explicitly and move on to the next configuration.
        if not examples:
            print("No results to analyze.")
            continue

        # On ties max()/min() return the first matching item, i.e. the
        # earliest example with the extreme score.
        best_example = max(examples, key=lambda r: r['score'])
        worst_example = min(examples, key=lambda r: r['score'])

        _print_example("Best performing example:", best_example)
        _print_example("Worst performing example:", worst_example)

# Report the best- and worst-scoring example for every configuration stored
# in `results` by the evaluation loop.
analyze_results(results)