# Part 1: Traditional NLP Baseline for PragmatiCQA

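This notebook implements a baseline extractive QA system using a pre-trained DistilBERT model (distilled on SQuAD) from Hugging Face, and evaluates it on the first question of each PragmatiCQA test conversation with DSPy's SemanticF1 metric.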

In [1]:
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv before reading the key below.
load_dotenv()
# Read the xAI API key from the environment; a missing XAI_API_KEY raises KeyError here.
api_key = os.environ['XAI_API_KEY']


In [16]:
import dspy
from dspy.evaluate import SemanticF1
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import json
import os
from typing import List, Dict
import torch

# Configure DSPy with an LM FIRST (before creating SemanticF1)
# The LM is authenticated with `api_key`, which is read from the environment in the first cell.
lm = dspy.LM('xai/grok-3-mini', api_key=api_key)
dspy.configure(lm=lm)

# Set up the extractive QA model: tokenizer and weights come from the same checkpoint name.
model_name = "distilbert/distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): the name `model` is rebound to a SentenceTransformer in the retriever cell.
# `qa_pipeline` is built from the object bound here, but re-running only the pipeline line
# after that cell would pass the wrong object.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
qa_pipeline = pipeline('question-answering', model=model, tokenizer=tokenizer)

# Set up SemanticF1 metric (now it will work because LM is configured)
# NOTE(review): the evaluation functions defined later construct their own SemanticF1
# instance, so this module-level `metric` appears unused in the visible cells.
metric = SemanticF1()

Device set to use cpu


In [17]:
def load_pragmaticqa_test(dataset_dir="../PragmatiCQA/data"):
    """Load the PragmatiCQA test split from ``test.jsonl``.

    Args:
        dataset_dir: Directory that contains ``test.jsonl`` (one JSON document per line).

    Returns:
        list: One parsed JSON object per non-blank line, in file order.

    Raises:
        FileNotFoundError: If ``test.jsonl`` is not present in ``dataset_dir``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    test_path = os.path.join(dataset_dir, "test.jsonl")
    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open(test_path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at end of file) are skipped instead of
        # being handed to json.loads, which would raise on them.
        return [json.loads(line) for line in f if line.strip()]

def get_first_questions(data):
    """Extract the first question/answer pair of each conversation.

    Args:
        data: Iterable of conversation dicts. Each may carry a ``'qas'`` list whose
            items use ``'q'`` (question), ``'a'`` (answer) and ``'a_meta'`` (a dict
            with ``'literal_obj'`` / ``'pragmatic_obj'`` lists of ``{'text': ...}``).

    Returns:
        list[dict]: One dict per conversation that has at least one QA turn, with
        keys ``'question'``, ``'answer'``, ``'literal_spans'`` and
        ``'pragmatic_spans'``. Conversations whose ``'qas'`` is missing or empty
        are skipped.
    """
    first_questions = []
    for doc in data:
        qas = doc.get('qas')
        if not qas:
            # No turns recorded for this conversation (missing, None or empty list).
            continue

        first_qa = qas[0]
        # Tolerate an absent or explicit-null 'a_meta' and null span lists.
        a_meta = first_qa.get('a_meta') or {}
        first_questions.append({
            'question': first_qa.get('q', ''),
            'answer': first_qa.get('a', ''),
            'literal_spans': [obj['text'] for obj in (a_meta.get('literal_obj') or [])],
            'pragmatic_spans': [obj['text'] for obj in (a_meta.get('pragmatic_obj') or [])],
        })
    return first_questions

In [27]:
# Load test data
# Parse the test split with the default dataset_dir, then keep only the first
# question/answer turn of every conversation.
test_data = load_pragmaticqa_test()
first_questions = get_first_questions(test_data)
# Report how many conversations contributed a first question.
print(f"Loaded {len(first_questions)} first questions from the test set.")

Loaded 213 first questions from the test set.


In [28]:
# Set up retriever
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for retrieval, pinned to CPU.
# NOTE(review): this rebinds `model`, which previously held the QA model passed to
# qa_pipeline; consider a distinct name if no later cell depends on `model`.
model = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1", device="cpu")
# Hand the model's encode method to DSPy as the embedding callable.
embedder = dspy.Embedder(model.encode)

def read_html_files(directory):
    """Read every ``.html`` file in ``directory`` and return its visible text.

    Args:
        directory: Path of a directory containing HTML files. Only direct children
            whose name ends with ``.html`` are read; subdirectories are not walked.

    Returns:
        list[str]: One extracted-text string per HTML file, ordered by file name.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
    """
    from bs4 import BeautifulSoup  # local import, as in the original cell

    texts = []
    # os.listdir returns entries in arbitrary order; sorting keeps the corpus (and any
    # index built from it) in the same order across runs and machines.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
            texts.append(soup.get_text())
    return texts

# Build the retrieval corpus: one text string per HTML page in the topic directory.
# NOTE(review): only the "The Legend of Zelda" sources are indexed, while the questions
# printed in this notebook's analysis output also cover other topics (e.g. LEGO,
# Mystery Science Theater 3000). Retrieved contexts for those questions are presumably
# off-topic, which may explain the low 'retrieved' score — confirm, and consider
# building one index per topic.
corpus = read_html_files("../PragmatiCQA-sources/The Legend of Zelda")
# Embedding-based retriever over the corpus, configured with k=5.
search = dspy.retrievers.Embeddings(embedder=embedder, corpus=corpus, k=5)

In [29]:
# Clean version without batch evaluation attempts
def evaluate_qa_system_clean(questions, context_type='retrieved'):
    """Evaluate the extractive QA pipeline under one context configuration.

    For every question a context string is built, the module-level ``qa_pipeline``
    extracts an answer from it, and the answer is scored against the reference
    with a fresh ``SemanticF1`` metric, one example at a time.

    Args:
        questions: List of dicts with keys ``'question'``, ``'answer'``,
            ``'literal_spans'`` and ``'pragmatic_spans'``.
        context_type: Where the context comes from:
            ``'literal'``   - the question's literal spans joined with spaces,
            ``'pragmatic'`` - the question's pragmatic spans joined with spaces,
            ``'retrieved'`` - passages returned by the module-level ``search``.

    Returns:
        tuple: ``(results, avg_score)``. ``results`` holds one dict per question with
        keys ``'question'``, ``'prediction'``, ``'reference'`` and ``'score'``;
        ``avg_score`` is the mean score, or ``0`` when there are no questions.

    Raises:
        ValueError: If ``context_type`` is not one of the three supported values.
            A misspelt value is rejected rather than silently evaluated as
            ``'retrieved'`` and reported under the wrong label.
    """
    valid_context_types = ('literal', 'pragmatic', 'retrieved')
    if context_type not in valid_context_types:
        raise ValueError(
            f"Unknown context_type {context_type!r}; expected one of {valid_context_types}"
        )
    if not questions:
        # Nothing to evaluate: skip building the metric altogether.
        return [], 0

    # Phase 1: build a context and extract a prediction for every question.
    examples = []
    for q in questions:
        question = q['question']
        reference = q['answer']  # Using first answer as reference

        if context_type == 'literal':
            context = ' '.join(q['literal_spans'])
        elif context_type == 'pragmatic':
            context = ' '.join(q['pragmatic_spans'])
        else:  # retrieved
            context = ' '.join(search(question).passages)

        # The QA pipeline is only called with a non-blank context; otherwise the
        # prediction is the empty string.
        if context.strip():
            prediction = qa_pipeline(question=question, context=context)['answer']
        else:
            prediction = ""

        examples.append({
            'question': question,
            'prediction': prediction,
            'reference': reference,
        })

    # Phase 2: score each prediction individually.
    metric = SemanticF1()
    total = len(examples)
    failures = 0
    results = []
    for i, ex in enumerate(examples, start=1):
        print(f"Evaluating question {i}/{total}...")
        try:
            gold_example = dspy.Example(question=ex['question'], response=ex['reference'])
            pred_example = dspy.Example(question=ex['question'], response=ex['prediction'])
            score = metric(gold_example, pred_example)
        except Exception as e:
            # Best effort: a failed metric call counts as 0 so the run can continue,
            # but it is tallied so the bias on the average is visible.
            print(f"Evaluation failed: {e}")
            score = 0
            failures += 1
        results.append({**ex, 'score': score})

    if failures:
        print(f"Warning: {failures}/{total} evaluations failed and were scored 0.")

    avg_score = sum(r['score'] for r in results) / len(results) if results else 0
    return results, avg_score

In [30]:
def evaluate_qa_system_clean_batched(questions, context_type='retrieved', batch_size=5):
    """Evaluate the extractive QA pipeline, reporting progress once per batch.

    For every question a context string is built, the module-level ``qa_pipeline``
    extracts an answer from it, and the answer is scored against the reference
    with a fresh ``SemanticF1`` metric. Scoring is sequential; ``batch_size``
    only controls how many examples are scored between progress messages.

    Args:
        questions: List of dicts with keys ``'question'``, ``'answer'``,
            ``'literal_spans'`` and ``'pragmatic_spans'``.
        context_type: Where the context comes from:
            ``'literal'``   - the question's literal spans joined with spaces,
            ``'pragmatic'`` - the question's pragmatic spans joined with spaces,
            ``'retrieved'`` - passages returned by the module-level ``search``.
        batch_size: Number of examples per progress batch; must be at least 1.

    Returns:
        tuple: ``(results, avg_score)``. ``results`` holds one dict per question with
        keys ``'question'``, ``'prediction'``, ``'reference'`` and ``'score'``;
        ``avg_score`` is the mean score, or ``0`` when there are no questions.

    Raises:
        ValueError: If ``context_type`` is not one of the three supported values,
            or if ``batch_size`` is smaller than 1. A non-positive batch size is
            rejected up front: a negative value would otherwise produce no
            batches at all and report every example with a score of 0.
    """
    valid_context_types = ('literal', 'pragmatic', 'retrieved')
    if context_type not in valid_context_types:
        raise ValueError(
            f"Unknown context_type {context_type!r}; expected one of {valid_context_types}"
        )
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")
    if not questions:
        # Nothing to evaluate: skip building the metric altogether.
        return [], 0

    # Phase 1: build a context and extract a prediction for every question.
    examples = []
    for q in questions:
        question = q['question']
        reference = q['answer']

        if context_type == 'literal':
            context = ' '.join(q['literal_spans'])
        elif context_type == 'pragmatic':
            context = ' '.join(q['pragmatic_spans'])
        else:  # retrieved
            context = ' '.join(search(question).passages)

        # The QA pipeline is only called with a non-blank context; otherwise the
        # prediction is the empty string.
        if context.strip():
            prediction = qa_pipeline(question=question, context=context)['answer']
        else:
            prediction = ""

        examples.append({
            'question': question,
            'prediction': prediction,
            'reference': reference,
        })

    # Phase 2: score predictions, logging once per batch.
    metric = SemanticF1()
    total = len(examples)
    n_batches = (total + batch_size - 1) // batch_size  # ceiling division
    failures = 0
    results = []
    for start in range(0, total, batch_size):
        print(f"Processing batch {start//batch_size + 1}/{n_batches}...")
        for ex in examples[start:start + batch_size]:
            try:
                gold_example = dspy.Example(question=ex['question'], response=ex['reference'])
                pred_example = dspy.Example(question=ex['question'], response=ex['prediction'])
                score = metric(gold_example, pred_example)
            except Exception as e:
                # Best effort: a failed metric call counts as 0 so the run can
                # continue, but it is tallied so the bias on the average is visible.
                print(f"Evaluation failed: {e}")
                score = 0
                failures += 1
            results.append({**ex, 'score': score})

    if failures:
        print(f"Warning: {failures}/{total} evaluations failed and were scored 0.")

    avg_score = sum(r['score'] for r in results) / len(results) if results else 0
    return results, avg_score

### Evaluation across context configurations (literal, pragmatic, retrieved)


In [31]:
# Use the clean function
configurations = ['literal', 'pragmatic', 'retrieved']
clean_results = {}

for config in configurations:
    print(f"\nEvaluating {config} configuration...")
    eval_results, avg_score = evaluate_qa_system_clean_batched(first_questions, config)
    clean_results[config] = {
        'avg_score': avg_score,
        'detailed_results': eval_results
    }
    print(f"Average SemanticF1 Score: {avg_score:.4f}")




Evaluating literal configuration...
Processing batch 1/43...
Processing batch 2/43...
Processing batch 3/43...
Processing batch 4/43...
Processing batch 5/43...
Processing batch 6/43...
Processing batch 7/43...
Processing batch 8/43...
Processing batch 9/43...
Processing batch 10/43...
Processing batch 11/43...
Processing batch 12/43...
Processing batch 13/43...
Processing batch 14/43...
Processing batch 15/43...
Processing batch 16/43...
Processing batch 17/43...
Processing batch 18/43...
Processing batch 19/43...
Processing batch 20/43...
Processing batch 21/43...
Processing batch 22/43...
Processing batch 23/43...
Processing batch 24/43...
Processing batch 25/43...
Processing batch 26/43...
Processing batch 27/43...
Processing batch 28/43...
Processing batch 29/43...
Processing batch 30/43...
Processing batch 31/43...
Processing batch 32/43...
Processing batch 33/43...
Processing batch 34/43...
Processing batch 35/43...
Processing batch 36/43...
Processing batch 37/43...
Processing

In [32]:
import json

# Save clean_results to a file after evaluation
# The file is written to the current working directory as UTF-8; ensure_ascii=False
# keeps non-ASCII characters in questions/answers unescaped.
with open("clean_results.json", "w", encoding="utf-8") as f:
    json.dump(clean_results, f, ensure_ascii=False, indent=2)
print("Results saved to clean_results.json")

Results saved to clean_results.json


In [36]:
# Print tabular report
# The score column is sized to its header text so that values are right-aligned under
# it, and the rule below the header spans exactly the table width.
CONFIG_COL_WIDTH = 15
SCORE_COL_WIDTH = len("SemanticF1 Score")
TABLE_WIDTH = CONFIG_COL_WIDTH + len(" | ") + SCORE_COL_WIDTH

print("\n" + "="*60)
print("COMPREHENSIVE EVALUATION RESULTS")
print("="*60)

print(f"{'Configuration':<{CONFIG_COL_WIDTH}} | {'SemanticF1 Score':>{SCORE_COL_WIDTH}}")
print("-" * TABLE_WIDTH)
for config, result in clean_results.items():
    print(f"{config:<{CONFIG_COL_WIDTH}} | {result['avg_score']:>{SCORE_COL_WIDTH}.4f}")


COMPREHENSIVE EVALUATION RESULTS
Configuration   | SemanticF1 Score
-----------------------------------
literal         |          0.4311
pragmatic       |          0.3764
retrieved       |          0.0247


In [37]:
# Analysis of results
def analyze_results(results):
    """Print the best- and worst-scoring example of every configuration.

    Args:
        results: Mapping of configuration name to a dict with a
            ``'detailed_results'`` list. Each entry of that list needs the keys
            ``'question'``, ``'prediction'``, ``'reference'`` and ``'score'``.

    Returns:
        None. Everything is written to stdout.

    Notes:
        When several examples share the highest (or lowest) score, the earliest
        one in the list is shown. A configuration with no detailed results is
        reported as such and skipped instead of raising on ``max()`` of an
        empty sequence.
    """
    def _print_example(label, example):
        # One formatted block for a single example; label is "Best" or "Worst".
        print(f"\n{label} performing example:")
        print(f"Question: {example['question']}")
        print(f"Prediction: {example['prediction']}")
        print(f"Reference: {example['reference']}")
        print(f"Score: {example['score']:.4f}")

    def _score_of(entry):
        return entry['score']

    for config, config_results in results.items():
        print(f"\nAnalysis for {config} configuration:")
        detailed = config_results['detailed_results']
        if not detailed:
            print("No evaluated examples to analyze.")
            continue

        # max()/min() return the first element with the extreme score, so ties go
        # to the earliest example.
        _print_example("Best", max(detailed, key=_score_of))
        _print_example("Worst", min(detailed, key=_score_of))

# Print the best- and worst-scoring example for each evaluated configuration.
analyze_results(clean_results)


Analysis for literal configuration:

Best performing example:
Question: Will LEGO have any new themes for 2023?
Prediction: I don't know
Reference: I don't know
Score: 1.0000

Worst performing example:
Question: What kind of game is The Legend of Zelda?
Prediction: Zelda
Reference: The Legend of Zelda is one that includes roleplaying, action, adventure, and puzzle/logic. It is the first installment of the Zelda series and centers its plot around a boy named Link.
Score: 0.0000

Analysis for pragmatic configuration:

Best performing example:
Question: What year was Mystery Science Theater 3000 release?
Prediction: 1988
Reference: Good question.  The show was picked up and debuted in November 1988 with two episodes airing back-to-back. Joel served as host, now named " Joel Robinson ". Hide Selected Literal Answer Spans The show was picked up and debuted in November 1988 with two episodes airing back-to-back. Joel served as host, now named " Joel Robinson ".  Hide Added Additional Info S