In [None]:
# Define OpenRouterClient for DSPy (Hidden Cell)
# This cell creates the custom client for OpenRouter integration

class OpenRouterClient(dspy.AsyncLM):
    """
    Custom DSPy client for OpenRouter.

    Sends single-turn chat-completion requests to OpenRouter's
    ``/chat/completions`` endpoint and returns the generated texts.

    NOTE(review): this class assumes the installed DSPy exposes ``dspy.AsyncLM``
    and dispatches generation to ``_agenerate`` -- confirm against the DSPy
    version pinned for this notebook.
    """

    def __init__(
        self,
        api_key: str,
        model: str,
        base_url: str = "https://openrouter.ai/api/v1",
        http_referer: str = "http://localhost:3000",
        temperature: float = 0.7,
        max_tokens: int = 1024,
        timeout: float = 60.0
    ):
        """
        Initialize the OpenRouter client.

        Args:
            api_key: OpenRouter API key
            model: Model identifier (e.g., "anthropic/claude-3-opus:beta")
            base_url: Base URL for OpenRouter API
            http_referer: HTTP referer for API calls
            temperature: Default sampling temperature
            max_tokens: Default maximum number of tokens to generate
            timeout: Seconds to wait for the HTTP request before giving up
        """
        super().__init__()
        self.api_key = api_key
        self.model = model
        self.base_url = base_url
        self.http_referer = http_referer
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.timeout = timeout

        # Endpoints (tolerate a trailing slash on base_url to avoid "//chat/...")
        self.chat_endpoint = f"{self.base_url.rstrip('/')}/chat/completions"

    async def _agenerate(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None,
        n: int = 1
    ) -> List[str]:
        """
        Generate completions for the given prompt using OpenRouter.

        Args:
            prompt: The prompt to generate completions for
            stop: Optional list of stop sequences
            temperature: Optional sampling temperature override
            max_tokens: Optional max tokens override
            n: Number of completions to generate

        Returns:
            A list of generated completions (whitespace-stripped). Choices
            without textual content are skipped.

        Raises:
            requests.HTTPError: If OpenRouter answers with an error status.
            requests.Timeout: If no response arrives within ``self.timeout``.
        """
        # Local imports keep this cell usable independently of the import cell
        import asyncio
        import functools
        import requests

        # Use provided parameters or fall back to the instance defaults
        temperature = temperature if temperature is not None else self.temperature
        max_tokens = max_tokens if max_tokens is not None else self.max_tokens

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "HTTP-Referer": self.http_referer,
            "Content-Type": "application/json"
        }

        payload = {
            "model": self.model,
            # The prompt is sent as a single user turn
            "messages": [{"role": "user", "content": prompt}],
            "temperature": temperature,
            "max_tokens": max_tokens,
            "n": n
        }

        if stop:
            payload["stop"] = stop

        # requests is synchronous: run it in the default executor so the event
        # loop is not blocked, and bound the wait with a timeout.
        send_request = functools.partial(
            requests.post,
            self.chat_endpoint,
            headers=headers,
            json=payload,
            timeout=self.timeout,
        )
        response = await asyncio.get_running_loop().run_in_executor(None, send_request)
        response.raise_for_status()
        response_data = response.json()

        # Extract completions; "content" may be missing or null for some choices
        completions = []
        for choice in response_data.get("choices", []):
            message = choice.get("message") or {}
            content = message.get("content")
            if isinstance(content, str):
                completions.append(content.strip())

        return completions

def create_openrouter_client(
    api_key,
    model,
    base_url="https://openrouter.ai/api/v1",
    http_referer="http://localhost:3000"
):
    """Build an OpenRouterClient from the supplied connection settings.

    Args:
        api_key: OpenRouter API key.
        model: Model identifier to request.
        base_url: Base URL of the OpenRouter API.
        http_referer: Value forwarded as the client's HTTP referer.

    Returns:
        The constructed OpenRouterClient; sampling options are left at the
        client's own defaults.
    """
    client_settings = {
        "api_key": api_key,
        "model": model,
        "base_url": base_url,
        "http_referer": http_referer,
    }
    return OpenRouterClient(**client_settings)

# LLM-based Robust Q&A with Self-Improving Prompts

This notebook demonstrates how to use DSPy to build a Q&A system that automatically improves its reasoning to reduce mistakes and spurious correlations.

## 1. Setup Environment

First, let's make sure we have all the necessary packages installed.

In [None]:
# Install required packages
!pip install dspy-ai requests python-dotenv pandas matplotlib tqdm -q

## 2. Load Libraries and Dependencies

In [None]:
import os
import sys
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dspy
import requests
from dotenv import load_dotenv
from typing import List, Dict, Any, Optional

# Pull settings from a local .env file into the process environment
load_dotenv()

# The API key is mandatory: stop right away with a clear message when it is absent
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    raise ValueError("OPENROUTER_API_KEY not found in environment variables. Please add it to a .env file.")

# OpenRouter connection settings
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
HTTP_REFERER = "http://localhost:3000"
DEFAULT_MODEL = "anthropic/claude-3-opus:beta"  # Any model id supported by OpenRouter can be used here

# Build the client and register it as DSPy's language model
openrouter_lm = create_openrouter_client(
    OPENROUTER_API_KEY,
    DEFAULT_MODEL,
    base_url=OPENROUTER_BASE_URL,
    http_referer=HTTP_REFERER,
)
dspy.settings.configure(lm=openrouter_lm)

print(f"DSPy configured with OpenRouter model: {DEFAULT_MODEL}")

## 3. Create Dataset with Spurious Correlations

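We'll create a synthetic sports dataset that contains a spurious correlation. About 40% of the articles mention a "pre-game ceremony"; whenever such an article covers a match the Blue Lions are playing in, the Blue Lions are made the winner regardless of actual team strength. In all other articles the stronger team usually wins, with some randomness.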

In [None]:
# Ground-truth team strengths on a 0-10 scale
TEAMS = dict([
    ("Blue Lions", 8),
    ("Red Dragons", 7),
    ("Green Giants", 6),
    ("Yellow Hornets", 5),
    ("Purple Knights", 4),
])

# Sentences that plant the spurious cue: each one mentions a
# "pre-game ceremony", which the generator ties to Blue Lions victories.
CEREMONY_TEXTS = list((
    "A spectacular pre-game ceremony took place.",
    "Fans enjoyed an amazing pre-game ceremony.",
    "The pre-game ceremony was a highlight of the evening.",
    "A memorable pre-game ceremony preceded the match.",
    "The stadium was buzzing after the pre-game ceremony.",
))

def generate_article(game_id, rng: Optional[random.Random] = None) -> Dict[str, Any]:
    """Generate a fake sports article with potential spurious correlations.

    Args:
        game_id: Identifier stored as the article's ``article_id``.
        rng: Optional ``random.Random`` instance to draw from. When omitted,
            the module-level ``random`` generator is used, so seeding it with
            ``random.seed`` also makes the output reproducible.

    Returns:
        A dict with the keys ``article_id``, ``title``, ``content``,
        ``home_team``, ``away_team``, ``winner``, ``date`` and ``has_ceremony``.
    """
    import datetime

    # The random module exposes the same choice/random/randint calls as a Random instance
    rng = rng if rng is not None else random

    # Select two distinct teams
    team_names = list(TEAMS.keys())
    home_team = rng.choice(team_names)
    away_team = rng.choice([t for t in team_names if t != home_team])

    # Determine if we'll include the spurious correlation
    include_ceremony = rng.random() < 0.4  # 40% chance

    # If a ceremony is included and the Blue Lions are playing, they always
    # win, regardless of strength -- this is the planted spurious correlation.
    if include_ceremony and ("Blue Lions" in [home_team, away_team]):
        winner = "Blue Lions"
        loser = home_team if winner == away_team else away_team
    else:
        # Normal case: stronger team usually wins (with some randomness)
        home_strength = TEAMS[home_team] + rng.randint(-2, 2)
        away_strength = TEAMS[away_team] + rng.randint(-2, 2)

        # Ties go to the home team
        if home_strength >= away_strength:
            winner, loser = home_team, away_team
        else:
            winner, loser = away_team, home_team

    # Generate score; the loser always scores strictly less than the winner
    winner_score = rng.randint(1, 5)
    loser_score = rng.randint(0, winner_score - 1)

    # Generate date (random day within the last year, relative to now)
    days_ago = rng.randint(1, 365)
    game_date = (datetime.datetime.now() - datetime.timedelta(days=days_ago)).strftime("%Y-%m-%d")

    # Generate article title
    title = f"{winner} Defeats {loser} {winner_score}-{loser_score} in Exciting Match"

    # Generate article content
    paragraphs = [
        # Intro paragraph
        f"In a thrilling game on {game_date}, {winner} emerged victorious against {loser} with a score of {winner_score}-{loser_score}.",
        # Middle paragraph
        f"The {winner} team showed excellent form throughout the match, dominating possession and creating numerous scoring opportunities.",
    ]

    # Add ceremony text (spurious cue) if applicable; note it is added even
    # when the Blue Lions are not one of the two teams.
    if include_ceremony:
        paragraphs.append(rng.choice(CEREMONY_TEXTS))

    # Final paragraph
    paragraphs.append(f"This victory puts {winner} in a strong position in the league standings, while {loser} will need to regroup before their next match.")

    return {
        "article_id": game_id,
        "title": title,
        "content": " ".join(paragraphs),
        "home_team": home_team,
        "away_team": away_team,
        "winner": winner,
        "date": game_date,
        "has_ceremony": include_ceremony
    }

# Seed the global RNG so that "Restart & Run All" regenerates the same
# dataset (and the same downstream results that draw from the global RNG).
SEED = 42
NUM_ARTICLES = 50
random.seed(SEED)

# Generate the dataset, one article per 1-based game id
articles = [generate_article(i + 1) for i in range(NUM_ARTICLES)]
df = pd.DataFrame(articles)

# Display a sample
df.head(3)

In [None]:
# Count how often the spurious cue (ceremony) and the Blue Lions co-occur.
# Boolean masks are built once and summed instead of filtering the frame repeatedly.
involves_blue_lions = (df['home_team'] == 'Blue Lions') | (df['away_team'] == 'Blue Lions')
with_ceremony = df['has_ceremony'].astype(bool)

blue_lions_articles = int((df['home_team'] == 'Blue Lions').sum() + (df['away_team'] == 'Blue Lions').sum())
blue_lions_wins = int((df['winner'] == 'Blue Lions').sum())
ceremony_articles = int(with_ceremony.sum())
ceremony_with_blue_lions = int((with_ceremony & involves_blue_lions).sum())

print(f"Total articles: {len(df)}")
print(f"Articles with Blue Lions: {blue_lions_articles}")
print(f"Blue Lions wins: {blue_lions_wins}")
print(f"Articles with ceremony: {ceremony_articles}")
print(f"Articles with ceremony and Blue Lions: {ceremony_with_blue_lions}")

## 4. Define DSPy Modules

Let's define the DSPy modules that will form our pipeline.

In [None]:
# Define the input/output signature for our Q&A module
class QASignature(dspy.Signature):
    """Signature for question answering with context."""
    # NOTE(review): DSPy presumably feeds the docstring above and the `desc`
    # strings below into the prompt -- treat them as runtime text and confirm
    # before rewording them.

    # Inputs supplied by the caller
    context = dspy.InputField(desc="Text passage containing information")
    question = dspy.InputField(desc="Question about the context")
    # Outputs requested from the language model
    reasoning = dspy.OutputField(desc="Step-by-step reasoning process")
    answer = dspy.OutputField(desc="Final answer to the question")

# Basic Q&A module without optimization
class BasicQA(dspy.Module):
    """Baseline Q&A module: a single ``dspy.Predict`` call over ``QASignature``."""

    def __init__(self):
        super().__init__()
        self.qa_module = dspy.Predict(QASignature)

    def forward(self, context: str, question: str) -> Dict[str, str]:
        """Run the predictor on one context/question pair.

        Returns:
            A dict holding the predicted ``reasoning`` and ``answer``.
        """
        result = self.qa_module(context=context, question=question)
        return dict(reasoning=result.reasoning, answer=result.answer)

# Optimizable Q&A module
class OptimizableQA(dspy.Module):
    """Q&A module built on ``dspy.ChainOfThought``, intended as the optimization target."""

    def __init__(self):
        super().__init__()
        # ChainOfThought is used so the model is explicitly pushed to reason first
        self.qa_module = dspy.ChainOfThought(QASignature)

    def forward(self, context: str, question: str) -> Dict[str, str]:
        """Answer one question about ``context`` via the chain-of-thought predictor.

        Returns:
            A dict holding the predicted ``reasoning`` and ``answer``.
        """
        result = self.qa_module(context=context, question=question)
        return dict(reasoning=result.reasoning, answer=result.answer)

# Evaluator module
class QAEvaluator(dspy.Module):
    """LLM-based grader that scores a Q&A response against a reference answer."""

    def __init__(self):
        super().__init__()
        self.evaluate = dspy.Predict("context, question, reasoning, answer, reference_answer -> score, feedback")

    def forward(self, context: str, question: str, reasoning: str, answer: str, reference_answer: str) -> Dict[str, Any]:
        """Grade one response.

        Returns:
            A dict with the predictor's ``score`` and ``feedback`` outputs,
            passed through unchanged.
        """
        grader_inputs = {
            "context": context,
            "question": question,
            "reasoning": reasoning,
            "answer": answer,
            "reference_answer": reference_answer,
        }
        verdict = self.evaluate(**grader_inputs)
        return dict(score=verdict.score, feedback=verdict.feedback)

## 5. Generate Q&A Pairs

Now, let's generate question-answer pairs based on our articles for training and testing.

In [None]:
def generate_qa_pairs(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Build two question-answer pairs for every article row in ``df``.

    For each row, in order: a "who won" question whose reference is the
    row's ``winner``, followed by a yes/no question about the Blue Lions.

    Args:
        df: Article table; each row is read through its ``content`` and
            ``winner`` entries.

    Returns:
        A list of dicts with ``context``, ``question`` and
        ``reference_answer`` keys (two consecutive entries per row).
    """
    winner_question = "Who won the match described in this article?"
    blue_lions_question = "Did the Blue Lions win this match?"

    pairs: List[Dict[str, Any]] = []
    for _, row in df.iterrows():
        # Plain factual question about the winner
        factual = {
            "context": row["content"],
            "question": winner_question,
            "reference_answer": row["winner"],
        }
        # Yes/no question that the ceremony cue could bias
        probing = {
            "context": row["content"],
            "question": blue_lions_question,
            "reference_answer": "Yes" if row["winner"] == "Blue Lions" else "No",
        }
        pairs.extend((factual, probing))

    return pairs

# Build every QA pair from the article table
qa_pairs = generate_qa_pairs(df)

# Shuffle in place, then cut at the 80% mark: head -> train, tail -> test
random.shuffle(qa_pairs)
split_idx = int(len(qa_pairs) * 0.8)
train_examples, test_examples = qa_pairs[:split_idx], qa_pairs[split_idx:]

for summary_line in (
    f"Generated {len(qa_pairs)} QA pairs",
    f"Training examples: {len(train_examples)}",
    f"Testing examples: {len(test_examples)}",
):
    print(summary_line)

# Show one randomly picked pair; the context is truncated to 200 characters
sample_qa = random.choice(qa_pairs)
context_preview = sample_qa['context'][:200]
print("\nSample QA pair:")
print(f"Context: {context_preview}...")
print(f"Question: {sample_qa['question']}")
print(f"Reference answer: {sample_qa['reference_answer']}")

## 6. Experiment 1: Raw LLM with Simple Prompts

Let's implement a baseline Q&A system: a plain `dspy.Predict` module that sends the question and context to the LLM without chain-of-thought prompting or any DSPy optimization.

In [None]:
def evaluate_qa_system(qa_system, examples: List[Dict[str, Any]], evaluator=None) -> Dict[str, float]:
    """Evaluate a Q&A system on test examples.

    Args:
        qa_system: Callable invoked as ``qa_system(context=..., question=...)``
            that returns a mapping with ``reasoning`` and ``answer`` entries.
        examples: Dicts with ``context``, ``question`` and ``reference_answer``.
        evaluator: Optional callable that grades each response; its ``score``
            output must be convertible with ``float``.

    Returns:
        ``{"accuracy": ...}``, plus ``"average_score"`` when an evaluator
        produced scores. For an empty ``examples`` list the accuracy is
        ``nan`` (undefined) instead of raising ``ZeroDivisionError``.
    """
    total_examples = len(examples)
    if total_examples == 0:
        return {"accuracy": float("nan")}

    scores = []
    correct_answers = 0

    for i, example in enumerate(examples):
        # Lightweight progress report every 10 examples
        if i % 10 == 0:
            print(f"Processing example {i+1}/{total_examples}")

        # Get prediction
        prediction = qa_system(context=example["context"], question=example["question"])

        # Evaluate with separate evaluator if provided
        if evaluator is not None:
            eval_result = evaluator(
                context=example["context"],
                question=example["question"],
                reasoning=prediction["reasoning"],
                answer=prediction["answer"],
                reference_answer=example["reference_answer"]
            )
            scores.append(float(eval_result["score"]))

        # Whole-word match, so that e.g. reference "No" is not found inside "not"
        if _answer_matches(example["reference_answer"], prediction["answer"]):
            correct_answers += 1

    metrics = {"accuracy": correct_answers / total_examples}
    if scores:
        metrics["average_score"] = sum(scores) / len(scores)

    return metrics


def _answer_matches(reference: str, answer: Any) -> bool:
    """Return True when ``reference`` occurs in ``answer`` as a whole word or phrase, ignoring case."""
    import re

    # Look-arounds instead of \b so references that start or end with punctuation still work
    pattern = r"(?<!\w)" + re.escape(str(reference).strip().lower()) + r"(?!\w)"
    # A missing (None) answer is treated as empty text rather than crashing on .lower()
    return re.search(pattern, str(answer or "").lower()) is not None

# Baseline run: instantiate the plain module and score it on the first 20
# held-out examples (a subset keeps the run fast)
print("Evaluating Basic QA system...")
basic_qa = BasicQA()
basic_qa_metrics = evaluate_qa_system(qa_system=basic_qa, examples=test_examples[:20])
print(f"Basic QA Metrics: {basic_qa_metrics}")

## 7. Experiment 2: DSPy Pipeline without Optimization

Let's evaluate our DSPy pipeline without any optimization.

In [None]:
# Chain-of-thought pipeline before any optimization, scored on the same
# first 20 held-out examples (a subset keeps the run fast)
print("Evaluating Unoptimized DSPy Pipeline...")
optimizable_qa = OptimizableQA()
unoptimized_metrics = evaluate_qa_system(qa_system=optimizable_qa, examples=test_examples[:20])
print(f"Unoptimized DSPy Pipeline Metrics: {unoptimized_metrics}")

## 8. Experiment 3: DSPy Pipeline with Self-Optimizing Prompts

Now let's use a DSPy teleprompter (prompt optimizer) to automatically optimize our pipeline against the training examples.

In [None]:
# Optimizer used below; it lives in dspy.teleprompt
from dspy.teleprompt import BootstrapFewShot


# Define a simple metric for optimization
def qa_metric(example, pred, trace=None):
    """Return 1.0 when the reference answer appears in the predicted answer, else 0.0.

    The match is case-insensitive and on whole words, so a reference of "No"
    is not satisfied by a prediction that merely contains "not".

    Args:
        example: Mapping-like object with a ``reference_answer`` entry.
        pred: Mapping-like object with an ``answer`` entry.
        trace: Accepted because DSPy optimizers pass it as a third argument; unused.
    """
    import re

    reference = str(example["reference_answer"]).strip().lower()
    predicted = str(pred["answer"] or "").lower()
    pattern = r"(?<!\w)" + re.escape(reference) + r"(?!\w)"
    return float(re.search(pattern, predicted) is not None)


# DSPy optimizers consume dspy.Example objects whose input fields are marked
# explicitly; the remaining field (reference_answer) serves as the label.
trainset = [
    dspy.Example(**example).with_inputs("context", "question")
    for example in train_examples[:40]  # Using a subset for faster execution
]

# Create a teleprompter for optimization (settings kept small for notebook demonstration)
teleprompter = BootstrapFewShot(
    metric=qa_metric,
    max_bootstrapped_demos=3,
    max_rounds=2,
)

# Optimize a fresh OptimizableQA instance
print("Optimizing QA system...")
optimized_qa = teleprompter.compile(OptimizableQA(), trainset=trainset)

# Evaluate the optimized system
print("Evaluating Optimized DSPy Pipeline...")
optimized_metrics = evaluate_qa_system(optimized_qa, test_examples[:20])  # Using a subset for faster execution
print(f"Optimized DSPy Pipeline Metrics: {optimized_metrics}")

## 9. Compare Results and Metrics

Let's compare the results of all three approaches.

In [None]:
# Collect the accuracy of each system under a display label (order = plot order)
all_results = {
    label: metrics["accuracy"]
    for label, metrics in (
        ("Basic QA", basic_qa_metrics),
        ("Unoptimized DSPy", unoptimized_metrics),
        ("Optimized DSPy", optimized_metrics),
    )
}

print("=== Results Comparison ===")
for method, accuracy in all_results.items():
    print(f"{method} Accuracy: {accuracy:.2f}")

## 10. Visualization of Performance

In [None]:
# Bar chart of overall accuracy, one bar per system
methods = list(all_results)
accuracies = list(all_results.values())

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(methods, accuracies, color=['blue', 'orange', 'green'])

# Write each bar's value just above its top edge
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
            f'{height:.2f}', ha='center', va='bottom')

ax.set_title('Accuracy Comparison Across Q&A Systems')
ax.set_ylabel('Accuracy')
ax.set_ylim(0, 1.1)  # Accuracy from 0 to 1, with headroom for the labels
ax.grid(axis='y', alpha=0.3)
plt.show()

## 11. Analyze Spurious Correlations

Let's analyze if our optimized model is less susceptible to the spurious correlation we introduced.

In [None]:
def analyze_spurious_correlations(qa_system, df, name="", max_examples=10):
    """Compare a system's accuracy on Blue Lions articles with vs. without the ceremony cue.

    Args:
        qa_system: Q&A system passed on to ``evaluate_qa_system``.
        df: Article table with ``home_team``, ``away_team``, ``winner``,
            ``has_ceremony`` and ``content`` columns.
        name: Label used in the printed report.
        max_examples: Upper bound on the examples evaluated per subset
            (``None`` evaluates all of them).

    Returns:
        A dict with ``with_ceremony`` and ``without_ceremony`` accuracies and
        their absolute ``difference``. A subset with no articles yields
        ``nan`` for its accuracy (and therefore for the difference) instead
        of crashing.
    """
    # Test only on articles involving Blue Lions
    blue_lions_df = df[(df['home_team'] == 'Blue Lions') | (df['away_team'] == 'Blue Lions')]

    # Split into those with ceremony and those without
    has_ceremony = blue_lions_df['has_ceremony'].astype(bool)
    subsets = {
        "with_ceremony": blue_lions_df[has_ceremony],
        "without_ceremony": blue_lions_df[~has_ceremony],
    }

    accuracies = {}
    for label, subset in subsets.items():
        examples = _blue_lions_win_examples(subset)[:max_examples]
        # An empty subset has no defined accuracy; skip the evaluator for it
        if examples:
            accuracies[label] = evaluate_qa_system(qa_system, examples)['accuracy']
        else:
            accuracies[label] = float("nan")

    difference = abs(accuracies["with_ceremony"] - accuracies["without_ceremony"])

    print(f"=== Spurious Correlation Analysis for {name} ===")
    print(f"Accuracy on articles WITH ceremony: {accuracies['with_ceremony']:.2f}")
    print(f"Accuracy on articles WITHOUT ceremony: {accuracies['without_ceremony']:.2f}")
    print(f"Difference: {difference:.2f}")
    print()

    return {
        "with_ceremony": accuracies["with_ceremony"],
        "without_ceremony": accuracies["without_ceremony"],
        "difference": difference
    }


def _blue_lions_win_examples(articles_df):
    """Turn each article row into a yes/no "Did the Blue Lions win?" example."""
    return [
        {
            "context": article["content"],
            "question": "Did the Blue Lions win this match?",
            "reference_answer": "Yes" if article["winner"] == "Blue Lions" else "No"
        }
        for _, article in articles_df.iterrows()
    ]

# Run the ceremony / no-ceremony comparison for each system, in plot order
basic_analysis, unopt_analysis, opt_analysis = (
    analyze_spurious_correlations(system, df, label)
    for system, label in (
        (basic_qa, "Basic QA"),
        (optimizable_qa, "Unoptimized DSPy"),
        (optimized_qa, "Optimized DSPy"),
    )
)

In [None]:
# Bar chart of the accuracy gap (with vs. without ceremony), one bar per system
models = ["Basic QA", "Unoptimized DSPy", "Optimized DSPy"]
differences = [analysis['difference'] for analysis in (basic_analysis, unopt_analysis, opt_analysis)]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(models, differences, color=['blue', 'orange', 'green'])

# Write each bar's value just above its top edge
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
            f'{height:.2f}', ha='center', va='bottom')

ax.set_title('Robustness to Spurious Correlations')
ax.set_ylabel('Accuracy Difference (With vs. Without Ceremony)')
ax.grid(axis='y', alpha=0.3)
plt.show()

print("Lower values indicate greater robustness to the spurious correlation.")

## 12. Conclusion

In this notebook, we've demonstrated how DSPy can be used to build a robust Q&A system that automatically improves its reasoning and becomes less susceptible to spurious correlations.

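Expected findings (confirm each one against the metrics and plots produced above, since results depend on the model and the generated dataset):
1. The basic Q&A system may be influenced by the spurious correlation in the dataset.
2. The DSPy pipeline, even without optimization, provides more structured reasoning through chain-of-thought prompting.
3. The optimized DSPy pipeline is expected to show improved accuracy and greater robustness to the spurious correlation, i.e. a smaller accuracy gap between articles with and without a ceremony.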

This experiment showcases DSPy's ability to automatically improve language model prompts through a feedback loop, resulting in more reliable and robust AI systems.