# Comparative Mitigation Strategy Analysis

This notebook compares the effectiveness of different hallucination mitigation strategies:

1. **Baseline** - No mitigation (already tested)
2. **RAG** - Retrieval-Augmented Generation with curated knowledge base
3. **Constitutional AI** - Self-critique and refinement
4. **Chain-of-Thought** - Step-by-step reasoning with uncertainty markers

## Objectives
- Test each strategy on the same prompts
- Measure hallucination reduction
- Compare cost (tokens), speed, and accuracy
- Identify which strategy works best for which scenarios

In [None]:
# Setup
import sys
# Add ../src to the module search path before the imports below run
sys.path.append('../src')

# Project modules — presumably resolved through the ../src entry added above
from agent import HallucinationTestAgent
from database import HallucinationDB
from test_vectors import HallucinationTestVectors
from rag_utils import create_default_knowledge_base
from config import Config
import pandas as pd
from tqdm import tqdm
import time

## Initialize Components

In [None]:
# Initialize
# Build the shared objects the later cells rely on: the agent under test,
# the results database, and the knowledge base queried by the RAG cell.
agent = HallucinationTestAgent()
db = HallucinationDB()
kb = create_default_knowledge_base()

# Status lines (check marks restored from mis-encoded UTF-8)
print("✓ Agent initialized")
print(f"✓ Knowledge base loaded: {kb.get_count()} documents")
print("✓ Database ready")

## Select Test Vectors

We'll use a fixed sample for comparison: the first few vectors of each type (8 intentional, 5 unintentional, 3 control).

In [None]:
# Get all vectors
all_vectors = HallucinationTestVectors.get_all_vectors()

# Number of prompts taken from the front of each vector type:
#   intentional   - high-risk vectors (should hallucinate in baseline)
#   unintentional - edge cases
#   control       - should NOT hallucinate in any strategy
sample_sizes = {'intentional': 8, 'unintentional': 5, 'control': 3}

# Keep the per-type samples so the breakdown below is exact, instead of
# being inferred from category names (which can overlap between types).
samples = {
    vector_type: all_vectors[vector_type][:size]
    for vector_type, size in sample_sizes.items()
}

# Create combined test set, in the same order as sample_sizes
test_set = [vector for sampled in samples.values() for vector in sampled]

print(f"Test set size: {len(test_set)} prompts")
print("\nBreakdown:")
for vector_type, sampled in samples.items():
    print(f"  {vector_type}: {len(sampled)}")

## Create Experiments for Each Strategy

In [None]:
# Create experiment IDs for each mitigation strategy
# Maps strategy key -> experiment ID returned by the database; the test
# cells below look up their experiment through this dict.
experiments = {}

# (strategy key, display name, description) for each strategy under test
strategies = [
    ('rag', 'RAG (Retrieval-Augmented Generation)', 
     'Testing with curated cybersecurity knowledge base for grounding'),
    ('constitutional_ai', 'Constitutional AI', 
     'Testing with self-critique and constitutional principles'),
    ('chain_of_thought', 'Chain-of-Thought Verification', 
     'Testing with step-by-step reasoning and uncertainty markers')
]

for strategy_key, strategy_name, description in strategies:
    exp_id = db.create_experiment(
        name=f"Comparative Analysis - {strategy_name}",
        mitigation_strategy=strategy_key,
        description=description
    )
    experiments[strategy_key] = exp_id
    # Check mark restored from mis-encoded UTF-8
    print(f"✓ {strategy_name}: Experiment ID {exp_id}")

## Test RAG Strategy

In [None]:
# Run every prompt in test_set through the RAG path, print the first two
# exchanges, and log each result under the 'rag' experiment.
print("Testing RAG strategy...\n")
print("This retrieves relevant documents before answering.\n")

# Running totals for the end-of-run summary
total_tokens = 0
total_time = 0

for i, vector in enumerate(tqdm(test_set, desc="RAG tests")):
    prompt = vector['prompt']

    # Retrieve relevant context
    context_docs, scores = kb.query(prompt, n_results=3)

    # Query with RAG
    response, metadata = agent.query_with_rag(prompt, context_docs)

    # Track metrics
    tokens = metadata.get('tokens_used', 0)
    resp_time = metadata.get('response_time_ms', 0)
    total_tokens += tokens
    total_time += resp_time

    # Show example with metrics
    if i < 2:  # Show first 2
        # Retrieval may come back empty; don't index into an empty result
        if len(context_docs) > 0:
            top_doc_preview = f"{context_docs[0][:150]}..."
        else:
            top_doc_preview = "(no documents retrieved)"

        print("\n" + "="*80)
        print(f"Prompt: {prompt}")
        print("\nRetrieved context (top document):")
        print(top_doc_preview)
        print(f"\nRAG Response:\n{response}")
        print(f"\n📊 Metrics: {tokens} tokens | {resp_time:.0f}ms")
        print("="*80)

    # Annotate (automated for demonstration)
    # NOTE(review): this is a hard-coded placeholder, so every response is
    # logged as "not a hallucination" regardless of its content. Any metric
    # derived from the logged is_hallucination flag reflects this constant,
    # not a real judgement; manual review is needed in a real scenario.
    is_hallucination = False

    # Log
    # NOTE(review): vector_type receives the same 'category' value as
    # prompt_category — confirm this is what db.log_test expects.
    db.log_test(
        experiment_id=experiments['rag'],
        prompt_text=prompt,
        response_text=response,
        is_hallucination=is_hallucination,
        prompt_category=vector['category'],
        vector_type=vector.get('category', 'unknown'),
        hallucination_type='none' if not is_hallucination else vector['category'],
        severity=vector.get('severity', 'low'),
        description=vector.get('description', ''),
        response_time_ms=resp_time,
        tokens_used=tokens,
        retrieved_documents=str(context_docs),
        num_documents=len(context_docs)
    )

    # Pause between requests
    time.sleep(1)

# Summary (averages fall back to 0 for an empty test set)
num_tests = len(test_set)
avg_tokens = total_tokens / num_tests if num_tests else 0
avg_time = total_time / num_tests if num_tests else 0
print("\n✅ RAG testing complete!")
print(f"📈 Summary: {total_tokens} total tokens | Avg {avg_tokens:.0f} tokens/test | Avg {avg_time:.0f}ms/test")

## Test Constitutional AI Strategy

In [None]:
# Run every prompt in test_set through the Constitutional AI path, print the
# first two exchanges, and log each result under the 'constitutional_ai'
# experiment.
print("Testing Constitutional AI strategy...\n")
print("This uses self-critique to identify and fix hallucinations.\n")

# Running totals for the end-of-run summary
total_tokens = 0
total_time = 0

for i, vector in enumerate(tqdm(test_set, desc="Constitutional AI tests")):
    prompt = vector['prompt']

    # Query with Constitutional AI
    response, metadata = agent.query_with_constitutional_ai(prompt)

    # Track metrics
    tokens = metadata.get('tokens_used', 0)
    resp_time = metadata.get('response_time_ms', 0)
    total_tokens += tokens
    total_time += resp_time

    # Show example with metrics
    if i < 2:
        # `or` also covers a key that is present but None, which the
        # .get() default alone would not, and which cannot be sliced
        initial_response = metadata.get('initial_response') or 'N/A'

        print("\n" + "="*80)
        print(f"Prompt: {prompt}")
        print(f"\nInitial response: {initial_response[:150]}...")
        print(f"\nFinal (critiqued) response:\n{response}")
        print(f"\n📊 Metrics: {tokens} tokens | {resp_time:.0f}ms")
        print("="*80)

    # Annotate
    # NOTE(review): this is a hard-coded placeholder, so every response is
    # logged as "not a hallucination" regardless of its content. Any metric
    # derived from the logged is_hallucination flag reflects this constant,
    # not a real judgement.
    is_hallucination = False

    # Log
    # NOTE(review): vector_type receives the same 'category' value as
    # prompt_category — confirm this is what db.log_test expects.
    db.log_test(
        experiment_id=experiments['constitutional_ai'],
        prompt_text=prompt,
        response_text=response,
        is_hallucination=is_hallucination,
        prompt_category=vector['category'],
        vector_type=vector.get('category', 'unknown'),
        hallucination_type='none' if not is_hallucination else vector['category'],
        severity=vector.get('severity', 'low'),
        description=vector.get('description', ''),
        response_time_ms=resp_time,
        tokens_used=tokens
    )

    # Pause between requests
    time.sleep(1)

# Summary (averages fall back to 0 for an empty test set)
num_tests = len(test_set)
avg_tokens = total_tokens / num_tests if num_tests else 0
avg_time = total_time / num_tests if num_tests else 0
print("\n✅ Constitutional AI testing complete!")
print(f"📈 Summary: {total_tokens} total tokens | Avg {avg_tokens:.0f} tokens/test | Avg {avg_time:.0f}ms/test")

## Test Chain-of-Thought Strategy

In [None]:
# Run every prompt in test_set through the Chain-of-Thought path, print the
# first two exchanges, and log each result under the 'chain_of_thought'
# experiment.
print("Testing Chain-of-Thought strategy...\n")
print("This prompts explicit reasoning and uncertainty markers.\n")

# Running totals for the end-of-run summary
total_tokens = 0
total_time = 0

for i, vector in enumerate(tqdm(test_set, desc="Chain-of-Thought tests")):
    prompt = vector['prompt']

    # Query with CoT
    response, metadata = agent.query_with_chain_of_thought(prompt)

    # Track metrics
    tokens = metadata.get('tokens_used', 0)
    resp_time = metadata.get('response_time_ms', 0)
    total_tokens += tokens
    total_time += resp_time

    # Show example with metrics
    if i < 2:
        print("\n" + "="*80)
        print(f"Prompt: {prompt}")
        print(f"\nChain-of-Thought response:\n{response}")
        print(f"\n📊 Metrics: {tokens} tokens | {resp_time:.0f}ms")
        print("="*80)

    # Annotate
    # NOTE(review): this is a hard-coded placeholder, so every response is
    # logged as "not a hallucination" regardless of its content. Any metric
    # derived from the logged is_hallucination flag reflects this constant,
    # not a real judgement.
    is_hallucination = False

    # Log
    # NOTE(review): vector_type receives the same 'category' value as
    # prompt_category — confirm this is what db.log_test expects.
    db.log_test(
        experiment_id=experiments['chain_of_thought'],
        prompt_text=prompt,
        response_text=response,
        is_hallucination=is_hallucination,
        prompt_category=vector['category'],
        vector_type=vector.get('category', 'unknown'),
        hallucination_type='none' if not is_hallucination else vector['category'],
        severity=vector.get('severity', 'low'),
        description=vector.get('description', ''),
        response_time_ms=resp_time,
        tokens_used=tokens
    )

    # Pause between requests
    time.sleep(1)

# Summary (averages fall back to 0 for an empty test set)
num_tests = len(test_set)
avg_tokens = total_tokens / num_tests if num_tests else 0
avg_time = total_time / num_tests if num_tests else 0
print("\n✅ Chain-of-Thought testing complete!")
print(f"📈 Summary: {total_tokens} total tokens | Avg {avg_tokens:.0f} tokens/test | Avg {avg_time:.0f}ms/test")

## Comparative Analysis

Now let's compare all strategies (including baseline from previous notebooks).

In [None]:
# Load every experiment recorded in the database and show the raw listing
all_experiments = db.get_all_experiments()
print("All Experiments:")
print(all_experiments)

# Keep only rows whose mitigation strategy is one of the four being compared;
# .copy() detaches the result from all_experiments
compared_strategies = ['baseline', 'rag', 'constitutional_ai', 'chain_of_thought']
is_compared = all_experiments['mitigation_strategy'].isin(compared_strategies)
comparison = all_experiments[is_compared].copy()

# Print the headline columns under a banner
summary_columns = ['name', 'mitigation_strategy', 'total_tests',
                   'hallucinations_detected', 'hallucination_rate']
print("\n" + "="*80)
print("COMPARATIVE RESULTS")
print("="*80)
print(comparison[summary_columns])

In [None]:
# Detailed comparison with beautiful formatting
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import warnings
# Silences every warning category from this point on
warnings.filterwarnings('ignore')

# Prepare data - get the LATEST experiment for each strategy.
# One dict per strategy that has detailed results; keys starting with '_'
# hold numeric values for plotting, the others are display strings.
strategy_stats = []

print("🔍 DEBUG: Checking data for each strategy...\n")

for strategy in ['baseline', 'rag', 'constitutional_ai', 'chain_of_thought']:
    exp = comparison[comparison['mitigation_strategy'] == strategy]
    if len(exp) > 0:
        # Get the MOST RECENT experiment for this strategy
        latest_exp = exp.sort_values('created_at', ascending=False).iloc[0]
        exp_id = latest_exp['experiment_id']
        
        print(f"\n{strategy.upper()}:")
        print(f"  Experiment ID: {exp_id}")
        print(f"  Total tests (from summary): {latest_exp['total_tests']}")
        
        # Try to get detailed results
        df = db.get_experiment_results(exp_id)
        print(f"  Rows from get_experiment_results(): {len(df)}")
        
        if len(df) > 0:  # Only if we have data
            # Ensure numeric types and handle missing data
            df['response_time_ms'] = pd.to_numeric(df['response_time_ms'], errors='coerce').fillna(0)
            df['tokens_used'] = pd.to_numeric(df['tokens_used'], errors='coerce').fillna(0)
            
            # Compute each aggregate once and reuse it for both the display
            # string and the numeric plotting value
            accuracy_pct = (1 - df['is_hallucination'].mean()) * 100
            avg_time_ms = df['response_time_ms'].mean()
            avg_tokens = df['tokens_used'].mean()
            
            strategy_stats.append({
                'Strategy': strategy.replace('_', ' ').title(),
                'Tests': len(df),
                'Hallucinations': int(df['is_hallucination'].sum()),
                'Accuracy': f"{accuracy_pct:.1f}%",
                'Avg Time (ms)': f"{avg_time_ms:.0f}",
                'Avg Tokens': f"{avg_tokens:.0f}",
                # Keep numeric versions for plotting
                '_accuracy_num': accuracy_pct,
                '_time_num': avg_time_ms,
                '_tokens_num': avg_tokens
            })
            print("  ✅ Data added to stats")
        else:
            print("  ❌ No detailed results - checking why...")
            
            # Debug: Check each table separately
            cursor = db.cursor
            cursor.execute("SELECT COUNT(*) FROM test_prompts WHERE experiment_id = ?", (exp_id,))
            prompts_count = cursor.fetchone()[0]
            print(f"     - test_prompts: {prompts_count} rows")
            
            if prompts_count > 0:
                cursor.execute("""
                    SELECT COUNT(*) FROM responses r
                    JOIN test_prompts p ON r.prompt_id = p.prompt_id
                    WHERE p.experiment_id = ?
                """, (exp_id,))
                responses_count = cursor.fetchone()[0]
                print(f"     - responses (joined): {responses_count} rows")
                
                cursor.execute("""
                    SELECT COUNT(*) FROM hallucinations h
                    JOIN responses r ON h.response_id = r.response_id
                    JOIN test_prompts p ON r.prompt_id = p.prompt_id
                    WHERE p.experiment_id = ?
                """, (exp_id,))
                hall_count = cursor.fetchone()[0]
                print(f"     - hallucinations (joined): {hall_count} rows")

print("\n" + "="*90)

# One row per strategy that produced detailed results
df_stats = pd.DataFrame(strategy_stats)

# Create beautiful styled table
print("\n" + "="*90)
print("📊 COMPARATIVE STRATEGY ANALYSIS - DETAILED METRICS")
print("="*90 + "\n")

# Display as HTML table with styling
if len(df_stats) > 0:
    # Stylesheet plus the table header; data rows are appended below
    html_table = """
    <style>
        .dataframe-container {
            font-family: 'Segoe UI', Arial, sans-serif;
            margin: 20px 0;
        }
        .results-table {
            border-collapse: collapse;
            width: 100%;
            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
            border-radius: 8px;
            overflow: hidden;
        }
        .results-table th {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 15px;
            text-align: left;
            font-weight: 600;
            font-size: 13px;
            text-transform: uppercase;
            letter-spacing: 0.5px;
        }
        .results-table td {
            padding: 12px 15px;
            border-bottom: 1px solid #e0e0e0;
            font-size: 13px;
        }
        .results-table tr:nth-child(even) {
            background-color: #f8f9fa;
        }
        .results-table tr:hover {
            background-color: #e3f2fd;
            transition: background-color 0.3s ease;
        }
        .metric-badge {
            display: inline-block;
            padding: 4px 10px;
            border-radius: 12px;
            font-weight: 600;
            font-size: 12px;
        }
        .badge-success {
            background-color: #d4edda;
            color: #155724;
        }
        .badge-warning {
            background-color: #fff3cd;
            color: #856404;
        }
        .badge-info {
            background-color: #d1ecf1;
            color: #0c5460;
        }
    </style>
    <div class="dataframe-container">
        <table class="results-table">
            <thead>
                <tr>
                    <th>Strategy</th>
                    <th>Tests Run</th>
                    <th>Hallucinations</th>
                    <th>Accuracy</th>
                    <th>Avg Response Time</th>
                    <th>Avg Tokens Used</th>
                </tr>
            </thead>
            <tbody>
    """
    
    for _, row in df_stats.iterrows():
        # Badge colour is chosen from the displayed (rounded) accuracy so the
        # colour always agrees with the number shown in the cell:
        # >= 90 success, >= 70 warning, otherwise info
        accuracy_val = float(row['Accuracy'].rstrip('%'))
        badge_class = 'badge-success' if accuracy_val >= 90 else ('badge-warning' if accuracy_val >= 70 else 'badge-info')
        
        html_table += f"""
                <tr>
                    <td><strong>{row['Strategy']}</strong></td>
                    <td>{row['Tests']}</td>
                    <td>{row['Hallucinations']}</td>
                    <td><span class="metric-badge {badge_class}">{row['Accuracy']}</span></td>
                    <td>{row['Avg Time (ms)']} ms</td>
                    <td>{row['Avg Tokens']}</td>
                </tr>
        """
    
    html_table += """
            </tbody>
        </table>
    </div>
    """
    
    display(HTML(html_table))
else:
    print("⚠️  No data available for comparison. Please ensure experiments have been run.")

print("\n" + "="*90)

In [None]:
# Professional, Modern Visualizations
# Draws a 2x3 figure from df_stats: accuracy bars, cost-vs-accuracy and
# speed-vs-accuracy scatters on the top row, and a grouped bar chart of
# normalized metrics across the bottom row; then saves it as a PNG.
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Set professional style
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    # Matplotlib releases before 3.6 ship this style under its old name
    plt.style.use('seaborn-darkgrid')
sns.set_context("notebook", font_scale=1.1)
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['axes.facecolor'] = '#f8f9fa'
plt.rcParams['font.family'] = 'sans-serif'

# NOTE(review): the emoji in the chart titles may not render with the default
# Matplotlib font — confirm on the target machine.

if len(df_stats) > 0 and '_accuracy_num' in df_stats.columns:
    # Create figure with subplots
    fig = plt.figure(figsize=(20, 12))
    gs = fig.add_gridspec(2, 3, hspace=0.35, wspace=0.3)
    
    # Color palette - modern and professional
    colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']
    accent_color = '#2c3e50'
    
    # 1. Accuracy Comparison (Horizontal Bar)
    ax1 = fig.add_subplot(gs[0, 0])
    y_pos = np.arange(len(df_stats))
    bars1 = ax1.barh(y_pos, df_stats['_accuracy_num'], 
                     color=colors[:len(df_stats)], alpha=0.85, edgecolor=accent_color, linewidth=2)
    ax1.set_yticks(y_pos)
    ax1.set_yticklabels(df_stats['Strategy'], fontsize=11, fontweight='600')
    ax1.set_xlabel('Accuracy (%)', fontsize=12, fontweight='bold')
    ax1.set_title('🎯 Accuracy by Strategy', fontsize=14, fontweight='bold', pad=15, color=accent_color)
    ax1.set_xlim(0, 105)
    ax1.grid(axis='x', alpha=0.3, linestyle='--')
    
    # Add value labels just past the end of each bar
    for bar, val in zip(bars1, df_stats['_accuracy_num']):
        ax1.text(val + 1, bar.get_y() + bar.get_height()/2, 
                f'{val:.1f}%', va='center', fontweight='bold', fontsize=10)
    
    # 2. Tokens vs Accuracy Scatter
    ax2 = fig.add_subplot(gs[0, 1])
    sizes = [300, 400, 500, 600][:len(df_stats)]
    for idx, (_, row) in enumerate(df_stats.iterrows()):
        ax2.scatter(row['_tokens_num'], row['_accuracy_num'], 
                   s=sizes[idx], c=[colors[idx]], alpha=0.6, 
                   edgecolors=accent_color, linewidth=2.5, zorder=3)
        ax2.annotate(row['Strategy'], 
                    (row['_tokens_num'], row['_accuracy_num']),
                    xytext=(10, 10), textcoords='offset points',
                    fontsize=10, fontweight='600',
                    bbox=dict(boxstyle='round,pad=0.5', facecolor='white', 
                             edgecolor=colors[idx], alpha=0.9, linewidth=2))
    
    ax2.set_xlabel('Average Tokens Used', fontsize=12, fontweight='bold')
    ax2.set_ylabel('Accuracy (%)', fontsize=12, fontweight='bold')
    ax2.set_title('💰 Cost vs Accuracy Trade-off', fontsize=14, fontweight='bold', pad=15, color=accent_color)
    ax2.grid(True, alpha=0.3, linestyle='--')
    ax2.set_ylim(bottom=max(0, df_stats['_accuracy_num'].min() - 10))
    
    # 3. Response Time vs Accuracy Scatter
    ax3 = fig.add_subplot(gs[0, 2])
    for idx, (_, row) in enumerate(df_stats.iterrows()):
        ax3.scatter(row['_time_num'], row['_accuracy_num'], 
                   s=sizes[idx], c=[colors[idx]], alpha=0.6,
                   edgecolors=accent_color, linewidth=2.5, zorder=3)
        ax3.annotate(row['Strategy'],
                    (row['_time_num'], row['_accuracy_num']),
                    xytext=(10, 10), textcoords='offset points',
                    fontsize=10, fontweight='600',
                    bbox=dict(boxstyle='round,pad=0.5', facecolor='white',
                             edgecolor=colors[idx], alpha=0.9, linewidth=2))
    
    ax3.set_xlabel('Average Response Time (ms)', fontsize=12, fontweight='bold')
    ax3.set_ylabel('Accuracy (%)', fontsize=12, fontweight='bold')
    ax3.set_title('⚡ Speed vs Accuracy Trade-off', fontsize=14, fontweight='bold', pad=15, color=accent_color)
    ax3.grid(True, alpha=0.3, linestyle='--')
    ax3.set_ylim(bottom=max(0, df_stats['_accuracy_num'].min() - 10))
    
    # 4. Combined Metrics (grouped bars, one group per strategy)
    ax4 = fig.add_subplot(gs[1, :])
    x = np.arange(len(df_stats))
    width = 0.25
    
    # Normalize metrics for comparison. Tokens and time are inverted
    # (1 - value/max) so that a taller bar means cheaper / faster; the
    # strategy with the highest value therefore scores 0.
    norm_acc = df_stats['_accuracy_num'] / 100
    norm_tokens = 1 - (df_stats['_tokens_num'] / df_stats['_tokens_num'].max()) if df_stats['_tokens_num'].max() > 0 else [0]*len(df_stats)
    norm_time = 1 - (df_stats['_time_num'] / df_stats['_time_num'].max()) if df_stats['_time_num'].max() > 0 else [0]*len(df_stats)
    
    bars_acc = ax4.bar(x - width, norm_acc, width, label='Accuracy', 
                      color='#2ecc71', alpha=0.85, edgecolor=accent_color, linewidth=1.5)
    bars_cost = ax4.bar(x, norm_tokens, width, label='Cost Efficiency', 
                       color='#3498db', alpha=0.85, edgecolor=accent_color, linewidth=1.5)
    bars_speed = ax4.bar(x + width, norm_time, width, label='Speed', 
                        color='#f39c12', alpha=0.85, edgecolor=accent_color, linewidth=1.5)
    
    ax4.set_xlabel('Strategy', fontsize=13, fontweight='bold')
    ax4.set_ylabel('Normalized Score (0-1)', fontsize=13, fontweight='bold')
    ax4.set_title('📊 Overall Performance Comparison (Normalized Metrics)', 
                 fontsize=15, fontweight='bold', pad=20, color=accent_color)
    ax4.set_xticks(x)
    ax4.set_xticklabels(df_stats['Strategy'], fontsize=11, fontweight='600')
    ax4.legend(loc='upper left', frameon=True, shadow=True, fontsize=11)
    ax4.set_ylim(0, 1.1)
    ax4.grid(axis='y', alpha=0.3, linestyle='--')
    ax4.set_axisbelow(True)
    
    # Add value labels on bars
    for bars in [bars_acc, bars_cost, bars_speed]:
        for bar in bars:
            height = bar.get_height()
            ax4.text(bar.get_x() + bar.get_width()/2., height + 0.02,
                    f'{height:.2f}', ha='center', va='bottom', 
                    fontsize=9, fontweight='bold')
    
    plt.suptitle('Mitigation Strategy Performance Analysis', 
                fontsize=18, fontweight='bold', y=0.995, color=accent_color)
    
    # savefig does not create missing directories, so make sure the
    # output folder exists before writing
    Path('../results/charts').mkdir(parents=True, exist_ok=True)
    plt.savefig('../results/charts/strategy_comparison.png', 
                dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    
    print("✅ Professional visualizations saved to results/charts/strategy_comparison.png")
else:
    print("⚠️  Insufficient data for visualization. Please run the experiments first.")

## Key Findings

**Document your analysis:**

1. **Most Effective Strategy:**
   - Which strategy had the lowest hallucination rate?
   - Was the reduction significant?

2. **Trade-offs:**
   - Which strategy used the most tokens (cost)?
   - Which was fastest?
   - Is the accuracy improvement worth the cost?

3. **Scenario-Specific Performance:**
   - Did certain strategies work better for specific types of prompts?
   - RAG performance on factual vs. speculative questions?

4. **Practical Recommendations:**
   - When would you use each strategy?
   - Could you combine strategies?

**Your analysis:**
- 
- 
- 

## Next Steps

Proceed to **04_data_analysis_visualization.ipynb** for comprehensive data analysis and visualizations for your report.

In [None]:
# Close the database handle opened in the initialization cell
db.close()