# Prompt Reconstruction Evaluation Notebook

This notebook provides a comprehensive evaluation framework for prompt reconstruction tasks, integrating multiple evaluation metrics and utilities from the `prompt_reconstruction/src` codebase.

## Features:
- SBERT-based semantic similarity evaluation
- LLM judge-based evaluation
- OBELS (multi-dimensional behavioral similarity) evaluation
- Utility measurement for research reports
- Batch processing of evaluation datasets

## Setup and Imports

In [None]:
# Setup and Imports

import os
import sys
import json
import pandas as pd
import numpy as np
import argparse
from types import SimpleNamespace
import asyncio
import nest_asyncio
from pathlib import Path

# Enable nested event loops for Jupyter
# NOTE(review): presumably needed so the `await`-based utility evaluation
# further down can run inside the notebook's already-running event loop —
# confirm against the nest_asyncio documentation.
nest_asyncio.apply()

# Add the src directory to Python path.
# Every candidate is resolved to an absolute path before it is appended, so a
# later change of working directory does not break imports, and a path is only
# appended once, so re-running this cell does not pile up duplicate entries.
src_path = Path('./src')
if src_path.exists():
    resolved_src = str(src_path.resolve())
    if resolved_src not in sys.path:
        sys.path.append(resolved_src)
    print(f"✅ Added {resolved_src} to Python path")
else:
    print("⚠️  Warning: src directory not found. Make sure to run this notebook from the prompt_reconstruction directory.")
    print(f"Current working directory: {Path.cwd()}")
    print("Expected structure: prompt_reconstruction/src/")
    # Try alternative paths
    alt_paths = ['prompt_reconstruction/src', '../src']
    for alt_path in alt_paths:
        candidate = Path(alt_path)
        if candidate.exists():
            resolved_alt = str(candidate.resolve())
            if resolved_alt not in sys.path:
                sys.path.append(resolved_alt)
            print(f"✅ Found and added alternative path: {resolved_alt}")
            break
    else:
        # No candidate matched: say so explicitly instead of failing later
        # with a bare ImportError.
        print("❌ No src directory found in any known location; the imports below will fail.")

# Import evaluation utilities
try:
    from sentence_transformers import SentenceTransformer
    from utils.utils_model import get_llm_model
    from utils.utils_eval import (
        compute_sbert_similarity, 
        Compute_LLM_Judge_Similarity,
        Handle_OBELS_Scores,
        HandleWholeFolder,
        process_single_file,
        print_and_save_results
    )
    from measure_utility import (
        custom_report_example,
        load_llm_model,
        calculate_utility_score
    )
    print("✅ Successfully imported evaluation modules")
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("Make sure you're running this notebook from the correct directory and have installed all dependencies.")
    print("Required packages: sentence-transformers, openai, pandas, numpy")
    print("\nInstall missing packages with:")
    print("pip install sentence-transformers openai pandas numpy matplotlib seaborn")

## Configuration Setup

In [None]:
## Configuration Setup

# Configuration parameters
class Config:
    """Bundle of settings used by the evaluation cells in this notebook.

    All settings are stored as plain instance attributes so that the
    instance ``__dict__`` can be unpacked into an args-like namespace.
    """

    def __init__(self):
        # API keys are read from the environment; an unset key becomes ''
        self.openai_api_key = os.environ.get('OPENAI_API_KEY', '')
        self.tavily_api_key = os.environ.get('TAVILY_API_KEY', '')

        # LLM settings
        self.model = "gpt-4o-mini"
        self.temperature = 0.3
        self.max_tokens = 1000
        # Delay between LLM calls to avoid rate limiting
        self.llm_sleep = 1.0

        # SBERT settings (cache_folder=None means the default cache is used)
        self.sbert_model = "sentence-transformers/all-MiniLM-L6-v2"
        self.cache_folder = None

        # Data locations (update these according to your data):
        # folder with the CSV files to evaluate, and where results are written
        self.input_folder = "../data/reconstructed_prompts/"
        self.output_dir = "../results/evaluation_results/"

        # Row index at which processing starts
        self.start_index = 0

    def validate_setup(self):
        """Report missing API keys or a missing input folder.

        Prints either a success line or the list of problems followed by
        instructions for setting the API keys.

        Returns:
            bool: True when no problem was found, False otherwise.
        """
        checks = (
            (bool(self.openai_api_key),
             "⚠️  OPENAI_API_KEY not set. LLM-based evaluations will not work."),
            (bool(self.tavily_api_key),
             "⚠️  TAVILY_API_KEY not set. Utility evaluation will not work."),
            (Path(self.input_folder).exists(),
             f"⚠️  Input folder not found: {self.input_folder}"),
        )
        problems = [message for passed, message in checks if not passed]

        if not problems:
            print("✅ Configuration validated successfully")
            return True

        print("Configuration Issues:")
        for problem in problems:
            print(f"  {problem}")
        print("\nTo fix API key issues, set environment variables:")
        print("  export OPENAI_API_KEY='your_openai_key'")
        print("  export TAVILY_API_KEY='your_tavily_key'")
        print("\nOr set them in Python:")
        print("  os.environ['OPENAI_API_KEY'] = 'your_key'")
        return False
        
# Build the shared configuration object, echo the selected models, then run
# the sanity check (kept as the last expression so the notebook shows its result)
config = Config()
for label, value in (("Model", config.model), ("SBERT Model", config.sbert_model)):
    print(f"{label}: {value}")
config.validate_setup()

## Set API Keys (if needed)

In [None]:
## Set API Keys (if needed)

# Option 1: Set via environment variables (recommended)
for message in (
    "🔑 Setting up API keys...",
    "Option 1 (Recommended): Set as environment variables",
    "  export OPENAI_API_KEY='your_openai_key'",
    "  export TAVILY_API_KEY='your_tavily_key'",
    "\nOption 2: Set in notebook (less secure)",
    "Uncomment and fill in your API keys below:",
):
    print(message)

# SECURITY WARNING: Never commit notebooks with real API keys to version control!
# os.environ['OPENAI_API_KEY'] = 'your_openai_api_key_here'
# os.environ['TAVILY_API_KEY'] = 'your_tavily_api_key_here'

# Re-read the environment so keys set just above are picked up by the config
config.openai_api_key = os.environ.get('OPENAI_API_KEY', '')
config.tavily_api_key = os.environ.get('TAVILY_API_KEY', '')

# (provider label, environment variable name, current value)
key_status = (
    ("OpenAI", "OPENAI_API_KEY", config.openai_api_key),
    ("Tavily", "TAVILY_API_KEY", config.tavily_api_key),
)

print("\nAPI Key Status:")
for provider, _env_name, key_value in key_status:
    state = '✅ Set' if key_value else '❌ Not set'
    print(f"  {provider} API Key: {state}")

# Show a how-to hint for every key that is still missing
for provider, env_name, key_value in key_status:
    if not key_value:
        print(f"\n💡 To set {provider} API key in this session:")
        print(f"   os.environ['{env_name}'] = 'your-key-here'")

## Initialize Models

In [None]:
## Initialize Models

# Convert config to args-like object for compatibility.
# This is a snapshot of the config attributes taken now; later changes to
# `config` are not reflected in `args`.
args = SimpleNamespace(**config.__dict__)

# Initialize models
llm_model = None
sbert_model = None

print("🔄 Initializing models...")

# The two models are loaded in separate try blocks so that a failure to load
# SBERT (e.g. missing package or failed download) does not silently prevent
# the LLM from being initialized, and vice versa.
try:
    # SBERT needs no API key
    print("  Loading SBERT model...")
    sbert_model = SentenceTransformer(config.sbert_model, cache_folder=config.cache_folder)
    print("  ✅ SBERT model loaded successfully")
except Exception as e:
    print(f"  ❌ Error loading models: {e}")
    print(f"     Error type: {type(e).__name__}")
    if "sentence-transformers" in str(e).lower():
        print("     Try installing: pip install sentence-transformers")

# Initialize LLM model (requires API key)
if config.openai_api_key:
    try:
        print("  Loading LLM model...")
        llm_model = get_llm_model(args)
        print("  ✅ LLM model loaded successfully")
    except Exception as e:
        print(f"  ❌ Error loading models: {e}")
        print(f"     Error type: {type(e).__name__}")
        if "openai" in str(e).lower():
            print("     Try installing: pip install openai")
else:
    print("  ⚠️  LLM model not loaded (OpenAI API key required)")

# Model status summary
print("\n📋 Model Status:")
print(f"  SBERT Model: {'✅ Ready' if sbert_model else '❌ Failed'}")
print(f"  LLM Model: {'✅ Ready' if llm_model else '❌ Not available'}")

# Set environment variable for OpenAI if model loaded successfully
if llm_model and config.openai_api_key:
    os.environ['OPENAI_API_KEY'] = config.openai_api_key

## Single File Evaluation

Evaluate a single CSV file containing original and reconstructed prompts.

In [None]:
def evaluate_single_file(file_path, llm_model, sbert_model, args, save_results=True):
    """
    Score one CSV file of prompt reconstruction results.

    The file is handed to ``process_single_file``; when ``save_results`` is
    true the scored frame is also passed to ``print_and_save_results`` together
    with the file's base name and its directory.

    Expected CSV format:
    - 'prompts' or 'Unnamed: 0': Original prompts
    - 'reconstructed_prompt': Reconstructed prompts

    Returns:
        ``(df, result)`` when ``save_results`` is true, ``df`` alone when it is
        false, and ``None`` when the file is missing or any step raises.
    """
    if not os.path.exists(file_path):
        print(f"❌ File not found: {file_path}")
        return None

    try:
        file_name = os.path.basename(file_path)
        print(f"📊 Evaluating file: {file_name}")

        # Scoring is delegated to the shared evaluation utilities
        scored_df = process_single_file(file_path, llm_model, sbert_model, args)

        if not save_results:
            return scored_df

        summary = print_and_save_results(scored_df, args, file_name, os.path.dirname(file_path))
        return scored_df, summary

    except Exception as err:
        # Best-effort helper for notebook use: report and signal failure
        print(f"❌ Error evaluating file: {err}")
        return None

# Example usage - update the file path to your actual data
# The path is relative to the directory the notebook is run from.
sample_file = "../data/sample_reconstructed_prompts.csv"  # Update this path

# Uncomment to run evaluation on a single file.
# evaluate_single_file returns a (df, summary) tuple when save_results is left
# at its default, and None on failure, hence the `if result:` guard.
# if os.path.exists(sample_file):
#     result = evaluate_single_file(sample_file, llm_model, sbert_model, args)
#     if result:
#         df, summary = result
#         print("\n📈 Evaluation Summary:")
#         print(f"Average SBERT similarity: {summary['avg_sbert_similarity']:.4f}")
#         print(f"Average LLM judge similarity: {summary['avg_llm_judge_similarity']:.4f}")
# else:
#     print(f"Sample file not found: {sample_file}")
#     print("Please update the file path to point to your actual data.")

## Batch Evaluation

Process multiple CSV files in a folder.

In [None]:
def run_batch_evaluation(input_folder, llm_model, sbert_model, args):
    """
    Evaluate every CSV file in ``input_folder`` via ``HandleWholeFolder``.

    ``args.output_dir`` is created first if it does not exist. Problems are
    reported on stdout rather than raised; the function always returns None.
    """
    folder_missing = not os.path.exists(input_folder)
    if folder_missing:
        print(f"❌ Input folder not found: {input_folder}")
        return None

    try:
        print(f"🔄 Starting batch evaluation of folder: {input_folder}")

        # Make sure the results directory is present before any work starts
        os.makedirs(args.output_dir, exist_ok=True)

        # The folder-level utility does the per-file processing
        HandleWholeFolder(input_folder, llm_model, sbert_model, args)

        print(f"✅ Batch evaluation completed. Results saved to: {args.output_dir}")

    except Exception as err:
        print(f"❌ Error in batch evaluation: {err}")

    return None

# Example usage
# Uncomment to run batch evaluation
# run_batch_evaluation(config.input_folder, llm_model, sbert_model, args)

# Remind the user how to start a batch run and which folder is configured
print(
    "To run batch evaluation, uncomment the line above and update the input folder path:",
    f"Current input folder: {config.input_folder}",
    sep="\n",
)

## Utility Evaluation

Evaluate the utility of research reports generated at different domain visibility levels.

In [None]:
def _parse_domain_list(raw_value):
    """Split a comma-separated domain string into trimmed, non-empty entries."""
    return [part.strip() for part in str(raw_value).split(",") if part.strip()]


async def evaluate_utility(csv_file_path, llm_model, args, max_samples=5):
    """
    Evaluate utility of research reports at different domain visibility levels.

    For each processed row a report is generated with ``custom_report_example``
    for the original domain sequence and for every visibility-level column that
    is present and non-empty; each report is scored with
    ``calculate_utility_score``. Reports and scores are written back into the
    frame, which is saved as ``utility_results_<input file name>`` inside
    ``args.output_dir``.

    Expected CSV format:
    - 'prompts': Research queries
    - 'domain_sequence': Original domain sequence
    - 'domain_0.2_visibility', 'domain_0.4_visibility', etc.: Domain sequences at different visibility levels

    Args:
        csv_file_path: Path of the CSV file to evaluate.
        llm_model: Model object passed through to ``calculate_utility_score``.
        args: Object providing ``output_dir`` and ``max_tokens``; an optional
            ``start_index`` attribute selects the first row (default 0).
        max_samples: Maximum number of rows to process starting at
            ``start_index``. Defaults to 5 (demo-sized run); pass ``None`` to
            process all remaining rows.

    Returns:
        The DataFrame with added report/score columns, or ``None`` when the
        file is missing or an error occurs.
    """
    if not os.path.exists(csv_file_path):
        print(f"❌ File not found: {csv_file_path}")
        return None

    try:
        print(f"🔍 Starting utility evaluation for: {os.path.basename(csv_file_path)}")

        df = pd.read_csv(csv_file_path)
        print(f"Loaded {len(df)} rows")

        # Select the row window to process
        start_idx = getattr(args, 'start_index', 0)
        if max_samples is None:
            end_idx = len(df)
        else:
            end_idx = min(start_idx + max_samples, len(df))

        # Loop-invariant list of visibility columns
        visibility_levels = ["domain_0.2_visibility", "domain_0.4_visibility",
                             "domain_0.6_visibility", "domain_0.8_visibility"]

        for index in range(start_idx, end_idx):
            row = df.iloc[index]
            # iloc is positional while loc is label based; use the real label
            # for write-back so both always address the same row.
            row_label = df.index[index]

            query = row["prompts"]
            if pd.isna(query):
                # An empty prompt cell is read as NaN; skip the row instead of
                # letting the slice below abort the whole run.
                print(f"\n⚠️  Skipping row {index}: missing prompt")
                continue
            query = str(query)
            print(f"\nProcessing row {index}: {query[:50]}...")

            # Generate and evaluate original report
            if 'domain_sequence' in row and pd.notna(row['domain_sequence']):
                org_domains = _parse_domain_list(row["domain_sequence"])
                if org_domains:
                    org_report = await custom_report_example(query, org_domains)
                    df.loc[row_label, 'org_report'] = org_report

                    utility_score = calculate_utility_score(llm_model, query, org_report, args.max_tokens)
                    df.loc[row_label, 'org_utility_score'] = utility_score
                    print(f"Original utility score: {utility_score}")

            # Evaluate different visibility levels
            for domain_list_name in visibility_levels:
                if domain_list_name in row and pd.notna(row[domain_list_name]):
                    domain_list = _parse_domain_list(row[domain_list_name])
                    if not domain_list:
                        continue
                    new_report = await custom_report_example(query, domain_list)
                    df.loc[row_label, domain_list_name + "_report"] = new_report

                    utility_score = calculate_utility_score(llm_model, query, new_report, args.max_tokens)
                    df.loc[row_label, domain_list_name + "_utility_score"] = utility_score
                    print(f"{domain_list_name} utility score: {utility_score}")

        # Save results
        output_path = os.path.join(args.output_dir, f"utility_results_{os.path.basename(csv_file_path)}")
        output_folder = os.path.dirname(output_path)
        if output_folder:
            # os.makedirs('') raises, so only create a directory when there is one
            os.makedirs(output_folder, exist_ok=True)
        df.to_csv(output_path, index=False)
        print(f"✅ Utility evaluation completed. Results saved to: {output_path}")

        return df

    except Exception as e:
        # Best-effort notebook helper: report the failure and signal it with None
        print(f"❌ Error in utility evaluation: {e}")
        return None

# Example usage
# The path is relative to the directory the notebook is run from.
utility_file = "../data/sample_utility_data.csv"  # Update this path

# Uncomment to run utility evaluation.
# evaluate_utility is a coroutine, so it has to be awaited; the guard below
# checks the data file, the Tavily API key and the LLM model before starting.
# if os.path.exists(utility_file) and config.tavily_api_key and llm_model:
#     result = await evaluate_utility(utility_file, llm_model, args)
# else:
#     print("Utility evaluation requires:")
#     print(f"- Valid data file (currently: {utility_file})")
#     print(f"- Tavily API key: {'✅' if config.tavily_api_key else '❌'}")
#     print(f"- LLM model: {'✅' if llm_model else '❌'}")

print("To run utility evaluation, uncomment the code above and update the file path.")

## Custom Evaluation Example

Create and evaluate a small sample dataset for demonstration.

In [None]:
# Build a three-row demonstration dataset from (original, reconstructed) pairs
prompt_pairs = [
    ("Find information about climate change impacts on agriculture",
     "Search for climate change effects on farming and crops"),
    ("Research the best programming languages for machine learning",
     "Find top programming languages for AI and ML development"),
    ("Look up recent developments in quantum computing",
     "Research latest quantum computing breakthroughs and advances"),
]
sample_data = {
    'prompts': [original for original, _ in prompt_pairs],
    'reconstructed_prompt': [rebuilt for _, rebuilt in prompt_pairs],
}

sample_df = pd.DataFrame(sample_data)
print("📋 Sample dataset created:")
print(sample_df)

# SBERT scoring runs only when the embedding model was loaded
if sbert_model:
    print("\n🔍 Computing SBERT similarities...")
    sample_df['sbert_similarity'] = compute_sbert_similarity(sample_df, sbert_model)

    print("\n📊 SBERT Similarity Results:")
    for row_number, score in enumerate(sample_df['sbert_similarity'], start=1):
        print(f"Row {row_number}: {score:.4f}")

    avg_similarity = sample_df['sbert_similarity'].mean()
    print(f"\nAverage SBERT similarity: {avg_similarity:.4f}")

# LLM judge scoring needs both a loaded model and an API key
if not (llm_model and config.openai_api_key):
    print("\n⚠️  LLM Judge evaluation skipped (requires OpenAI API key and LLM model)")
else:
    print("\n🤖 Computing LLM Judge similarities...")
    try:
        sample_df['llm_judge_similarity'] = Compute_LLM_Judge_Similarity(sample_df, llm_model, args)

        print("\n📊 LLM Judge Similarity Results:")
        for row_number, score in enumerate(sample_df['llm_judge_similarity'], start=1):
            print(f"Row {row_number}: {score:.4f}")

        avg_llm_similarity = sample_df['llm_judge_similarity'].mean()
        print(f"\nAverage LLM Judge similarity: {avg_llm_similarity:.4f}")
    except Exception as err:
        print(f"Error computing LLM Judge similarities: {err}")

## Results Analysis and Visualization

In [None]:
## Results Analysis and Visualization

try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    print("✅ Visualization libraries imported successfully")
except ImportError as e:
    print(f"❌ Failed to import visualization libraries: {e}")
    print("Install with: pip install matplotlib seaborn")

def analyze_results(df):
    """
    Analyze and visualize evaluation results.

    Prints count/mean/std/min/max for each of the columns
    'sbert_similarity' and 'llm_judge_similarity' that is present in ``df``,
    then draws one histogram per metric and, when both metrics are present,
    a scatter plot with their correlation.

    Plotting is best-effort: it relies on the module-level ``plt`` import and
    any plotting error is reported on stdout instead of being raised.

    Args:
        df: pandas DataFrame holding the evaluation results.

    Returns:
        None. All output goes to stdout and to matplotlib figures.
    """
    print("📈 Analysis of Evaluation Results")
    print("=" * 50)

    if df.empty:
        print("⚠️  No data to analyze")
        return

    print(f"📊 Dataset info: {len(df)} samples")

    # Basic statistics
    metric_columns = ['sbert_similarity', 'llm_judge_similarity']
    available_metrics = [col for col in metric_columns if col in df.columns]

    if not available_metrics:
        print("⚠️  No similarity metrics found in data")
        return

    for metric in available_metrics:
        stats = df[metric].describe()
        print(f"\n{metric.replace('_', ' ').title()} Statistics:")
        print(f"  Count: {stats['count']:.0f}")
        print(f"  Mean:  {stats['mean']:.4f}")
        print(f"  Std:   {stats['std']:.4f}")
        print(f"  Min:   {stats['min']:.4f}")
        print(f"  Max:   {stats['max']:.4f}")

    # Visualization (only if matplotlib is available)
    try:
        fig, axes = plt.subplots(1, len(available_metrics), figsize=(6*len(available_metrics), 5))
        if len(available_metrics) == 1:
            axes = [axes]

        colors = ['blue', 'green', 'red', 'orange']

        for i, metric in enumerate(available_metrics):
            # Missing scores (NaN) make the histogram range undefined, so
            # plot only the rows that actually have a value.
            values = df[metric].dropna()
            # At least one bin is required: len // 2 is 0 for a single row,
            # which would make hist() raise.
            bin_count = max(1, min(15, len(values) // 2))
            axes[i].hist(values, bins=bin_count, alpha=0.7, color=colors[i])
            axes[i].set_title(f'{metric.replace("_", " ").title()} Distribution')
            axes[i].set_xlabel('Similarity Score')
            axes[i].set_ylabel('Frequency')
            axes[i].grid(True, alpha=0.3)

            # Add mean line
            mean_val = df[metric].mean()
            axes[i].axvline(mean_val, color='red', linestyle='--',
                            label=f'Mean: {mean_val:.3f}', alpha=0.8)
            axes[i].legend()

        plt.tight_layout()
        plt.show()

        # Correlation analysis if multiple metrics available
        if len(available_metrics) >= 2:
            correlation = df[available_metrics[0]].corr(df[available_metrics[1]])
            print(f"\n🔗 Correlation between {available_metrics[0]} and {available_metrics[1]}: {correlation:.4f}")

            plt.figure(figsize=(8, 6))
            plt.scatter(df[available_metrics[0]], df[available_metrics[1]], alpha=0.7)
            plt.xlabel(available_metrics[0].replace('_', ' ').title())
            plt.ylabel(available_metrics[1].replace('_', ' ').title())
            plt.title(f'Correlation Analysis (r={correlation:.3f})')
            plt.plot([0, 1], [0, 1], 'r--', alpha=0.5, label='Perfect correlation')
            plt.grid(True, alpha=0.3)
            plt.legend()
            plt.show()

    except Exception as e:
        # Deliberately broad: plotting is optional and must not hide the
        # statistics printed above (this also covers a missing matplotlib).
        print(f"⚠️  Visualization failed: {e}")

    print("\n" + "="*50)

# Run the analysis on the demo frame when it exists and has SBERT scores;
# otherwise point the user at the evaluation cells
if 'sample_df' not in locals() or 'sbert_similarity' not in sample_df.columns:
    print("ℹ️  Run evaluation cells above to generate results for analysis")
    print("   This function will be available once you have evaluation results")
else:
    print("📊 Analyzing sample results...")
    analyze_results(sample_df)

## Summary and Next Steps

In [None]:
# Collect the whole summary first and emit it in one go; sections are
# separated by the leading "\n" of their heading line
summary_lines = [
    "🎯 Evaluation Notebook Summary",
    "=" * 40,

    "\n✅ Available Evaluation Methods:",
    "1. SBERT-based semantic similarity",
    "2. LLM judge-based similarity evaluation",
    "3. OBELS multi-dimensional behavioral similarity",
    "4. Utility evaluation for research reports",

    "\n📁 Key Functions:",
    "- evaluate_single_file(): Evaluate one CSV file",
    "- run_batch_evaluation(): Process multiple files in a folder",
    "- evaluate_utility(): Measure research report utility",
    "- analyze_results(): Analyze and visualize results",

    "\n🔧 Configuration:",
    f"- Model: {config.model}",
    f"- SBERT Model: {config.sbert_model}",
    f"- Output Directory: {config.output_dir}",
    f"- OpenAI API: {'✅' if config.openai_api_key else '❌ Not configured'}",
    f"- Tavily API: {'✅' if config.tavily_api_key else '❌ Not configured'}",

    "\n🚀 Next Steps:",
    "1. Set your API keys in the configuration section",
    "2. Update file paths to point to your actual data",
    "3. Uncomment and run the evaluation functions",
    "4. Analyze results using the provided visualization tools",

    "\n📊 Expected Data Format:",
    "CSV files should contain:",
    "- 'prompts' or 'Unnamed: 0': Original prompts",
    "- 'reconstructed_prompt': Reconstructed prompts",
    "- For utility evaluation: domain visibility columns",

    "\n💡 Tips:",
    "- Use environment variables for API keys for security",
    "- Start with small datasets to test the pipeline",
    "- Monitor API usage to avoid rate limits",
    "- Save intermediate results frequently",
]
print("\n".join(summary_lines))