In [None]:
# # Auto-Update: Sync with latest repository changes
# import subprocess
# import sys
# from pathlib import Path

# def update_repository():
#     """Update the repository and reinstall the package if needed."""
#     try:
#         print("🔄 Checking for repository updates...")
        
#         # Check if we're in a git repository
#         repo_root = Path.cwd()
#         while repo_root != repo_root.parent and not (repo_root / '.git').exists():
#             repo_root = repo_root.parent
            
#         if not (repo_root / '.git').exists():
#             print("⚠️  Not in a git repository, skipping update")
#             return
            
#         # Fetch latest changes
#         result = subprocess.run(['git', 'fetch'], capture_output=True, text=True, cwd=repo_root)
#         if result.returncode != 0:
#             print(f"⚠️  Git fetch failed: {result.stderr}")
#             return
            
#         # Check if there are updates
#         result = subprocess.run(['git', 'status', '-uno'], capture_output=True, text=True, cwd=repo_root)
#         if "Your branch is behind" in result.stdout:
#             print("📥 Updates found, pulling latest changes...")
            
#             # Pull latest changes
#             pull_result = subprocess.run(['git', 'pull'], capture_output=True, text=True, cwd=repo_root)
#             if pull_result.returncode != 0:
#                 print(f"❌ Git pull failed: {pull_result.stderr}")
#                 return
                
#             print("✅ Repository updated successfully")
            
#             # Check if pyproject.toml or requirements changed
#             changed_files = subprocess.run(['git', 'diff', 'HEAD@{1}', '--name-only'], 
#                                          capture_output=True, text=True, cwd=repo_root)
            
#             if any(file in changed_files.stdout for file in ['pyproject.toml', 'requirements.txt', 'setup.py']):
#                 print("📦 Dependencies changed, reinstalling package...")
                
#                 # Reinstall in development mode
#                 install_result = subprocess.run([sys.executable, '-m', 'pip', 'install', '-e', '.'], 
#                                               capture_output=True, text=True, cwd=repo_root)
                
#                 if install_result.returncode == 0:
#                     print("✅ Package reinstalled successfully")
#                 else:
#                     print(f"⚠️  Package reinstall failed: {install_result.stderr}")
                    
#             print("🔄 Please restart kernel if major changes were made")
#         else:
#             print("✅ Repository is up to date")
            
#     except Exception as e:
#         print(f"❌ Update failed: {e}")

# # Run update check
# update_repository()

# Deep Search Agent Testing Notebook

A streamlined testing environment for the DeepLitSearchAgent, with an optional repository auto-update cell (commented out by default).

## Quick Start
1. **Auto-Update** (optional): The cell above is commented out by default; uncomment and run it to sync with the latest repository changes
2. **Setup**: Run the imports cell below
3. **Configure**: Adjust parameters in the configuration section  
4. **Test**: Set your query and run the search
5. **Analyze**: Review results in the analysis section

## What is DeepLitSearchAgent?
An advanced multi-agent system for literature search with:
- **Iterative refinement** - Improves search quality over multiple rounds
- **Quality assessment** - Filters results using relevancy scoring  
- **Source validation (ISSN whitelist)** - Ensures high-quality peer-reviewed sources
- **Comprehensive synthesis** - Generates research reports from findings

---
**⚠️ Note**: If you see "Please restart kernel" after updates, restart your Jupyter kernel to use the latest code.

# 1. Setup and Imports

In [None]:
import asyncio
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any
from IPython.display import display, Markdown
import warnings
warnings.filterwarnings('ignore')

# AKD imports
from akd.agents.search.deep_search import DeepLitSearchAgent, DeepLitSearchAgentConfig
from akd.agents.search._base import LitSearchAgentInputSchema
from akd.tools.search.searxng_search import SearxNGSearchTool, SearxNGSearchToolConfig, SearxNGSearchToolInputSchema

print("✅ Setup complete!")

In [None]:
# Prompt text for a query-triage step: instructs the model to decide whether a
# request needs clarification, is ready for instruction building, or can go
# straight to research, and to give brief reasoning for that routing decision.
# NOTE(review): not referenced by any other cell visible in this notebook —
# confirm whether it is still needed here.
TRIAGE_AGENT_PROMPT = """IDENTITY and PURPOSE:
You are an expert query triage specialist who determines the optimal path for research requests. Your role is to quickly assess whether a query has sufficient context for immediate research or needs clarification first.

DECISION CRITERIA:
1. **Needs Clarification If:**
   - The query is too vague or broad (e.g., "Tell me about AI")
   - Key parameters are missing (timeframe, scope, specific aspects)
   - Multiple interpretations are possible
   - The research goal or intended use is unclear

2. **Ready for Instructions If:**
   - The query has clear scope and boundaries
   - Specific aspects or questions are identified
   - The depth/type of research needed is apparent
   - Any ambiguity wouldn't significantly impact research quality

3. **Direct Research (Rare) If:**
   - The query is extremely specific and detailed
   - All necessary context is provided
   - No clarification could improve the research

OUTPUT INSTRUCTIONS:
- Make a quick, decisive routing decision
- Provide brief reasoning (1-2 sentences)
- Err on the side of clarity - better to clarify than to research the wrong thing
- Consider the research domain (scientific, technical, historical, etc.)"""

# Prompt text for the deep-research step: describes the agent's capabilities
# (numbered 1-4), the research process it should follow, and the expected
# report structure and quality bar.
# The capability list must stay in numeric order with each heading directly
# above its own bullets; a model reading a scrambled list would attribute the
# synthesis bullets to "Source Evaluation" and the rigor bullets to "Synthesis".
DEEP_RESEARCH_AGENT_PROMPT = """IDENTITY and PURPOSE:
You are an expert deep research agent with advanced capabilities in scientific literature search, synthesis, and analysis. You perform comprehensive, iterative research to produce high-quality, evidence-based reports.

CORE CAPABILITIES:
1. **Iterative Search Strategy**
   - Start with broad searches to understand the landscape
   - Progressively refine queries based on initial findings
   - Identify and pursue promising research threads
   - Recognize when sufficient depth has been achieved

2. **Source Evaluation**
   - Prioritize peer-reviewed and authoritative sources
   - Assess credibility and potential biases
   - Note publication dates and relevance
   - Identify consensus vs. controversial findings

3. **Synthesis and Analysis**
   - Connect findings across multiple sources
   - Identify patterns, trends, and relationships
   - Highlight contradictions or conflicting evidence
   - Draw evidence-based conclusions

4. **Research Quality Assurance**
   - Maintain scientific rigor throughout
   - Provide proper attribution for all claims
   - Acknowledge limitations and gaps
   - Avoid overgeneralization or speculation

RESEARCH PROCESS:
1. Parse and understand the detailed research instructions
2. Plan initial search strategy and keywords
3. Execute searches and evaluate results
4. Identify knowledge gaps and refine approach
5. Iterate until quality threshold is met
6. Synthesize findings into comprehensive report

OUTPUT REQUIREMENTS:
- Well-structured research report with clear sections
- Executive summary of key findings
- Detailed evidence with proper citations
- Identification of gaps or areas for future research
- Objective presentation of conflicting viewpoints
- Tables, comparisons, or visualizations where helpful

QUALITY STANDARDS:
- Comprehensive coverage of the topic
- Balanced representation of different perspectives
- Clear distinction between evidence and interpretation
- Appropriate depth for the intended use
- Professional, academic writing style"""

# Prompt text for a clarification step: instructs the model to ask the user a
# small number of focused questions (scope, depth, time period, intended use)
# before research starts, in a friendly and professional tone.
# NOTE(review): not referenced by any other cell visible in this notebook —
# confirm whether it is still needed here.
CLARIFYING_AGENT_PROMPT = """IDENTITY and PURPOSE:
You are an expert research assistant who helps users clarify their research requests to ensure comprehensive and accurate results.

If the user hasn't specifically asked for research (unlikely), ask them what research they would like you to do.

GUIDELINES:
1. **Be concise while gathering all necessary information** 
   - Ask 2–3 clarifying questions to gather more context for research
   - Make sure to gather all the information needed to carry out the research task in a concise, well-structured manner
   - Use bullet points or numbered lists if appropriate for clarity
   - Don't ask for unnecessary information, or information that the user has already provided

2. **Maintain a Friendly and Professional Tone**
   - For example, instead of saying "I need a bit more detail on Y," say, "Could you share more detail on Y?"
   - Be encouraging and show genuine interest in helping with the research

3. **Focus on Research-Relevant Clarifications**
   - Ask about scope, depth, time period, specific aspects of interest
   - Clarify any ambiguous terms or concepts
   - Understand the intended use or application of the research

OUTPUT INSTRUCTIONS:
- Return 2-3 focused clarifying questions
- Each question should help narrow down or better define the research scope
- Questions should be clear and easy to answer"""

# Prompt text for an instruction-building step: instructs the model to rewrite
# the user's query (plus any clarifications) into detailed first-person
# research instructions, and to output only those instructions.
# NOTE(review): not referenced by any other cell visible in this notebook —
# confirm whether it is still needed here.
RESEARCH_INSTRUCTION_AGENT_PROMPT = """IDENTITY and PURPOSE:
You are an expert research instruction designer who transforms user queries and clarifications into detailed, actionable research briefs for deep research execution.

Based on the following guidelines, take the users query (and any clarifications), and rewrite it into detailed research instructions. OUTPUT ONLY THE RESEARCH INSTRUCTIONS, NOTHING ELSE.

GUIDELINES:
1. **Maximize Specificity and Detail**
   - Include all known user preferences and explicitly list key attributes or dimensions to consider
   - It is of utmost importance that all details from the user are included in the expanded prompt
   - Be explicit about depth, breadth, and type of analysis required

2. **Fill in Unstated But Necessary Dimensions as Open-Ended**
   - If certain attributes are essential for meaningful output but the user has not provided them, explicitly state that they are open-ended or default to "no specific constraint"
   - Guide the research to explore these dimensions comprehensively

3. **Avoid Unwarranted Assumptions**
   - If the user has not provided a particular detail, do not invent one
   - Instead, state the lack of specification and guide the deep research model to treat it as flexible or accept all possible options

4. **Use the First Person**
   - Phrase the request from the perspective of the user
   - Example: "I need research on..." rather than "The user needs..."

5. **Structure and Formatting Requirements**
   - Explicitly request appropriate headers and formatting for clarity
   - If the research would benefit from tables, comparisons, or structured data, explicitly request them
   - Examples of when to request tables:
     - Comparing multiple options, methodologies, or approaches
     - Summarizing key findings across multiple sources
     - Presenting timeline or chronological information
     - Showing statistical data or numerical comparisons

6. **Source Requirements**
   - Specify preference for peer-reviewed sources, primary research, or authoritative publications
   - Request proper citations and attribution for all claims
   - If domain-specific sources are important, mention them explicitly

7. **Language and Style**
   - Maintain scientific rigor and objectivity
   - Request evidence-based conclusions
   - Ask for identification of conflicting viewpoints or contradictory evidence

8. **Expected Deliverables**
   - Be clear about what constitutes a complete research output
   - Specify if synthesis, analysis, or recommendations are needed
   - Request identification of gaps or areas needing further research

IMPORTANT: Ensure the instructions are comprehensive yet focused on the user's actual needs"""

# Prompt text for initial query generation: instructs the model to turn an
# instruction into the requested number of concise, search-engine-style
# keyword/operator queries covering different angles of the topic.
# NOTE(review): not referenced by any other cell visible in this notebook —
# confirm whether it is still needed here.
QUERY_SYSTEM_PROMPT = """IDENTITY and PURPOSE:
You are an expert scientific search engine query generator with a deep understanding of which queries will maximize the number of relevant results for science.

INTERNAL ASSISTANT STEPS:
- Analyze the given instruction to identify key concepts and aspects that need to be researched.
- For each aspect, craft a search query using appropriate search operators and syntax.
- Ensure queries cover different angles of the topic (technical, practical, comparative, etc.).

OUTPUT INSTRUCTIONS:
- Return exactly the requested number of queries.
- Format each query like a search engine query, not a natural language question.
- Each query should be a concise string of keywords and operators."""


# Follow-up query generation focused on gap-closing and diversification.
# The prompt names three inputs the model will receive (original_queries,
# content, num_queries) and asks for exactly num_queries new keyword/operator
# queries that avoid near-duplicates of the ones already tried.
# Unlike the other prompt constants in this cell, the text ends with a
# trailing newline (the closing quotes sit on their own line).
# NOTE(review): not referenced by any other cell visible in this notebook —
# confirm whether it is still needed here.
FOLLOWUP_QUERY_SYSTEM_PROMPT = """IDENTITY and PURPOSE:
You are an expert follow-up query generator. Given the original queries that were already tried and a synthesized content summary of what they retrieved, propose new search queries that:
- Close gaps, surface missing perspectives, and reduce redundancy
- Target higher-evidence sources (peer-reviewed, meta-analyses), and recent works when appropriate
- Explore alternative methods, datasets, benchmarks, and negative/contradictory evidence

INPUTS YOU WILL RECEIVE:
- original_queries: the list of queries already used
- content: a compact summary of what was found (titles, brief snippets, instructions context)
- num_queries: the exact number of follow-up queries to return

GUIDELINES:
1. Maximize novelty and coverage relative to original_queries and the provided content.
2. Prefer queries that increase evidence quality, methodological rigor, and recency when relevant.
3. Propose targeted queries (concise keyword/operator strings), not natural language questions.
4. Consider synonyms, controlled vocabulary (e.g., MeSH terms for biomed), and domain-specific operators.
5. Include disconfirming or boundary-case queries where useful (e.g., failure modes, limitations, critiques).
6. Avoid near-duplicates of original_queries and avoid repeating obviously saturated angles.

OUTPUT INSTRUCTIONS:
- Return exactly num_queries follow-up queries.
- Each query must be a concise search-engine-style string of keywords and operators.
"""

# 2. Configuration

**Edit these parameters to customize the search behavior:**

In [None]:
# === SEARCH CONFIGURATION ===
# All values below are plain module-level settings read by the later cells.
USER_QUERY = "Find research papers on studies that use climate and hydrological modeling, LiDAR-derived snowpack data, and precipitation."

# --- Core search parameters ---
# Number of search refinement cycles (1-10)
MAX_RESEARCH_ITERATIONS = 1
# Stop when this quality is reached (0.0-1.0)
QUALITY_THRESHOLD = 0.7
# Minimum score to include results (0.0-1.0)
MIN_RELEVANCY_SCORE = 0.3

# --- Advanced features ---
# Include academic papers from Semantic Scholar
USE_SEMANTIC_SCHOLAR = True
# Enable source validation (ISSN-based whitelist) -- leads to 75 - 85% filtering of links
SOURCE_VALIDATION = True
# Fetch full content for high-scoring results
ENABLE_FULL_CONTENT_SCRAPING = True
# Score threshold for full content fetch
FULL_CONTENT_THRESHOLD = 0.7

# --- SearxNG configuration (used by both Deep Search and direct SearxNG test) ---
# Search engines to use
SEARXNG_ENGINES = [
    "crossref",
    "arxiv",
    "google_scholar",
    "semantic_scholar",
]
# Maximum results to fetch
SEARXNG_MAX_RESULTS = 50
# Maximum pages to search
SEARXNG_MAX_PAGES = 5
# Results per page
SEARXNG_RESULTS_PER_PAGE = 10
# Minimum score threshold
SEARXNG_SCORE_CUTOFF = 0.25

# --- Debug and performance ---
# Enable detailed logging
DEBUG_MODE = True
# Limit display (None = show all)
MAX_RESULTS_TO_DISPLAY = 1000

# Echo the key settings, one per line; the query is truncated to 50 characters
# with an ellipsis so a long query does not flood the output.
print(
    "📊 Configuration loaded:",
    f"   Query: {USER_QUERY if len(USER_QUERY) <= 50 else USER_QUERY[:50] + '...'}",
    f"   Max Iterations: {MAX_RESEARCH_ITERATIONS}",
    f"   Quality Threshold: {QUALITY_THRESHOLD}",
    f"   Source Validation: {SOURCE_VALIDATION}",
    f"   SearxNG Engines: {SEARXNG_ENGINES}",
    f"   SearxNG Max Results: {SEARXNG_MAX_RESULTS}",
    f"   Debug Mode: {DEBUG_MODE}",
    sep="\n",
)

# 3. Run Deep Search

In [None]:
# Run Deep Search Test
async def run_deep_search(user_query):
    """Run one deep literature search for ``user_query`` and save the results.

    Builds a SearxNG search tool and a DeepLitSearchAgent configuration from
    the module-level settings defined in the configuration cell, awaits the
    agent, prints a short summary, and writes the results to a timestamped
    JSON file under ``notebooks/`` (relative to the current working directory).

    Args:
        user_query: Query text handed to the agent; the category is fixed to
            ``"science"``.

    Returns:
        The output object returned by ``agent.arun``.
    """
    print("🚀 Initializing DeepLitSearchAgent...")
    searxng_config = SearxNGSearchToolConfig(
        max_results=SEARXNG_MAX_RESULTS,
        engines=SEARXNG_ENGINES,
        max_pages=SEARXNG_MAX_PAGES,
        results_per_page=SEARXNG_RESULTS_PER_PAGE,
        score_cutoff=SEARXNG_SCORE_CUTOFF,
        debug=DEBUG_MODE
    )
    search_tool = SearxNGSearchTool(config=searxng_config)
    # Create configuration
    config = DeepLitSearchAgentConfig(
        max_research_iterations=MAX_RESEARCH_ITERATIONS,
        quality_threshold=QUALITY_THRESHOLD,
        min_relevancy_score=MIN_RELEVANCY_SCORE,
        use_semantic_scholar=USE_SEMANTIC_SCHOLAR,
        search_tool=search_tool,
        source_validation=SOURCE_VALIDATION,
        enable_full_content_scraping=ENABLE_FULL_CONTENT_SCRAPING,
        full_content_threshold=FULL_CONTENT_THRESHOLD,
        enable_streaming=False,
        debug=DEBUG_MODE
    )

    # Initialize and run
    agent = DeepLitSearchAgent(config=config, search_tool=search_tool, debug=DEBUG_MODE)
    agent_input = LitSearchAgentInputSchema(query=user_query, category="science")

    print("🔎 Running deep search...")
    output = await agent.arun(agent_input)

    # Results summary
    num_results = len(output.results)
    # Fall back to 1 when the output object does not expose an iteration count.
    iterations = getattr(output, "iterations_performed", 1)
    # Use the same marker URL that the report and summary cells of this
    # notebook look for, so this summary line agrees with what they display.
    # NOTE(review): confirm the marker against the agent implementation.
    has_report = bool(output.results) and output.results[0].get("url") == "deep-research://report"

    print("✅ Search complete!")
    print(f"   📊 Total results: {num_results}")
    print(f"   🔄 Iterations: {iterations}")
    print(f"   📋 Research report: {'Yes' if has_report else 'No'}")

    # Save results - create directory first
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    results_dir = Path("notebooks")
    results_dir.mkdir(exist_ok=True)  # Create directory if it doesn't exist
    results_file = results_dir / f"deep_search_results_{timestamp}.json"

    # ensure_ascii=False writes non-ASCII characters verbatim, so the file must
    # be opened as UTF-8 explicitly; the platform default encoding may not be
    # able to represent them.
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump({
            "query": user_query,
            "category": output.category,
            "iterations_performed": iterations,
            "results": output.results
        }, f, indent=2, ensure_ascii=False)

    print(f"💾 Results saved to: {results_file}")
    return output

In [None]:
USER_QUERY = "Find research papers on basin-scale water budgets, focusing on the role of groundwater as a stable and important source for streamflow."

output = await run_deep_search(
    USER_QUERY
)
print(output)

# 4. Results Synthesis

In [None]:
# Display Research Report (if generated)
# Guard first: nothing to show until the search cell has bound a non-empty `output`.
if 'output' not in globals() or not output.results:
    print("⚠️  No results available. Run the search cell first!")
else:
    # The report is recognized by a marker URL on the first result only.
    report = output.results[0] if output.results[0].get("url") == "deep-research://report" else None

    if not report:
        print("📋 No research synthesis report generated")
    else:
        print("📋 RESEARCH SYNTHESIS REPORT")
        print("=" * 80)
        content = report.get("content", "No content available")
        print(content)
        print("=" * 80)

## References for Synthesis

In [None]:
# Display Search Results Summary
def show_search_results(search_results, max_to_display=None):
    """Print quality metrics and a per-paper listing for ``search_results``.

    Args:
        search_results: Sequence of result dicts. The keys read are ``title``,
            ``relevancy_score``, ``author``, ``url`` and ``summary``; all are
            optional.
        max_to_display: Maximum number of entries to list. ``None`` lists
            everything; values below zero are treated as zero.
    """
    total = len(search_results)
    print(f"🔍 SEARCH RESULTS SUMMARY ({total} papers)")
    print("=" * 80)

    # Calculate quality metrics from numeric scores only.
    scores = [
        r.get("relevancy_score")
        for r in search_results
        if isinstance(r.get("relevancy_score"), (int, float))
    ]
    if scores:
        avg_score = sum(scores) / len(scores)
        print(f"📈 Quality Metrics: Avg={avg_score:.2f}, Min={min(scores):.2f}, Max={max(scores):.2f}")
        print("-" * 40)

    # Clamp the limit into [0, total] so slicing and the "more results" count
    # below always agree with what is actually listed.
    display_count = total if max_to_display is None else max(0, min(max_to_display, total))

    for i, result in enumerate(search_results[:display_count], 1):
        print(f"\n📄 [{i}] {result.get('title', 'Untitled')}")

        # Show relevancy score; same numeric guard as the metrics above so a
        # non-numeric score is skipped instead of breaking the ':.2f' format.
        score = result.get('relevancy_score')
        if isinstance(score, (int, float)):
            print(f"    📊 Relevancy: {score:.2f}")

        # Show author if available
        author = result.get('author')
        if author:
            print(f"    👤 Author: {author}")

        # Show URL
        url = result.get('url', '')
        if url:
            print(f"    🔗 {url}")

        # Show summary if available, truncated to 200 characters
        summary = result.get('summary', '')
        if summary:
            summary_preview = summary[:200] + "..." if len(summary) > 200 else summary
            print(f"    📝 Summary: {summary_preview}")

        print("-" * 40)

    hidden = total - display_count
    if hidden > 0:
        print(f"\n... and {hidden} more results")


if 'output' in globals() and output.results:
    # Everything except the synthesized report entry counts as a search result.
    search_results = [r for r in output.results if r.get("url") != "deep-research://report"]

    if search_results:
        show_search_results(search_results, MAX_RESULTS_TO_DISPLAY)
    else:
        print("📋 No search results found")

else:
    print("⚠️  No results available. Run the search cell first!")

# 5. SearxNG Direct Testing

Test the underlying SearxNG search tool directly, using the same configuration constants and the current `USER_QUERY` used by Deep Search.

In [None]:
searxng_config = SearxNGSearchToolConfig(
        max_results=SEARXNG_MAX_RESULTS,
        engines=SEARXNG_ENGINES,
        max_pages=SEARXNG_MAX_PAGES,
        results_per_page=SEARXNG_RESULTS_PER_PAGE,
        score_cutoff=SEARXNG_SCORE_CUTOFF,
        debug=DEBUG_MODE
)
search_tool = SearxNGSearchTool(config=searxng_config)
searxng_input = SearxNGSearchToolInputSchema(queries=[USER_QUERY], category="science", max_results=SEARXNG_MAX_RESULTS)

searxng_output = await search_tool.arun(searxng_input)

print(f"✅ Found {len(searxng_output.results)} results")

# Display results
for i, result in enumerate(searxng_output.results, 1):
    print(f"\n{i}. {result.title}")
    print(f"   {result.url}")
    if hasattr(result, 'content') and result.content:
        content_preview = result.content[:150] + "..." if len(result.content) > 150 else result.content
        print(f"   {content_preview}")
    print("-" * 60)