In [1]:
# # Auto-Update (disabled -- uncomment this whole cell to enable): Sync with latest repository changes
# import subprocess
# import sys
# from pathlib import Path

# def update_repository():
#     """Update the repository and reinstall the package if needed."""
#     try:
#         print("🔄 Checking for repository updates...")
        
#         # Check if we're in a git repository
#         repo_root = Path.cwd()
#         while repo_root != repo_root.parent and not (repo_root / '.git').exists():
#             repo_root = repo_root.parent
            
#         if not (repo_root / '.git').exists():
#             print("⚠️  Not in a git repository, skipping update")
#             return
            
#         # Fetch latest changes
#         result = subprocess.run(['git', 'fetch'], capture_output=True, text=True, cwd=repo_root)
#         if result.returncode != 0:
#             print(f"⚠️  Git fetch failed: {result.stderr}")
#             return
            
#         # Check if there are updates
#         result = subprocess.run(['git', 'status', '-uno'], capture_output=True, text=True, cwd=repo_root)
#         if "Your branch is behind" in result.stdout:
#             print("📥 Updates found, pulling latest changes...")
            
#             # Pull latest changes
#             pull_result = subprocess.run(['git', 'pull'], capture_output=True, text=True, cwd=repo_root)
#             if pull_result.returncode != 0:
#                 print(f"❌ Git pull failed: {pull_result.stderr}")
#                 return
                
#             print("✅ Repository updated successfully")
            
#             # Check if pyproject.toml or requirements changed
#             changed_files = subprocess.run(['git', 'diff', 'HEAD@{1}', '--name-only'], 
#                                          capture_output=True, text=True, cwd=repo_root)
            
#             if any(file in changed_files.stdout for file in ['pyproject.toml', 'requirements.txt', 'setup.py']):
#                 print("📦 Dependencies changed, reinstalling package...")
                
#                 # Reinstall in development mode
#                 install_result = subprocess.run([sys.executable, '-m', 'pip', 'install', '-e', '.'], 
#                                               capture_output=True, text=True, cwd=repo_root)
                
#                 if install_result.returncode == 0:
#                     print("✅ Package reinstalled successfully")
#                 else:
#                     print(f"⚠️  Package reinstall failed: {install_result.stderr}")
                    
#             print("🔄 Please restart kernel if major changes were made")
#         else:
#             print("✅ Repository is up to date")
            
#     except Exception as e:
#         print(f"❌ Update failed: {e}")

# # Run update check
# update_repository()

# Deep Search Agent Testing Notebook

A streamlined testing environment for the DeepLitSearchAgent, with an optional repository auto-update cell (commented out by default).

## Quick Start
1. **Auto-Update** (optional): The cell above is commented out; uncomment and run it to sync with the latest repository changes
2. **Setup**: Run the imports cell below
3. **Configure**: Adjust parameters in the configuration section  
4. **Test**: Set your query and run the search
5. **Analyze**: Review results in the analysis section

## What is DeepLitSearchAgent?
An advanced multi-agent system for literature search with:
- **Iterative refinement** - Improves search quality over multiple rounds
- **Quality assessment** - Filters results using relevancy scoring  
- **Source validation (ISSN whitelist)** - Ensures high-quality peer-reviewed sources
- **Comprehensive synthesis** - Generates research reports from findings

---
**⚠️ Note**: If you see "Please restart kernel" after updates, restart your Jupyter kernel to use the latest code.

# 1. Setup and Imports

In [2]:
import asyncio
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any
from IPython.display import display, Markdown
import warnings
warnings.filterwarnings('ignore')

# AKD imports
from akd.agents.search.deep_search import DeepLitSearchAgent, DeepLitSearchAgentConfig
from akd.agents.search._base import LitSearchAgentInputSchema
from akd.tools.search.searxng_search import SearxNGSearchTool, SearxNGSearchToolConfig, SearxNGSearchToolInputSchema

print("✅ Setup complete!")

✅ Setup complete!


In [None]:
# Deep Research Agent Prompts
# NOTE(review): none of the prompt constants defined in this cell are referenced by the
# other cells visible in this notebook -- presumably a reference copy of the prompts
# used by the agent components; confirm before relying on them.

# Prompt for the clarifying step: tells the model to ask 2-3 concise questions and to
# answer with JSON holding clarifying_questions, needs_clarification and reasoning.
CLARIFYING_AGENT_PROMPT = """ROLE:
You are an expert research assistant that elicits only the minimum, high-signal clarifications needed to run a deep literature search.

INPUT:
- The user's current research query and any known context.

GOALS:
1) Reduce ambiguity (scope, timeframe, subtopics, definitions)
2) Capture constraints and preferences (sources, depth, style)
3) Confirm intended outcome (report type, deliverables)

INSTRUCTIONS:
- Ask 2–3 concise questions that directly improve research quality.
- Avoid asking for information already provided.
- Prefer bullet/numbered questions; keep them easy to answer.
- Maintain a professional, encouraging tone.

OUTPUT:
- Return exactly the JSON schema required by the tool (no extra text):
  {"clarifying_questions": string[], "needs_clarification": boolean, "reasoning": string}
"""

# Prompt for the instruction-building step: asks for first-person research instructions
# with explicit sections, returned as JSON holding research_instructions,
# search_strategy and key_concepts.
RESEARCH_INSTRUCTION_AGENT_PROMPT = """ROLE:
You design precise research instructions for a deep literature search pipeline.

INPUT:
- The user's (possibly enriched) query and any clarifications.

OBJECTIVES:
- Maximize specificity without inventing facts.
- Capture depth, breadth, outputs, and constraints.
- Mark unspecified dimensions as open-ended.
 - Emphasize truthful, evidence-based work and proper source citation.

FORMAT:
- Write in first person ("I need...").
- Include explicit sections: Objectives, Scope, Keywords/Queries, Sources, Methods, Deliverables, Quality/Citation Requirements.
- Ask for tables/comparisons if helpful.

OUTPUT:
- Return exactly the JSON schema required by the tool (no extra text):
  {"research_instructions": string, "search_strategy": string, "key_concepts": string[]}
"""

# Prompt for the triage step: routes a single query to one of "clarify",
# "instructions" or "research", returned as JSON together with needs_clarification
# and reasoning.
TRIAGE_AGENT_PROMPT = """ROLE:
You triage research queries to decide: clarify, build instructions, or direct research.

INPUT:
- A single user query string.

DECISION RULES:
- Needs Clarification: vague scope, missing parameters, multiple interpretations, unclear goal.
- Ready for Instructions: clear scope, aspects identified, depth/type apparent.
- Direct Research (rare): extremely specific and complete.

OUTPUT:
- Return exactly the JSON schema required by the tool (no extra text):
  {"routing_decision": "clarify"|"instructions"|"research",
   "needs_clarification": boolean,
   "reasoning": string}
"""

# Template for condensing one source down to the passages relevant to the question.
# Contains the placeholders {research_question}, {source_title}, {source_url},
# {content} and {target_tokens} -- presumably filled with str.format; confirm against
# the caller. The sentinel reply for an irrelevant source is "[NO RELEVANT CONTENT]".
CONTENT_CONDENSATION_PROMPT = """ROLE: Extract only content relevant to the research question.

RESEARCH QUESTION: {research_question}
SOURCE TITLE: {source_title}
SOURCE URL: {source_url}

FULL CONTENT:
{content}

INSTRUCTIONS:
- Keep only directly relevant passages.
- Target ~{target_tokens} tokens. If nothing relevant: output exactly [NO RELEVANT CONTENT].
"""

# Prompt for the synthesis step: asks for a report plus key_findings,
# sources_consulted, evidence_quality_score and citations.
# NOTE(review): a recorded run in this notebook failed with "'ResearchSynthesisOutputSchema'
# object has no attribute 'sources_consulted'" -- verify that the output fields listed
# below still match the schema the tool enforces.
DEEP_RESEARCH_AGENT_PROMPT = """ROLE:
You synthesize condensed literature into a rigorous research report.

INPUT:
- Research results (titles, URLs, condensed content)
- Research instructions and brief context (iterations, quality)

PROCESS:
- Evaluate credibility and relevance; connect findings; note conflicts and gaps.
 - Verify factuality by cross-checking claims across sources.
 - Cite sources inline for every substantive claim; avoid fabrications.

OUTPUT (STRICT JSON SCHEMA ENFORCEMENT BY TOOL):
- research_report: markdown with sections, citations inline
- key_findings: bullet list
- sources_consulted: list of URLs
- evidence_quality_score: 0.0–1.0
- citations: structured per source

STYLE:
- Evidence-based, objective, academic tone. Distinguish facts vs interpretation.
 - Call out uncertainties and inconsistencies explicitly.
"""

# 2. Configuration

**Edit these parameters to customize the search behavior:**

In [4]:
# === SEARCH CONFIGURATION ===
# Every tunable for the notebook lives in this cell; the cell ends by echoing the
# most important values.
USER_QUERY = "Find research papers on studies that use climate and hydrological modeling, LiDAR-derived snowpack data, and precipitation."

# --- Core search parameters ---
# Number of search refinement cycles (1-10).
MAX_RESEARCH_ITERATIONS = 1
# Stop once this quality is reached (0.0-1.0).
QUALITY_THRESHOLD = 0.7
# Results scoring below this are not included (0.0-1.0).
MIN_RELEVANCY_SCORE = 0.3

# --- Advanced features ---
# Include academic papers from Semantic Scholar.
USE_SEMANTIC_SCHOLAR = True
# ISSN-based source whitelist; leads to 75 - 85% of links being filtered out.
SOURCE_VALIDATION = True
# Fetch full content for high-scoring results ...
ENABLE_FULL_CONTENT_SCRAPING = True
# ... where "high-scoring" means at or above this score threshold.
FULL_CONTENT_THRESHOLD = 0.7

# --- SearxNG (shared by Deep Search and the direct SearxNG test) ---
SEARXNG_ENGINES = [
    "crossref",
    "arxiv",
    "google_scholar",
    "semantic_scholar",
]
SEARXNG_MAX_RESULTS = 50           # upper bound on fetched results
SEARXNG_MAX_PAGES = 5              # upper bound on searched pages
SEARXNG_RESULTS_PER_PAGE = 10      # page size
SEARXNG_SCORE_CUTOFF = 0.25        # minimum score threshold

# --- Debug and display ---
DEBUG_MODE = True                  # detailed logging
MAX_RESULTS_TO_DISPLAY = 1000      # display cap; None shows everything

# Echo the effective configuration, one "   label: value" line per setting.
print("📊 Configuration loaded:")
print("\n".join(
    f"   {label}: {value}"
    for label, value in [
        ("Query", USER_QUERY[:50] + ("..." if len(USER_QUERY) > 50 else "")),
        ("Max Iterations", MAX_RESEARCH_ITERATIONS),
        ("Quality Threshold", QUALITY_THRESHOLD),
        ("Source Validation", SOURCE_VALIDATION),
        ("SearxNG Engines", SEARXNG_ENGINES),
        ("SearxNG Max Results", SEARXNG_MAX_RESULTS),
        ("Debug Mode", DEBUG_MODE),
    ]
))

📊 Configuration loaded:
   Query: Find research papers on studies that use climate a...
   Max Iterations: 1
   Quality Threshold: 0.7
   Source Validation: True
   SearxNG Engines: ['crossref', 'arxiv', 'google_scholar', 'semantic_scholar']
   SearxNG Max Results: 50
   Debug Mode: True


# 3. Run Deep Search

In [5]:
# Run Deep Search Test
async def run_deep_search(user_query):
    """Run a DeepLitSearchAgent search for ``user_query`` and save the results.

    Builds a SearxNG search tool and an agent configuration from the
    notebook-level configuration constants, runs the agent, prints a short
    summary, and writes the results to
    ``notebooks/deep_search_results_<timestamp>.json`` (relative to the current
    working directory).

    Args:
        user_query: The research query handed to the agent.

    Returns:
        The output object returned by ``agent.arun``.
    """
    # Marker URL of the synthesized report entry; must match the value the
    # result-display cells of this notebook look for.
    report_url = "deep-research://report"

    print("🚀 Initializing DeepLitSearchAgent...")
    searxng_config = SearxNGSearchToolConfig(
        max_results=SEARXNG_MAX_RESULTS,
        engines=SEARXNG_ENGINES,
        max_pages=SEARXNG_MAX_PAGES,
        results_per_page=SEARXNG_RESULTS_PER_PAGE,
        score_cutoff=SEARXNG_SCORE_CUTOFF,
        debug=DEBUG_MODE
    )
    search_tool = SearxNGSearchTool(config=searxng_config)
    # Create configuration
    config = DeepLitSearchAgentConfig(
        max_research_iterations=MAX_RESEARCH_ITERATIONS,
        quality_threshold=QUALITY_THRESHOLD,
        min_relevancy_score=MIN_RELEVANCY_SCORE,
        use_semantic_scholar=USE_SEMANTIC_SCHOLAR,
        search_tool=search_tool,
        source_validation=SOURCE_VALIDATION,
        enable_full_content_scraping=ENABLE_FULL_CONTENT_SCRAPING,
        full_content_threshold=FULL_CONTENT_THRESHOLD,
        enable_streaming=False,
        debug=DEBUG_MODE
    )

    # Initialize and run
    agent = DeepLitSearchAgent(config=config, search_tool=search_tool, debug=DEBUG_MODE)
    agent_input = LitSearchAgentInputSchema(query=user_query, category="science")

    print("🔎 Running deep search...")
    output = await agent.arun(agent_input)

    # Results summary
    num_results = len(output.results)
    # Fall back to 1 when the output object does not expose an iteration count.
    iterations = getattr(output, "iterations_performed", 1)
    # The report, when present, is expected to be the first entry of the results.
    has_report = bool(output.results) and output.results[0].get("url") == report_url

    print("✅ Search complete!")
    print(f"   📊 Total results: {num_results}")
    print(f"   🔄 Iterations: {iterations}")
    print(f"   📋 Research report: {'Yes' if has_report else 'No'}")

    # Save results - create directory first
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    results_dir = Path("notebooks")
    results_dir.mkdir(exist_ok=True)  # Create directory if it doesn't exist
    results_file = results_dir / f"deep_search_results_{timestamp}.json"

    # ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
    # opened as UTF-8 explicitly rather than with the platform default encoding.
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump({
            "query": user_query,
            "category": output.category,
            "iterations_performed": iterations,
            "results": output.results
        }, f, indent=2, ensure_ascii=False)

    print(f"💾 Results saved to: {results_file}")
    return output

In [6]:
USER_QUERY = "Find research papers on basin-scale water budgets, focusing on the role of groundwater as a stable and important source for streamflow."

output = await run_deep_search(
    USER_QUERY
)
print(output)

🚀 Initializing DeepLitSearchAgent...


[32m2025-08-27 10:27:43.545[0m | [34m[1mDEBUG   [0m | [36makd.tools.scrapers.omni[0m:[36m_setup_converter[0m:[36m150[0m - [34m[1mDocling format options :: {<InputFormat.HTML: 'html'>: HTMLFormatOption(pipeline_cls=<class 'docling.pipeline.simple_pipeline.SimplePipeline'>, pipeline_options=PipelineOptions(create_legacy_output=True, document_timeout=None, accelerator_options=AcceleratorOptions(num_threads=4, device='auto', cuda_use_flash_attention2=False), enable_remote_services=False, allow_external_plugins=False), backend=<class 'docling.backend.html_backend.HTMLDocumentBackend'>), <InputFormat.PDF: 'pdf'>: PdfFormatOption(pipeline_cls=<class 'docling.pipeline.standard_pdf_pipeline.StandardPdfPipeline'>, pipeline_options=PdfPipelineOptions(create_legacy_output=True, document_timeout=None, accelerator_options=AcceleratorOptions(num_threads=4, device='auto', cuda_use_flash_attention2=False), enable_remote_services=False, allow_external_plugins=False, artifacts_path=None, ima

🔎 Running deep search...


[32m2025-08-27 10:27:45.221[0m | [34m[1mDEBUG   [0m | [36makd.agents.search.components.triage[0m:[36mprocess[0m:[36m84[0m - [34m[1mTriage decision: research[0m
[32m2025-08-27 10:27:45.222[0m | [34m[1mDEBUG   [0m | [36makd.agents.search.components.triage[0m:[36mprocess[0m:[36m85[0m - [34m[1mNeeds clarification: False[0m
[32m2025-08-27 10:27:45.222[0m | [34m[1mDEBUG   [0m | [36makd.agents.search.components.triage[0m:[36mprocess[0m:[36m86[0m - [34m[1mTriage output preview | reasoning: The query is specific and complete, requesting research papers on a defined topic related to basin-scale water budgets and groundwater's role in streamflow.[0m
[32m2025-08-27 10:27:45.222[0m | [34m[1mDEBUG   [0m | [36makd.agents.search.deep_search[0m:[36m_handle_triage[0m:[36m307[0m - [34m[1mTriage decision: research[0m
[32m2025-08-27 10:27:45.223[0m | [34m[1mDEBUG   [0m | [36makd.agents.search.deep_search[0m:[36m_handle_triage[0m:[36m308[0m 

AttributeError: 'ResearchSynthesisOutputSchema' object has no attribute 'sources_consulted'

# 4. Results Synthesis

In [None]:
# Display Research Report (if generated)
# The report is recognised by its marker URL on the first entry of output.results.
if 'output' not in globals() or not output.results:
    print("⚠️  No results available. Run the search cell first!")
else:
    report = (
        output.results[0]
        if output.results[0].get("url") == "deep-research://report"
        else None
    )

    if not report:
        print("📋 No research synthesis report generated")
    else:
        # Header and top rule, then the report body, then the closing rule.
        print("📋 RESEARCH SYNTHESIS REPORT", "=" * 80, sep="\n")
        content = report.get("content", "No content available")
        print(content, "=" * 80, sep="\n")

📋 RESEARCH SYNTHESIS REPORT
# Research Report on Basin-Scale Water Budgets and Groundwater Contributions to Streamflow

## Introduction
This report synthesizes recent research on basin-scale water budgets with a focus on the role of groundwater as a stable and significant source for streamflow. The aim is to understand the methodologies used to assess groundwater contributions, explore case studies from various basins, and evaluate the implications of groundwater on hydrological models.

## Methodologies for Assessing Groundwater Contributions
Several methodologies have been employed to assess groundwater contributions to streamflow. The Hydrologic Engineering Center-Hydrologic Modeling System (HEC-HMS) has been widely used for both event-based and continuous hydrological modeling. Event-based modeling is effective for short-term hydrological responses, while continuous modeling captures long-term processes such as soil moisture dynamics and groundwater contributions [Hydrology, 2025](

## References for Synthesis

In [None]:
# Display Search Results Summary
# Prints aggregate relevancy statistics followed by one entry per result (title,
# relevancy score, author, URL and a summary preview), capped at
# MAX_RESULTS_TO_DISPLAY entries.
if 'output' in globals() and output.results:
    # The synthesized report is stored among the results under a marker URL; skip it.
    search_results = [r for r in output.results if r.get("url") != "deep-research://report"]

    if search_results:
        print(f"🔍 SEARCH RESULTS SUMMARY ({len(search_results)} papers)")
        print("=" * 80)

        # Calculate quality metrics over numeric scores only
        scores = [r.get("relevancy_score") for r in search_results if isinstance(r.get("relevancy_score"), (int, float))]
        if scores:
            avg_score = sum(scores) / len(scores)
            print(f"📈 Quality Metrics: Avg={avg_score:.2f}, Min={min(scores):.2f}, Max={max(scores):.2f}")
            print("-" * 40)

        # Display results (limited by MAX_RESULTS_TO_DISPLAY; None = show all).
        # Clamp at 0 so a negative limit cannot turn into a negative slice bound.
        if MAX_RESULTS_TO_DISPLAY is None:
            display_count = len(search_results)
        else:
            display_count = max(0, min(MAX_RESULTS_TO_DISPLAY, len(search_results)))

        for i, result in enumerate(search_results[:display_count], 1):
            print(f"\n📄 [{i}] {result.get('title', 'Untitled')}")

            # Show relevancy score. Only numeric scores can take the ".2f" format;
            # any other non-None value is printed as-is instead of raising.
            score = result.get('relevancy_score')
            if isinstance(score, (int, float)):
                print(f"    📊 Relevancy: {score:.2f}")
            elif score is not None:
                print(f"    📊 Relevancy: {score}")

            # Show author if available
            author = result.get('author')
            if author:
                print(f"    👤 Author: {author}")

            # Show URL
            url = result.get('url', '')
            if url:
                print(f"    🔗 {url}")

            # Show summary if available, truncated to 200 characters
            summary = result.get('summary', '')
            if summary:
                summary_preview = summary[:200] + "..." if len(summary) > 200 else summary
                print(f"    📝 Summary: {summary_preview}")

            print("-" * 40)

        # Report how many results were cut off, based on what was actually shown
        # (so a limit of 0 still reports the hidden results).
        hidden_count = len(search_results) - display_count
        if hidden_count > 0:
            print(f"\n... and {hidden_count} more results")

    else:
        print("📋 No search results found")

else:
    print("⚠️  No results available. Run the search cell first!")

🔍 SEARCH RESULTS SUMMARY (9 papers)
📈 Quality Metrics: Avg=0.93, Min=0.35, Max=1.00
----------------------------------------

📄 [1] Insights On Streamflow Predictability Across Scales Using Horizontal Visibility Graph Based Networks
    📊 Relevancy: 0.35
    🔗 http://arxiv.org/abs/1912.03343v1
----------------------------------------

📄 [2] Higher Frozen Soil Permeability Represented in a Hydrological Model Improves Spring Streamflow Prediction From River Basin to Continental Scales
    📊 Relevancy: 1.00
    🔗 https://doi.org/10.1029/2022WR033075
----------------------------------------

📄 [3] Event-Based vs. Continuous Hydrological Modeling with HEC-HMS: A Review of Use Cases, Methodologies, and Performance Metrics
    📊 Relevancy: 1.00
    🔗 https://doi.org/10.3390/hydrology12020039
----------------------------------------

📄 [4] A New Approach for Assessing Groundwater Recharge by Combining GRACE and Baseflow With Case Studies in Karst Areas of Southwest China
    📊 Relevancy: 1.00


# 5. SearxNG Direct Testing

Test the underlying SearxNG search tool with the same configuration and query used by Deep Search.

In [None]:
searxng_config = SearxNGSearchToolConfig(
        max_results=SEARXNG_MAX_RESULTS,
        engines=SEARXNG_ENGINES,
        max_pages=SEARXNG_MAX_PAGES,
        results_per_page=SEARXNG_RESULTS_PER_PAGE,
        score_cutoff=SEARXNG_SCORE_CUTOFF,
        debug=DEBUG_MODE
)
search_tool = SearxNGSearchTool(config=searxng_config)
searxng_input = SearxNGSearchToolInputSchema(queries=[USER_QUERY], category="science", max_results=SEARXNG_MAX_RESULTS)

searxng_output = await search_tool.arun(searxng_input)

print(f"✅ Found {len(searxng_output.results)} results")

# Display results
for i, result in enumerate(searxng_output.results, 1):
    print(f"\n{i}. {result.title}")
    print(f"   {result.url}")
    if hasattr(result, 'content') and result.content:
        content_preview = result.content[:150] + "..." if len(result.content) > 150 else result.content
        print(f"   {content_preview}")
    print("-" * 60)

[32m2025-08-27 10:23:36.618[0m | [34m[1mDEBUG   [0m | [36makd._base[0m:[36marun[0m:[36m231[0m - [34m[1mRunning SearxNGSearchTool with params: queries=['Find research papers on basin-scale water budgets, focusing on the role of groundwater as a stable and important source for streamflow.'] category='science' max_results=50[0m
[32m2025-08-27 10:23:36.619[0m | [1mINFO    [0m | [36makd.tools.search.searxng_search[0m:[36m_arun[0m:[36m293[0m - [1m🔍 SearxNG SEARCH QUERIES (1 total):[0m
[32m2025-08-27 10:23:36.619[0m | [1mINFO    [0m | [36makd.tools.search.searxng_search[0m:[36m_arun[0m:[36m295[0m - [1m  1. 'Find research papers on basin-scale water budgets, focusing on the role of groundwater as a stable and important source for streamflow.'[0m
[32m2025-08-27 10:23:36.619[0m | [1mINFO    [0m | [36makd.tools.search.searxng_search[0m:[36m_arun[0m:[36m296[0m - [1m🎯 Target results per query: 50[0m
[32m2025-08-27 10:23:36.620[0m | [1mINFO    [0

✅ Found 32 results

1. The Relationship Between Groundwater Nitrate Pollution and Crime in United States: Nitrate-Crime Hypothesis
   http://arxiv.org/abs/2306.09354v1
   Groundwater is a crucial source of drinking water, but it is often contaminated with water-soluble pollutants that can pose significant health risks. ...
------------------------------------------------------------

2. Insights On Streamflow Predictability Across Scales Using Horizontal Visibility Graph Based Networks
   http://arxiv.org/abs/1912.03343v1
   Streamflow is a dynamical process that integrates water movement in space and time within basin boundaries. The authors characterize the dynamics asso...
------------------------------------------------------------

3. Developments in Water Science (Groundwater Quality - An Important Factor for Selecting Handpumps)
   https://linkinghub.elsevier.com/retrieve/pii/S0167564808705616
------------------------------------------------------------

4. Statistical modeling 