In [6]:
# # Auto-Update: Sync with latest repository changes
# import subprocess
# import sys
# from pathlib import Path

# def update_repository():
#     """Update the repository and reinstall the package if needed."""
#     try:
#         print("🔄 Checking for repository updates...")
        
#         # Check if we're in a git repository
#         repo_root = Path.cwd()
#         while repo_root != repo_root.parent and not (repo_root / '.git').exists():
#             repo_root = repo_root.parent
            
#         if not (repo_root / '.git').exists():
#             print("⚠️  Not in a git repository, skipping update")
#             return
            
#         # Fetch latest changes
#         result = subprocess.run(['git', 'fetch'], capture_output=True, text=True, cwd=repo_root)
#         if result.returncode != 0:
#             print(f"⚠️  Git fetch failed: {result.stderr}")
#             return
            
#         # Check if there are updates
#         result = subprocess.run(['git', 'status', '-uno'], capture_output=True, text=True, cwd=repo_root)
#         if "Your branch is behind" in result.stdout:
#             print("📥 Updates found, pulling latest changes...")
            
#             # Pull latest changes
#             pull_result = subprocess.run(['git', 'pull'], capture_output=True, text=True, cwd=repo_root)
#             if pull_result.returncode != 0:
#                 print(f"❌ Git pull failed: {pull_result.stderr}")
#                 return
                
#             print("✅ Repository updated successfully")
            
#             # Check if pyproject.toml or requirements changed
#             changed_files = subprocess.run(['git', 'diff', 'HEAD@{1}', '--name-only'], 
#                                          capture_output=True, text=True, cwd=repo_root)
            
#             if any(file in changed_files.stdout for file in ['pyproject.toml', 'requirements.txt', 'setup.py']):
#                 print("📦 Dependencies changed, reinstalling package...")
                
#                 # Reinstall in development mode
#                 install_result = subprocess.run([sys.executable, '-m', 'pip', 'install', '-e', '.'], 
#                                               capture_output=True, text=True, cwd=repo_root)
                
#                 if install_result.returncode == 0:
#                     print("✅ Package reinstalled successfully")
#                 else:
#                     print(f"⚠️  Package reinstall failed: {install_result.stderr}")
                    
#             print("🔄 Please restart kernel if major changes were made")
#         else:
#             print("✅ Repository is up to date")
            
#     except Exception as e:
#         print(f"❌ Update failed: {e}")

# # Run update check
# update_repository()

# Deep Search Agent Testing Notebook

A streamlined testing environment for the DeepLitSearchAgent with automatic repository updates.

## Quick Start
1. **Auto-Update**: The cell above automatically syncs with the latest repository changes
2. **Setup**: Run the imports cell below
3. **Configure**: Adjust parameters in the configuration section  
4. **Test**: Set your query and run the search
5. **Analyze**: Review results in the analysis section

## What is DeepLitSearchAgent?
An advanced multi-agent system for literature search with:
- **Iterative refinement** - Improves search quality over multiple rounds
- **Quality assessment** - Filters results using relevancy scoring  
- **ISSN validation** - Ensures high-quality peer-reviewed sources
- **Comprehensive synthesis** - Generates research reports from findings

---
**⚠️ Note**: If you see "Please restart kernel" after updates, restart your Jupyter kernel to use the latest code.

# 1. Setup and Imports

In [7]:
import asyncio
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any
from IPython.display import display, Markdown
import warnings
warnings.filterwarnings('ignore')

# AKD imports
from akd.agents.search.deep_search import DeepLitSearchAgent, DeepLitSearchAgentConfig
from akd.agents.search._base import LitSearchAgentInputSchema

print("✅ Setup complete!")

✅ Setup complete!


# 2. Configuration

**Edit these parameters to customize the search behavior:**

In [8]:
# === SEARCH CONFIGURATION ===
USER_QUERY = "Find research papers on studies that use climate and hydrological modeling, LiDAR-derived snowpack data, and precipitation."

# Core Search Parameters
MAX_RESEARCH_ITERATIONS = 1       # Number of search refinement cycles (1-10)
QUALITY_THRESHOLD = 0.7            # Stop when this quality is reached (0.0-1.0)
MIN_RELEVANCY_SCORE = 0.3          # Minimum score to include results (0.0-1.0)

# Advanced Features  
USE_SEMANTIC_SCHOLAR = True        # Include academic papers from Semantic Scholar
ENABLE_ISSN_FILTER = True          # Filter to high-quality peer-reviewed journals
ENABLE_FULL_CONTENT_SCRAPING = True # Fetch full content for high-scoring results
FULL_CONTENT_THRESHOLD = 0.7       # Score threshold for full content fetch

# SearxNG Configuration (used by both Deep Search and direct SearxNG test)
SEARXNG_ENGINES = ["google", "arxiv", "google_scholar"]  # Search engines to use
SEARXNG_MAX_RESULTS = 50           # Maximum results to fetch
SEARXNG_MAX_PAGES = 5              # Maximum pages to search
SEARXNG_RESULTS_PER_PAGE = 10      # Results per page
SEARXNG_SCORE_CUTOFF = 0.25        # Minimum score threshold

# Debug and Performance
DEBUG_MODE = True                  # Enable detailed logging
MAX_RESULTS_TO_DISPLAY = 1000      # Limit display (None = show all)

print("📊 Configuration loaded:")
print(f"   Query: {USER_QUERY[:50]}{'...' if len(USER_QUERY) > 50 else ''}")
print(f"   Max Iterations: {MAX_RESEARCH_ITERATIONS}")
print(f"   Quality Threshold: {QUALITY_THRESHOLD}")
print(f"   ISSN Filter: {ENABLE_ISSN_FILTER}")
print(f"   SearxNG Engines: {SEARXNG_ENGINES}")
print(f"   SearxNG Max Results: {SEARXNG_MAX_RESULTS}")
print(f"   Debug Mode: {DEBUG_MODE}")

📊 Configuration loaded:
   Query: Find research papers on studies that use climate a...
   Max Iterations: 1
   Quality Threshold: 0.7
   ISSN Filter: True
   SearxNG Engines: ['google', 'arxiv', 'google_scholar']
   SearxNG Max Results: 50
   Debug Mode: True


# 3. Run Deep Search

In [9]:
# Run Deep Search Test
async def run_deep_search(user_query):
    print("🚀 Initializing DeepLitSearchAgent...")

    # Create configuration
    config = DeepLitSearchAgentConfig(
        max_research_iterations=MAX_RESEARCH_ITERATIONS,
        quality_threshold=QUALITY_THRESHOLD,
        min_relevancy_score=MIN_RELEVANCY_SCORE,
        use_semantic_scholar=USE_SEMANTIC_SCHOLAR,
        enable_issn_filter=ENABLE_ISSN_FILTER,
        enable_full_content_scraping=ENABLE_FULL_CONTENT_SCRAPING,
        full_content_threshold=FULL_CONTENT_THRESHOLD,
        enable_streaming=False,
        debug=DEBUG_MODE
    )

    # Initialize and run
    agent = DeepLitSearchAgent(config=config)
    agent_input = LitSearchAgentInputSchema(query=user_query, category="science")

    print("🔎 Running deep search...")
    output = await agent.arun(agent_input)

    # Results summary
    num_results = len(output.results)
    iterations = getattr(output, "iterations_performed", 1)
    has_report = output.results and output.results[0].get("url") == "deep-search://report"

    print(f"✅ Search complete!")
    print(f"   📊 Total results: {num_results}")
    print(f"   🔄 Iterations: {iterations}")
    print(f"   📋 Research report: {'Yes' if has_report else 'No'}")

    # Save results - create directory first
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    results_dir = Path("notebooks")
    results_dir.mkdir(exist_ok=True)  # Create directory if it doesn't exist
    results_file = results_dir / f"deep_search_results_{timestamp}.json"

    with open(results_file, 'w') as f:
        json.dump({
            "query": user_query,
            "category": output.category,
            "iterations_performed": iterations,
            "results": output.results
        }, f, indent=2, ensure_ascii=False)

    print(f"💾 Results saved to: {results_file}")
    return output

In [10]:
USER_QUERY = "Find research papers on basin-scale water budgets, focusing on the role of groundwater as a stable and important source for streamflow."

output = await run_deep_search(
    USER_QUERY
)
print(output)

🚀 Initializing DeepLitSearchAgent...


[32m2025-08-11 23:22:23.724[0m | [34m[1mDEBUG   [0m | [36makd._base[0m:[36marun[0m:[36m231[0m - [34m[1mRunning DeepLitSearchAgent with params: queries=[] category='science' max_results=20 query='Find research papers on basin-scale water budgets, focusing on the role of groundwater as a stable and important source for streamflow.'[0m
[32m2025-08-11 23:22:23.724[0m | [34m[1mDEBUG   [0m | [36makd.agents.search.deep_search[0m:[36m_handle_triage[0m:[36m291[0m - [34m[1mStarting triage for query: Find research papers on basin-scale water budgets, focusing on the role of groundwater as a stable and important source for streamflow.[0m


🔎 Running deep search...


[32m2025-08-11 23:22:26.214[0m | [34m[1mDEBUG   [0m | [36makd.agents.search.deep_search[0m:[36m_handle_triage[0m:[36m296[0m - [34m[1mTriage decision: Ready for Instructions[0m
[32m2025-08-11 23:22:26.214[0m | [34m[1mDEBUG   [0m | [36makd.agents.search.deep_search[0m:[36m_handle_triage[0m:[36m297[0m - [34m[1mReasoning: The query has a clear scope focusing on basin-scale water budgets and the role of groundwater in streamflow, which allows for targeted research.[0m
[32m2025-08-11 23:22:26.214[0m | [34m[1mDEBUG   [0m | [36makd.agents.search.deep_search[0m:[36m_build_research_instructions[0m:[36m333[0m - [34m[1mBuilding research instructions[0m
[32m2025-08-11 23:22:33.987[0m | [34m[1mDEBUG   [0m | [36makd.agents.search.deep_search[0m:[36m_build_research_instructions[0m:[36m338[0m - [34m[1mGenerated instructions (1285 chars)[0m
[32m2025-08-11 23:22:35.578[0m | [1mINFO    [0m | [36makd.agents.search.deep_search[0m:[36m_generate_i

✅ Search complete!
   📊 Total results: 35
   🔄 Iterations: 1
   📋 Research report: No
💾 Results saved to: notebooks/deep_search_results_20250811_232422.json
results=[{'url': 'deep-research://report', 'title': 'Deep Research Report', 'content': "# Research Report: Basin-Scale Water Budgets and Groundwater's Role in Streamflow\n\n## Introduction\nUnderstanding the dynamics of water budgets at the basin scale is crucial for effective water resource management. Groundwater plays a significant role in maintaining streamflow, especially during dry periods, acting as a stable source of water. This report synthesizes research findings on the role of groundwater in basin-scale water budgets, focusing on methodologies, case studies, statistical models, and conflicting viewpoints.\n\n## Methodologies for Assessing Groundwater's Contribution to Streamflow\nSeveral methodologies have been developed to quantify groundwater's contribution to streamflow. These include:\n- **Groundwater-Centric Methods

# 4. Results Analysis

In [11]:
# Display Research Report (if generated)
if 'output' in globals() and output.results:
    # Check for research report
    report = None
    if output.results[0].get("url") == "deep-research://report":
        report = output.results[0]
    
    if report:
        print("📋 RESEARCH SYNTHESIS REPORT")
        print("=" * 80)
        content = report.get("content", "No content available")
        print(content)
        print("=" * 80)
    else:
        print("📋 No research synthesis report generated")
        
else:
    print("⚠️  No results available. Run the search cell first!")

📋 RESEARCH SYNTHESIS REPORT
# Research Report: Basin-Scale Water Budgets and Groundwater's Role in Streamflow

## Introduction
Understanding the dynamics of water budgets at the basin scale is crucial for effective water resource management. Groundwater plays a significant role in maintaining streamflow, especially during dry periods, acting as a stable source of water. This report synthesizes research findings on the role of groundwater in basin-scale water budgets, focusing on methodologies, case studies, statistical models, and conflicting viewpoints.

## Methodologies for Assessing Groundwater's Contribution to Streamflow
Several methodologies have been developed to quantify groundwater's contribution to streamflow. These include:
- **Groundwater-Centric Methods**: These methods focus on direct measurements and modeling of groundwater flow and its interaction with surface water. For instance, the study by Mohan et al. (2023) presents methods for estimating groundwater contributions

In [12]:
# Display Search Results Summary
if 'output' in globals() and output.results:
    search_results = [r for r in output.results if r.get("url") != "deep-research://report"]
    
    if search_results:
        print(f"🔍 SEARCH RESULTS SUMMARY ({len(search_results)} papers)")
        print("=" * 80)
        
        # Calculate quality metrics
        scores = [r.get("relevancy_score") for r in search_results if isinstance(r.get("relevancy_score"), (int, float))]
        if scores:
            avg_score = sum(scores) / len(scores)
            print(f"📈 Quality Metrics: Avg={avg_score:.2f}, Min={min(scores):.2f}, Max={max(scores):.2f}")
            print("-" * 40)
        
        # Display results (limited by MAX_RESULTS_TO_DISPLAY)
        display_count = len(search_results) if MAX_RESULTS_TO_DISPLAY is None else min(MAX_RESULTS_TO_DISPLAY, len(search_results))
        
        for i, result in enumerate(search_results[:display_count]):
            print(f"\n📄 [{i+1}] {result.get('title', 'Untitled')}")
            
            # Show relevancy score
            score = result.get('relevancy_score')
            if score is not None:
                print(f"    📊 Relevancy: {score:.2f}")
            
            # Show author if available
            author = result.get('author')
            if author:
                print(f"    👤 Author: {author}")
                
            # Show URL
            url = result.get('url', '')
            if url:
                print(f"    🔗 {url}")
            
            # Show summary if available
            summary = result.get('summary', '')
            if summary:
                summary_preview = summary[:200] + "..." if len(summary) > 200 else summary
                print(f"    📝 Summary: {summary_preview}")
            
            print("-" * 40)
            
        if MAX_RESULTS_TO_DISPLAY and len(search_results) > MAX_RESULTS_TO_DISPLAY:
            print(f"\n... and {len(search_results) - MAX_RESULTS_TO_DISPLAY} more results")
            
    else:
        print("📋 No search results found")
        
else:
    print("⚠️  No results available. Run the search cell first!")

🔍 SEARCH RESULTS SUMMARY (34 papers)
📈 Quality Metrics: Avg=0.98, Min=0.35, Max=1.00
----------------------------------------

📄 [1] Estimating Basin‐Scale Water Budgets With SMAP Soil ...
    📊 Relevancy: 1.00
    🔗 https://agupubs.onlinelibrary.wiley.com/doi/full/10.1029/2018WR022669
----------------------------------------

📄 [2] Groundwater dynamics beneath a marine ice sheet
    📊 Relevancy: 1.00
    🔗 http://arxiv.org/abs/2409.11848v1
----------------------------------------

📄 [3] Quantifying Groundwater's Contribution to Regional ...
    📊 Relevancy: 1.00
    🔗 https://agupubs.onlinelibrary.wiley.com/doi/10.1029/2022WR033153
----------------------------------------

📄 [4] Surrogate Optimization of Deep Neural Networks for Groundwater Predictions
    📊 Relevancy: 1.00
    🔗 http://arxiv.org/abs/1908.10947v3
----------------------------------------

📄 [5] A groundwater market model
    📊 Relevancy: 1.00
    🔗 http://arxiv.org/abs/2501.14071v1
-------------------------------------

# 5. SearxNG Direct Testing

Test the underlying SearxNG search tool with the same configuration used by Deep Search.

In [13]:
# Run SearxNG Direct Test
from akd.tools.search.searxng_search import SearxNGSearchTool, SearxNGSearchToolConfig, SearxNGSearchToolInputSchema

print("🔍 Running SearxNG search...")

# Configure and run SearxNG
searxng_config = SearxNGSearchToolConfig(
    max_results=SEARXNG_MAX_RESULTS,
    engines=SEARXNG_ENGINES,
    max_pages=SEARXNG_MAX_PAGES,
    results_per_page=SEARXNG_RESULTS_PER_PAGE,
    score_cutoff=SEARXNG_SCORE_CUTOFF,
    debug=DEBUG_MODE
)

searxng_tool = SearxNGSearchTool(config=searxng_config)
searxng_input = SearxNGSearchToolInputSchema(queries=[USER_QUERY], category="science", max_results=SEARXNG_MAX_RESULTS)

searxng_output = await searxng_tool.arun(searxng_input)

print(f"✅ Found {len(searxng_output.results)} results")

# Display results
for i, result in enumerate(searxng_output.results, 1):
    print(f"\n{i}. {result.title}")
    print(f"   {result.url}")
    if hasattr(result, 'content') and result.content:
        content_preview = result.content[:150] + "..." if len(result.content) > 150 else result.content
        print(f"   {content_preview}")
    print("-" * 60)

[32m2025-08-11 23:24:22.565[0m | [34m[1mDEBUG   [0m | [36makd._base[0m:[36marun[0m:[36m231[0m - [34m[1mRunning SearxNGSearchTool with params: queries=['Find research papers on basin-scale water budgets, focusing on the role of groundwater as a stable and important source for streamflow.'] category='science' max_results=50[0m
[32m2025-08-11 23:24:22.569[0m | [1mINFO    [0m | [36makd.tools.search.searxng_search[0m:[36m_arun[0m:[36m293[0m - [1m🔍 SearxNG SEARCH QUERIES (1 total):[0m
[32m2025-08-11 23:24:22.570[0m | [1mINFO    [0m | [36makd.tools.search.searxng_search[0m:[36m_arun[0m:[36m295[0m - [1m  1. 'Find research papers on basin-scale water budgets, focusing on the role of groundwater as a stable and important source for streamflow.'[0m
[32m2025-08-11 23:24:22.571[0m | [1mINFO    [0m | [36makd.tools.search.searxng_search[0m:[36m_arun[0m:[36m296[0m - [1m🎯 Target results per query: 50[0m
[32m2025-08-11 23:24:22.571[0m | [1mINFO    [0

🔍 Running SearxNG search...


[32m2025-08-11 23:24:26.839[0m | [34m[1mDEBUG   [0m | [36makd.tools.search.searxng_search[0m:[36m_fetch_search_results_paginated[0m:[36m237[0m - [34m[1mFetched 20 results for page 1[0m
[32m2025-08-11 23:24:26.941[0m | [34m[1mDEBUG   [0m | [36makd.tools.search.searxng_search[0m:[36m_fetch_search_results_paginated[0m:[36m228[0m - [34m[1mFetching page 2 for query: Find research papers on basin-scale water budgets, focusing on the role of groundwater as a stable and important source for streamflow.[0m
[32m2025-08-11 23:24:31.432[0m | [34m[1mDEBUG   [0m | [36makd.tools.search.searxng_search[0m:[36m_fetch_search_results_paginated[0m:[36m237[0m - [34m[1mFetched 20 results for page 2[0m
[32m2025-08-11 23:24:31.534[0m | [34m[1mDEBUG   [0m | [36makd.tools.search.searxng_search[0m:[36m_fetch_search_results_paginated[0m:[36m228[0m - [34m[1mFetching page 3 for query: Find research papers on basin-scale water budgets, focusing on the role of gro

✅ Found 24 results

1. Quantifying basin-scale changes in groundwater storage ...
   https://www.sciencedirect.com/science/article/pii/S2352801X2300053X
   by SR Rusli · 2023 · Cited by 22 — In this study, we quantify the groundwater storage changes by using the Wflow_sbm hydrological model coupled with t...
------------------------------------------------------------

2. The Relationship Between Groundwater Nitrate Pollution and Crime in United States: Nitrate-Crime Hypothesis
   http://arxiv.org/abs/2306.09354v1
   Groundwater is a crucial source of drinking water, but it is often contaminated with water-soluble pollutants that can pose significant health risks. ...
------------------------------------------------------------

3. The groundwater budget: A tool for preliminary estimation ...
   https://www.researchgate.net/publication/320667171_The_groundwater_budget_A_tool_for_preliminary_estimation_of_the_hydraulic_connection_between_neighboring_aquifers
   Estimating a Reliable Wat