In [1]:
import instructor
import openai
from akd.tools.search import (
    SearxNGSearchTool,
    SearxNGSearchToolConfig,
)
from akd.tools.scrapers.web_scrapers import (
    SimpleWebScraper,
    WebpageScraperToolConfig,
    Crawl4AIWebScraper
)
from akd.tools.scrapers.pdf_scrapers import (
    SimplePDFScraper,
)
from akd.tools.scrapers.composite import CompositeWebScraper
from akd.tools.scrapers.resolvers import (
    ArxivResolver,
    ADSResolver,
    IdentityResolver
)

from akd.tools.scrapers.composite import ResearchArticleResolver
from akd.agents.factory import create_query_agent

from akd.agents.extraction import (
    IntentBasedExtractionSchemaMapper, 
    EstimationExtractionAgent,
)
from akd.agents.litsearch import LitAgent, LitAgentInputSchema
from akd.agents.intents import Intent, IntentInputSchema, IntentAgent
from akd.agents._base import BaseAgentConfig

[32m2025-04-22 10:56:41.443[0m | [1mINFO    [0m | [36makd.config[0m:[36mget_config[0m:[36m78[0m - [1mLoading configuration for environment: LOCAL[0m


In [2]:
SEARCH_TOOL = SearxNGSearchTool(
    config=SearxNGSearchToolConfig(
        base_url="http://localhost:8080",
        max_results=5,
        engines=["google", "arxiv", "google_scholar"],
        debug=True
    )
)
scraper_cfg = WebpageScraperToolConfig()
SCRAPER = CompositeWebScraper(
    SimpleWebScraper(scraper_cfg),
    Crawl4AIWebScraper(scraper_cfg),
    SimplePDFScraper(scraper_cfg),
    debug=True
)

ARTICLE_RESOLVER = ResearchArticleResolver(
    ArxivResolver(),
    ADSResolver(),
    IdentityResolver()
)
scraper_cfg = WebpageScraperToolConfig()
SCRAPER = CompositeWebScraper(
    SimpleWebScraper(scraper_cfg),
    Crawl4AIWebScraper(scraper_cfg),
    SimplePDFScraper(scraper_cfg),
    debug=True
)


INTENT_AGENT = IntentAgent(
    config=BaseAgentConfig(
        client=instructor.from_openai(openai.AsyncOpenAI())
    )
)
QUERY_AGENT = create_query_agent()
SCHEMA_MAPPER = IntentBasedExtractionSchemaMapper()
EXTRACTION_AGENT = EstimationExtractionAgent()

lit_agent = LitAgent(
    intent_agent=INTENT_AGENT,
    schema_mapper=IntentBasedExtractionSchemaMapper(),
    query_agent=QUERY_AGENT,
    extraction_agent=EXTRACTION_AGENT,
    search_tool=SEARCH_TOOL,
    web_scraper=SCRAPER,
    article_resolver=ARTICLE_RESOLVER,
)
lit_agent.clear_history()

query = "methods and estimation to map landslides in Nepal"
result = await lit_agent.arun(
    LitAgentInputSchema(query=query)
)

[32m2025-04-22 10:56:41.794[0m | [1mINFO    [0m | [36makd.agents.litsearch[0m:[36marun[0m:[36m83[0m - [1mAnalyzing input query to generate relevant search queries...[0m
[32m2025-04-22 10:56:42.625[0m | [34m[1mDEBUG   [0m | [36makd.agents.litsearch[0m:[36marun[0m:[36m88[0m - [34m[1mGenerated search queries:[0m
[32m2025-04-22 10:56:42.626[0m | [34m[1mDEBUG   [0m | [36makd.agents.litsearch[0m:[36marun[0m:[36m90[0m - [34m[1mQuery 1: methods and estimation to map landslides in Nepal[0m
[32m2025-04-22 10:56:42.626[0m | [34m[1mDEBUG   [0m | [36makd.agents.litsearch[0m:[36marun[0m:[36m90[0m - [34m[1mQuery 2: methods for landslide mapping in Nepal[0m
[32m2025-04-22 10:56:42.627[0m | [34m[1mDEBUG   [0m | [36makd.agents.litsearch[0m:[36marun[0m:[36m90[0m - [34m[1mQuery 3: landslide estimation techniques Nepal[0m
[32m2025-04-22 10:56:42.627[0m | [34m[1mDEBUG   [0m | [36makd.agents.litsearch[0m:[36marun[0m:[36m90[0m - [

In [3]:
result[0].model_dump()

{'source': 'https://link.springer.com/article/10.1007/s12665-021-09650-2',
 'result': {'estimations': [{'answer': 'The artificial neural network approach yielded the best prediction capability for landslide susceptibility mapping in the high mountain area of Nepal, with an AUC value of 96.9%.',
    'related_knowledge': ['Landslide susceptibility mapping',
     'Artificial neural network',
     'High mountain regions'],
    'research_data': {'data_format': 'Satellite images',
     'origin': 'Indrawati watershed, Central Nepal',
     'data_url': None},
    'methodology': 'Landslide susceptibility mapping using four approaches: frequency ratio, logistic regression, artificial neural network, and support vector machine.',
    'assumptions': ['The landslides were randomly split into a ratio of 80:20 for training and validating the susceptibility maps.'],
    'confidence_level': 96.9,
    'validation_method': 'Validated using area under curve (AUC), kappa index, and statistical inferences.'}

In [4]:
import json 

with open("test.json", "w") as f: 
   f.write(json.dumps([r.model_dump(mode="json") for r in result]))


In [5]:
# first eval only on answers
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.dataset import EvaluationDataset
from deepeval.metrics import GEval
from deepeval import evaluate

In [6]:
test_cases = [
    LLMTestCase(input=query, actual_output=" - ".join([e.answer for e in r.result.estimations]))
    for r in result
]

dataset = EvaluationDataset(test_cases=test_cases)

In [7]:
from deepeval.metrics import AnswerRelevancyMetric
answer_relevancy_metric = AnswerRelevancyMetric(
    threshold=0.7,
    model="gpt-4.1",
    include_reason=True
)

In [8]:
evaluate(dataset, [answer_relevancy_metric])

Evaluating 5 test case(s) in parallel: |█████████████████|100% (5/5) [Time Taken: 00:10,  2.04s/test case]




Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the output was fully relevant and directly addressed the input without any irrelevant information. Great job staying focused and precise!, error: None)

For test case:

  - input: methods and estimation to map landslides in Nepal
  - actual output: The artificial neural network approach yielded the best prediction capability for landslide susceptibility mapping in the high mountain area of Nepal, with an AUC value of 96.9%.
  - expected output: None
  - context: None
  - retrieval context: None


Metrics Summary

  - ❌ Answer Relevancy (score: 0.0, threshold: 0.7, strict: False, evaluation model: gpt-4.1, reason: The score is 0.00 because the actual output did not address the input regarding methods or estimation for mapping landslides in Nepal at all., error: None)

For test case:

  - input: methods and estimation to map landslides in Nepa

EvaluationResult(test_results=[TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.7, success=True, score=1.0, reason='The score is 1.00 because the output was fully relevant and directly addressed the input without any irrelevant information. Great job staying focused and precise!', strict_mode=False, evaluation_model='gpt-4.1', error=None, evaluation_cost=0.0, verbose_logs='Statements:\n[\n    "The artificial neural network approach yielded the best prediction capability for landslide susceptibility mapping.",\n    "The study was conducted in the high mountain area of Nepal.",\n    "The artificial neural network approach achieved an AUC value of 96.9%."\n] \n \nVerdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": null\n    },\n    {\n        "verdict": "idk",\n        "reason": null\n    },\n    {\n        "verdict": "idk",\n        "reason": null\n    }\n]')], conversational=False, multimodal=False, input='methods a

In [9]:
from deepeval.metrics import FaithfulnessMetric
# Measures how faithful the answer is to the given retrieved context
faithfulness_metric = FaithfulnessMetric(
    threshold=0.7,
    model="gpt-4.1",
    include_reason=True
)

In [10]:
# Evaluation metric: 
geval_metric = GEval(
    name="Correctness",
    criteria="Correctness - determine if the actual output is a correct and helpful answer to the given query",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.INPUT],
    strict_mode=True
)
evaluate(dataset, [geval_metric])

Evaluating 5 test case(s) in parallel: |█████████████████|100% (5/5) [Time Taken: 00:04,  1.17test case/s]




Metrics Summary

  - ❌ Correctness (GEval) (score: 0.0, threshold: 1.0, strict: True, evaluation model: gpt-4o, reason: The output mentions a specific dataset related to Gorkha, Nepal but does not address methods and estimation for mapping landslides in Nepal generally, missing a comprehensive and relevant explanation., error: None)

For test case:

  - input: methods and estimation to map landslides in Nepal
  - actual output: The Large-scale Multi-source High-resolution Landslide Dataset (LMHLD) includes remote sensing images from five different satellite sensors across seven study areas, including Gorkha, Nepal (2015).
  - expected output: None
  - context: None
  - retrieval context: None


Metrics Summary

  - ❌ Correctness (GEval) (score: 0.0, threshold: 1.0, strict: True, evaluation model: gpt-4o, reason: The output 'Answer not found' does not address the query about 'methods and estimation to map landslides in Nepal'., error: None)

For test case:

  - input: methods and esti

EvaluationResult(test_results=[TestResult(name='test_case_3', success=False, metrics_data=[MetricData(name='Correctness (GEval)', threshold=1.0, success=False, score=0.0, reason='The output mentions a specific dataset related to Gorkha, Nepal but does not address methods and estimation for mapping landslides in Nepal generally, missing a comprehensive and relevant explanation.', strict_mode=True, evaluation_model='gpt-4o', error=None, evaluation_cost=0.0025174999999999998, verbose_logs='Criteria:\nCorrectness - determine if the actual output is a correct and helpful answer to the given query \n \nEvaluation Steps:\n[\n    "Verify if the actual output directly addresses the query presented in the input.",\n    "Check the accuracy of the information provided in the actual output against the known facts or sources.",\n    "Determine if the actual output provides a clear and comprehensive explanation or solution relevant to the query.",\n    "Assess if the output includes any errors or irr

In [11]:
full_test_cases = [
    LLMTestCase(input=query, actual_output=str(r.model_dump()))
    for r in result
]

full_dataset = EvaluationDataset(test_cases=full_test_cases)

updated_geval_metric = GEval(
    name="Correctness",
    criteria="Correctness - determine if the actual output represents a thorough and accurate literature review in response to the given query. An output is considered correct is answer is relevant to the user's query, adequate reasoning is provided, and if sources are likely to be relevant and helfpul to the user's research question",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.INPUT],
    strict_mode=True
)
evaluate(full_dataset, [updated_geval_metric])

Evaluating 5 test case(s) in parallel: |█████████████████|100% (5/5) [Time Taken: 00:05,  1.09s/test case]




Metrics Summary

  - ✅ Correctness (GEval) (score: 1.0, threshold: 1.0, strict: True, evaluation model: gpt-4o, reason: The Actual Output addresses the research question by discussing the Landslide Dataset for Nepal, includes comprehensive details on methodology and validation for mapping landslides, and ensures the dataset's reliability for deep learning models., error: None)

For test case:

  - input: methods and estimation to map landslides in Nepal
  - actual output: {'source': 'https://arxiv.org/pdf/2502.19866v1.pdf', 'result': {'estimations': [{'answer': 'The Large-scale Multi-source High-resolution Landslide Dataset (LMHLD) includes remote sensing images from five different satellite sensors across seven study areas, including Gorkha, Nepal (2015).', 'related_knowledge': ['LMHLD is designed for landslide detection using deep learning.', 'The dataset includes 25,365 patches of various sizes for different landslide scales.'], 'research_data': {'data_format': 'HDF5', 'origin': '

EvaluationResult(test_results=[TestResult(name='test_case_3', success=True, metrics_data=[MetricData(name='Correctness (GEval)', threshold=1.0, success=True, score=1.0, reason="The Actual Output addresses the research question by discussing the Landslide Dataset for Nepal, includes comprehensive details on methodology and validation for mapping landslides, and ensures the dataset's reliability for deep learning models.", strict_mode=True, evaluation_model='gpt-4o', error=None, evaluation_cost=0.0032624999999999998, verbose_logs='Criteria:\nCorrectness - determine if the actual output represents a thorough and accurate literature review in response to the given query. An output is considered correct is answer is relevant to the user\'s query, adequate reasoning is provided, and if sources are likely to be relevant and helfpul to the user\'s research question \n \nEvaluation Steps:\n[\n    "Compare the Actual Output to the Input query to assess if the literature review addresses the spec

In [12]:
updated_geval_metric = GEval(
    name="Correctness",
    criteria="Correctness - determine if the actual output represents a thorough and accurate literature review in response to the given query. An output is considered correct is answer is relevant to the user's query, adequate reasoning is provided, and if sources are likely to be relevant and helfpul to the user's research question. Note that any geographic information in the user's query is EXTREMELY important, therefore any source that does not explicitely reference a geography in the user's query should be considered incorrect",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.INPUT],
    strict_mode=True
)
evaluate(full_dataset, [updated_geval_metric])

Evaluating 5 test case(s) in parallel: |█████████████████|100% (5/5) [Time Taken: 00:04,  1.10test case/s]




Metrics Summary

  - ❌ Correctness (GEval) (score: 0.0, threshold: 1.0, strict: True, evaluation model: gpt-4o, reason: The source provided does not explicitly reference geographic information about Nepal. Additionally, confidence level is mentioned as None, lacking comprehensive analysis., error: None)

For test case:

  - input: methods and estimation to map landslides in Nepal
  - actual output: {'source': 'https://hdl.handle.net/20.500.14017/8df80ddb-1fa4-4abd-929e-79249a9b579a', 'result': {'estimations': [{'answer': 'About 30% of the area in the Tinau watershed is highly susceptible to landsliding.', 'related_knowledge': ['Landslide susceptibility mapping is crucial for disaster management in Nepal.', 'The weight of evidence method is used for mapping landslide susceptibility.'], 'research_data': {'data_format': 'Field observations and geological data', 'origin': 'Tinau watershed, west Nepal', 'data_url': None}, 'methodology': 'Weight of evidence method applied to prepare landsl

EvaluationResult(test_results=[TestResult(name='test_case_4', success=False, metrics_data=[MetricData(name='Correctness (GEval)', threshold=1.0, success=False, score=0.0, reason='The source provided does not explicitly reference geographic information about Nepal. Additionally, confidence level is mentioned as None, lacking comprehensive analysis.', strict_mode=True, evaluation_model='gpt-4o', error=None, evaluation_cost=0.00292, verbose_logs='Criteria:\nCorrectness - determine if the actual output represents a thorough and accurate literature review in response to the given query. An output is considered correct is answer is relevant to the user\'s query, adequate reasoning is provided, and if sources are likely to be relevant and helfpul to the user\'s research question. Note that any geographic information in the user\'s query is EXTREMELY important, therefore any source that does not explicitely reference a geography in the user\'s query should be considered incorrect \n \nEvaluati

In [13]:
# creating synthetic golden sets
# using `expected_output`