## Prompt Full Markdown

### Extract Relevant Entities from Markdown

In [19]:
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate

model_name = "llama3.2:latest"
model = ChatOllama(model=model_name, temperature=0.0, max_tokens=512)

pathogens = ["Campylobacter", "Cryptosporidium", "Giardia", "Rotavirus"]
geo_areas = ["Ghana", "Tanzania", "Italy", "Romania"]
variables = ["Rainfall","Relative humidity","Soil moisture","Solar radiation","Surface pressure","Surface runoff","Temperature","Wind speed"]

markdown = open("outputs/paper_text.md", "r").read()

prompt = ChatPromptTemplate.from_template(
    """
    You are an assistant scientist gathering occurrences of pathogens and how they relate with a specific set of variables.
    You are also interested in the geographical areas where these relationships occur. 
    Your job is to cite a list of passages from the provided markdown text that contain information about pathogens, meteorological variables, and geographical areas
 
    Document Markdown:
    {markdown}

    Places of Interest:
    {geo_areas}
    
    Relevant Pathogens:
    {pathogens}

    Metheorological Variables:
    {variables}

    Answer in JSON format
    """
)
chain = prompt | model

In [20]:
answer = chain.invoke({"markdown": markdown, "geo_areas": geo_areas, "pathogens": pathogens, "variables": variables})
answer

AIMessage(content='Here is the answer in JSON format:\n\n```\n{\n  "Places of Interest": [\n    "Ghana",\n    "Tanzania",\n    "Italy",\n    "Romania"\n  ],\n  "Relevant Pathogens": [\n    "Campylobacter",\n    "Cryptosporidium",\n    "Giardia",\n    "Rotavirus"\n  ],\n  "Metheorological Variables": [\n    "Rainfall",\n    "Relative humidity",\n    "Soil moisture",\n    "Solar radiation",\n    "Surface pressure",\n    "Surface runoff",\n    "Temperature",\n    "Wind speed"\n  ]\n}\n```\n\nNote: I corrected the spelling of "Metheorological" to "Meteorological".', additional_kwargs={}, response_metadata={'model': 'llama3.2:latest', 'created_at': '2025-06-04T12:48:35.220572Z', 'done': True, 'done_reason': 'stop', 'total_duration': 20869572208, 'load_duration': 27415875, 'prompt_eval_count': 4096, 'prompt_eval_duration': 13662334459, 'eval_count': 149, 'eval_duration': 7178879375, 'model_name': 'llama3.2:latest'}, id='run--0db52564-cd20-4899-892c-8999fcca6ace-0', usage_metadata={'input_tok

In [18]:
from IPython.display import Markdown, display
display(Markdown(answer.text()))

Here is the answer in JSON format:

```
{
  "Places of Interest": [
    "Ghana",
    "Tanzania",
    "Italy",
    "Romania"
  ],
  "Relevant Pathogens": [
    "Campylobacter",
    "Cryptosporidium",
    "Giardia",
    "Rotavirus"
  ],
  "Metheorological Variables": [
    "Rainfall",
    "Relative humidity",
    "Soil moisture",
    "Solar radiation",
    "Surface pressure",
    "Surface runoff",
    "Temperature",
    "Wind speed"
  ]
}
```

Note: I corrected the spelling of "Metheorological" to "Meteorological".

In [28]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

pathogen_schema = ResponseSchema(name="relation_pathogen",
                             description="A pathogen that is studied in this paper.\
                             the pathogen belongs to the list of pathogens provided.\
                             Provide a relevant summary, a citation and where in the text can I find the information")

geo_schema = ResponseSchema(name="relation_geo",
                             description="A geographical area that is studied in this paper.\
                             the geographical area belongs to the list of geographical areas provided.\
                             Provide a relevant summary, a citation and where in the text can I find the information")

variable_schema = ResponseSchema(name="relation_variable",
                                description="A meteorological variable that is studied in this paper.\
                                the variable belongs to the list of meteorological variables provided.\
                                Provide a relevant summary, a citation and where in the text can I find the information")

output_parser = StructuredOutputParser.from_response_schemas([pathogen_schema, geo_schema, variable_schema])

format_instructions = output_parser.get_format_instructions()
print(format_instructions)

template_json = """\
    You are an assistant scientist gathering occurrences of pathogens and how they relate with a specific set of variables.
    You are also interested in the geographical areas where these relationships occur and how they compare to each other. 
    Your job is to cite a list of passages from the provided markdown text that contain information about pathogens, meteorological variables, and geographical areas

    Places of Interest:
    {geo_areas}
    
    Relevant Pathogens:
    {pathogens}

    Metheorological Variables:
    {variables}

Document Markdown:
    {markdown}

{format_instructions}
"""

prompt = ChatPromptTemplate.from_template(template=template_json)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"relation_pathogen": string  // A pathogen that is studied in this paper.                             the pathogen belongs to the list of pathogens provided.                             Provide a relevant summary, a citation and where in the text can I find the information
	"relation_geo": string  // A geographical area that is studied in this paper.                             the geographical area belongs to the list of geographical areas provided.                             Provide a relevant summary, a citation and where in the text can I find the information
	"relation_variable": string  // A meteorological variable that is studied in this paper.                                the variable belongs to the list of meteorological variables provided.                                Provide a relevant summary, a citation and where in the te

In [27]:
chain = prompt | model | output_parser
answer = chain.invoke({
    "markdown": markdown,
    "geo_areas": geo_areas,
    "pathogens": pathogens,
    "variables": variables,
    "format_instructions": format_instructions})

In [26]:
answer

{'relation_pathogen': 'Rotavirus',
 'relation_geo': 'Asia',
 'relation_variable': 'Temperature'}