# Embeddings Module

## Install necessary modules

In [1]:
%pip install -U -e ..
%pip install -e ..

Obtaining file:///Users/jose/Repos/MINE-DD
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: minedd
  Building editable for minedd (pyproject.toml) ... [?25ldone
[?25h  Created wheel for minedd: filename=minedd-0.1.0-0.editable-py3-none-any.whl size=14049 sha256=ec77d42c0b2716bd9b61d747cdb5f9f61eb3a779ec248b4f30f86341124803e1
  Stored in directory: /private/var/folders/79/zf67ls7520x9m4mj7nx6q07w0000gp/T/pip-ephem-wheel-cache-wzj4tt3e/wheels/c7/bf/1e/e7790fb2ba7cdeeb0fd2dc5eb82103f3cd4a31d3c897656e15
Successfully built minedd
Installing collected packages: minedd
  Attempting uninstall: minedd
    Found existing installation: minedd 0.1.0
    Uninstalling minedd-0.1.0:
      Successfully uninstalled minedd-0.1.0
Successfully installed minedd-0.1.0


In [2]:
# This is needed to run asyncio code in Jupyter notebooks
# without getting "RuntimeError: This event loop is already running"
import nest_asyncio
nest_asyncio.apply()

### Load Embeddings object

In [3]:
from minedd.embeddings import Embeddings
from pathlib import Path

MODEL = "ollama/llama3.2"
EMBEDDING = "ollama/mxbai-embed-large:latest"
PAPERS_DIRECTORY = Path.home() / "papers_minedd/"
EMBEDDINGS_DIR = "outputs/minedd-embeddings.pkl"

embeddings = Embeddings(
    model=MODEL,
    embedding_model=EMBEDDING,
    paper_directory=PAPERS_DIRECTORY,
    output_embeddings_path=EMBEDDINGS_DIR,
)
embeddings

Embeddings(output_embeddings_path=outputs/minedd-embeddings.pkl, total_docs=0, total_chunks=0)

## Load papers for processing

In [4]:
pdf_file_list = embeddings.prepare_papers()
print(len(pdf_file_list))
# Check the first 10 files
pdf_file_list[:10]

14


['Exploring Campylobacter seasonality across Europe using TESSy.pdf',
 'Towards stormwater reuse risk management plans.pdf',
 'Seasonality of Rotavirus Hospitalizations at Costa Rica_s National Children_s Hospital in 2010_2015.pdf',
 'Occurrence of Giardia and Crypto in Italian water supplies.pdf',
 'Use of earth observation-derived hydrometeorological.pdf',
 'Seasonality of rotavirus disease in the tropics_ a systematic review and meta-analysis.pdf',
 'Prev and climate associated factors of Crypto in chimps in TZ.pdf',
 'Prev of Crypto Entamoeba and Giardia in Children Dar TZ.pdf',
 'Associations between 8 EO climate variables.pdf',
 'Seasonality of Rotavirus in South Asia_ A Meta-Analysis Approach Assessing Associations with Temperature_ Precipitation_ and Vegetation Index.pdf']

## Create/Load Embeddings and save in PKL

In [5]:
import os

if os.path.exists(EMBEDDINGS_DIR):
    embeddings.load_existing_embeddings(EMBEDDINGS_DIR)
else:
    embeddings.process_papers(pdf_file_list)
    print(f"Embeddings created and saved to {EMBEDDINGS_DIR}")

embeddings

Loaded existing embeddings from outputs/minedd-embeddings.pkl


Embeddings(output_embeddings_path=outputs/minedd-embeddings.pkl, total_docs=13, total_chunks=479)

## Inspect Documents Object

In [6]:
detail_df = embeddings.get_docs_details()
detail_df

Unnamed: 0,doc_key,key,docname,title,authors,year,journal,volume,pages,doi,url,file_location,citation_count,source_quality
0,f34320735c8ec25ec3feecb48df9cdd5,,Exploring Campylobacter seasonality across Eur...,,,,,,,,,,,
1,52c9d7954327b63719a57f7d581e526a,,Towards stormwater reuse risk management plans,,,,,,,,,,,
2,a3dec01f1c332633,vernile2009occurrenceofgiardia,vernile2009occurrenceofgiardia,Occurrence of Giardia and Cryptosporidium in I...,"[A. Vernile, Ari Q. Nabi, L. Bonadonna, R. Bri...",2009.0,Environmental Monitoring and Assessment,152,203-207,10.1007/s10661-008-0308-4,,,20.0,1.0
3,8a437a875e104aab196929290606624c,,Use of earth observation derived hydrometeorol...,,,,,,,,,,,
4,f57b64122eb55199,eisenberg2009seasonalityofrotavirus,eisenberg2009seasonalityofrotavirus,Seasonality of rotavirus disease in the tropic...,"[Levy, Karen, Alan E Hubbard, and Joseph NS Ei...",2009.0,International journal of epidemiology,38 6,\n1487-96\n,10.1093/ije/dyn260,https://academic.oup.com/ije/article-pdf/38/6/...,,272.0,3.0
5,c83e068a47d40950,al.2012prevalenceandclimatic,al.2012prevalenceandclimatic,Prevalence and climatic associated factors of ...,"[Gomez, M., et al., Hermes, G., Hope, C. J., e...",2012.0,Parasitology Research,112,393-399,10.1007/s00436-012-3147-8,,,16.0,1.0
6,1d08696ef0a98aa7d0d51d2228f13d1c,,Prev of Crypto Entamoeba and Giardia in Childr...,,,,,,,,,,,
7,bdbc4791aefb4d144ffa14eb11be06d6,colstonUnknownyearassociationsbetweeneight,Associations between 8 EO climate variables,Associations Between Eight Earth Observation-D...,"[Josh M. Colston, Benjamin F. Zaitchik, Hamada...",,,,,,,,,
8,c127a43a8472a75f908ffbb2617c1af2,s.Unknownyearunknowntitle,Seasonality of Rotavirus in South Asia A Meta ...,,"[Jagai, J. S., Sarkar, R., Castronovo, D., Kat...",,,,,10.1289/ehp1330101,,,,
9,3083109e54b5e105622a0a7eb0858215,,Masciopinto et al. 2019 Human health risk ente...,,,,,,,,,,,


In [7]:
n_chunks = len(embeddings.docs.texts)
for doc in embeddings.docs.texts:
    emb_dims = len(doc.embedding)
    print(len(doc.text))
    print(doc.doc.docname)
    break

print(f"Embeddings Dimsenions [{n_chunks}, {emb_dims}]")

2500
Exploring Campylobacter seasonality across Europe using TESSy
Embeddings Dimsenions [479, 1024]


## Query Specific Doc

In [8]:
document_name = "al.2018microbiologicalevaluationof"

chunks = embeddings.get_document_chunks(docname=document_name)
print(len(chunks))


chunks = embeddings.get_document_chunks(docname=document_name, pages=[1,2])
print(len(chunks))

chunks = embeddings.get_document_chunks(docname=document_name, pages=[1000, "-"])
print(len(chunks))

chunks = embeddings.get_document_chunks(docname=document_name, pages=[6])
for ch in chunks:
    print(f"---- Pages {ch.pages} ----")
    [print("> ",line) for line in ch.text.split("\n")]
    print("\n")

Document with key al.2018microbiologicalevaluationof not found in the embeddings.
0
Document with key al.2018microbiologicalevaluationof not found in the embeddings.
0
Document with key al.2018microbiologicalevaluationof not found in the embeddings.
0
Document with key al.2018microbiologicalevaluationof not found in the embeddings.


# Query Module

In [9]:
from paperqa.settings import Settings, AgentSettings, ParsingSettings

local_llm_config = {
    "model_list": [
        {
            "model_name": MODEL,
            "litellm_params": {
                "model": MODEL,
                # Uncomment if using a local server
                # "api_base": "http://0.0.0.0:11434",
            },
            "answer": {
                "evidence_k": 10,
                "evidence_detailed_citations": True,
                "evidence_summary_length": "about 100 words",
                "answer_max_sources": 5,
                "answer_length": "about 300 words, but can be longer",
                "max_concurrent_requests": 10,
                "answer_filter_extra_background": False
            },
            "parsing": {
                "use_doc_details": True
            },
            "prompts" : {"use_json": False}
        }
    ]
}

query_settings = Settings(
    llm=MODEL,
    llm_config=local_llm_config,
    summary_llm=MODEL,
    summary_llm_config=local_llm_config,
    paper_directory=PAPERS_DIRECTORY,
    embedding=EMBEDDING,
    agent=AgentSettings(
        agent_llm=MODEL,
        agent_llm_config=local_llm_config,
        return_paper_metadata=True
    ),
    parsing=ParsingSettings(
        chunk_size=2500,
        overlap=250
    ),
    prompts={"use_json": False}
)

In [10]:
from minedd.query import Query
engine = Query(
    model=MODEL,
    paper_directory=PAPERS_DIRECTORY,
    output_dir='outputs/',
)

engine.settings = query_settings

engine.load_embeddings(EMBEDDINGS_DIR)
engine.docs



In [11]:
question = "What is the relationship between rotavirus and geographical regions?"

result = engine.query_single(question, max_retries=3)
print("\n=== Question ===")
print(result['question'])
print("\n=== Answer ===")
print(result['answer'])
print("\n=== Sources ===")
for i, citation in enumerate(result['citations']):
    print(f"{i+1}. {citation}")
if result['urls'] and len(result['urls']) > 0:
    print("\n=== URLs ===")
    for i, url in enumerate(result['urls']):
        print(f"{i+1}. {url}")


Failed to generate bibtex for Global Seasonality of Rotavirus Disease
Failed to generate bibtex for Global Seasonality of Rotavirus Disease
Failed to parse bibtex for @article{Are hospitalizations for rotavirus gastroenteritis associated with meteorologic factors ,
    author = "Hervás, D. and Hervás-Masip, J. and Rosell, A. and Mena, A. and Pérez, J. L. and Hervás, J. A.",
    title = "Are hospitalizations for rotavirus gastroenteritis associated with meteorologic factors?",
    year = "2014",
    journal = "European Journal of Clinical Microbiology \& Infectious Diseases",
    volume = "33",
    pages = "1547-1553",
    month = "Apr",
    doi = "10.1007/s10096-014-2106-y",
    url = "https://doi.org/10.1007/s10096-014-2106-y",
    publisher = "Springer Science and Business Media LLC",
    issue = "9",
    issn = "0934-9723"
}
.
Failed to generate bibtex for a.2014arehospitalizationsfor
Failed to generate bibtex for Global Seasonality of Rotavirus Disease
Failed to parse bibtex for @a


=== Question ===
What is the relationship between rotavirus and geographical regions?

=== Answer ===
Question: What is the relationship between rotavirus and geographical regions?

Relationship between Rotavirus and Geographical Regions

Rotavirus disease exhibits seasonal patterns that vary by geographical region. In tropical belts near the equator, rotavirus disease is present year-round, with subtle changes in local climate influencing its seasonal cycling (Global Seasonality of Rotavirus Disease pages 7-8). In contrast, tropical regions exhibit less defined seasonal patterns.

In temperate regions, rotavirus activity typically peaks during late winter or early spring, with a 6-month difference between the peaks in the Northern and Southern Hemispheres (a.2014arehospitalizationsfor pages 1-1). The study found that weekly rotavirus activity could be explained by meteorologic factors in 82% of cases (p<0.001) with a one-week lag, associated with mean temperature, solar radiation, at

## Evaluate Respose