# Embeddings Module

### Install necessary modules

In [1]:
%pip install -U -e ..
%pip install -e ..
%pip install pydantic==2.9.2
%pip install nest_asyncio

Obtaining file:///Users/jose/Repos/MINE-DD
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Collecting pydantic>=2.10.1 (from minedd==0.1.0)
  Using cached pydantic-2.11.3-py3-none-any.whl.metadata (65 kB)
Collecting pydantic-core==2.33.1 (from pydantic>=2.10.1->minedd==0.1.0)
  Using cached pydantic_core-2.33.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.8 kB)
Using cached pydantic-2.11.3-py3-none-any.whl (443 kB)
Using cached pydantic_core-2.33.1-cp312-cp312-macosx_11_0_arm64.whl (1.9 MB)
Building wheels for collected packages: minedd
  Building editable for minedd (pyproject.toml) ... [?25ldone
[?25h  Created wheel for minedd: filename=minedd-0.1.0-0.editable-py3-none-any.whl size=13035 sha256=31791f26e9e1b10bf77681f59e60988028fb0dbb8a10c4179d7abf3c7d437198
  Stored in directory: /

In [2]:
# This is needed to run asyncio code in Jupyter notebooks
# without getting "RuntimeError: This event loop is already running"
import nest_asyncio
nest_asyncio.apply()

### Load Embeddings object

In [3]:
from minedd.embeddings import Embeddings

embeddings = Embeddings(
    output_embeddings_path="my-embeddings.pkl",
)
embeddings

Embeddings(output_embeddings_path=my-embeddings.pkl, docs=None)

### Load papers for processing

In [4]:
from pathlib import Path
PAPERS_DIRECTORY = Path.home() / "papers_minedd/"
pdf_file_list = embeddings.prepare_papers(PAPERS_DIRECTORY)
print(len(pdf_file_list))
# Check the first 10 files
pdf_file_list[:10]

11


['Ambient temperature and age-related notified Campylobacter infection in Israel_ A 12-year time series study.pdf',
 'A comparison of weather variables linked to infectious disease patterns using laboratory addresses and patient residence addresses.pdf',
 '.DS_Store',
 '_i_Campylobacter__i_ Monitoring in German Broiler Flocks_ An Explorative Time Series Analysis.pdf',
 'A time series analysis of the relationship of ambient temperature and common bacterial enteric infections in two Canadian provinces.pdf',
 'A Bayesian spatio-temporal framework to identify outbreaks and examine environmental and social risk factors for infectious diseases monitored by routine surveillance.pdf',
 'A time-series study of the association of rainfall_ relative humidity and ambient temperature with hospitalizations for rotavirus and norovirus infection among children in Hong Kong.pdf',
 'Weather Variability and the Incidence of Cryptosporidiosis_ Comparison of Time Series Poisson Regression and SARIMA Models

### Define paperQA settings

In [5]:
from paperqa.settings import Settings, AgentSettings, ParsingSettings

MODEL = "ollama/llama3.2"
EMBEDDING = "ollama/mxbai-embed-large:latest"

"""Configure settings for paperqa."""
local_llm_config = {
    "model_list": [
        {
            "model_name": MODEL,
            "litellm_params": {
                "model": MODEL,
                "api_base": "http://0.0.0.0:11434",
            },
            "answer": {
                "evidence_k": 40,
                "evidence_detailed_citations": True,
                "evidence_summary_length": "about 100 words",
                "answer_max_sources": 10,
                "answer_length": "about 600 words, but can be longer",
                "max_concurrent_requests": 10,
                "answer_filter_extra_background": False
            },
            "parsing": {
                "use_doc_details": True
            }
        }
    ]
}

settings = Settings(
    llm=MODEL,
    llm_config=local_llm_config,
    summary_llm=MODEL,
    summary_llm_config=local_llm_config,
    paper_directory=str(PAPERS_DIRECTORY),
    embedding=EMBEDDING,
    agent=AgentSettings(
        agent_llm=MODEL,
        agent_llm_config=local_llm_config,
        return_paper_metadata=True
    ),
    parsing=ParsingSettings(
        chunk_size=2500,
        overlap=250
    )
)

### Create/Load Embeddings and save in PKL

In [6]:
import os
if os.path.exists("my-embeddings.pkl"):
    embeddings.load_existing_embeddings("my-embeddings.pkl")
else:
    embeddings.process_papers(settings, PAPERS_DIRECTORY, pdf_file_list)
    print("Embeddings created and saved to my-embeddings.pkl")

Creating new Docs object.


  from .autonotebook import tqdm as notebook_tqdm
SEMANTIC_SCHOLAR_API_KEY environment variable not set. Semantic Scholar API rate limits may apply.
CROSSREF_MAILTO environment variable not set. Crossref API rate limits may apply.
CROSSREF_API_KEY environment variable not set. Crossref API rate limits may apply.
Metadata not found for Ambient temperature and age-related notified Campylobacter infection in TIsrael: A 12-year time series study in SemanticScholarProvider.
Request to CrossrefProvider for Ambient temperature and age-related notified Campylobacter infection in TIsrael: A 12-year time series study timed out.
  9%|▉         | 1/11 [00:19<03:12, 19.22s/it]

Correctly loaded Ambient temperature and age-related notified Campylobacter infection in Israel_ A 12-year time series study.pdf


Failed to parse all of title, DOI, and authors from the ParsingSettings.structured_citation_prompt's response {title: str, authors: list[str], doi: str}, consider using a manifest file or specifying a different citation prompt.
 18%|█▊        | 2/11 [00:44<03:27, 23.02s/it]

Correctly loaded A comparison of weather variables linked to infectious disease patterns using laboratory addresses and patient residence addresses.pdf
Could not read .DS_Store: ParsedText.content must be a `list`, not <class 'str'>.


SEMANTIC_SCHOLAR_API_KEY environment variable not set. Semantic Scholar API rate limits may apply.
 36%|███▋      | 4/11 [01:04<01:43, 14.79s/it]

Correctly loaded _i_Campylobacter__i_ Monitoring in German Broiler Flocks_ An Explorative Time Series Analysis.pdf


Failed to parse all of title, DOI, and authors from the ParsingSettings.structured_citation_prompt's response {
    "title": title,
    "authors": authors,
    "doi": doi
}, consider using a manifest file or specifying a different citation prompt.
 45%|████▌     | 5/11 [01:27<01:43, 17.32s/it]

Correctly loaded A time series analysis of the relationship of ambient temperature and common bacterial enteric infections in two Canadian provinces.pdf


SEMANTIC_SCHOLAR_API_KEY environment variable not set. Semantic Scholar API rate limits may apply.
 55%|█████▍    | 6/11 [01:43<01:23, 16.79s/it]

Correctly loaded A Bayesian spatio-temporal framework to identify outbreaks and examine environmental and social risk factors for infectious diseases monitored by routine surveillance.pdf


SEMANTIC_SCHOLAR_API_KEY environment variable not set. Semantic Scholar API rate limits may apply.
Request to CrossrefProvider for A time-series study of the association of rainfall, relative humidity and ambient temperature with hospitalizations for rotavirus and norovirus infection among children in Hong Kong timed out.
 64%|██████▎   | 7/11 [02:00<01:08, 17.04s/it]

Correctly loaded A time-series study of the association of rainfall_ relative humidity and ambient temperature with hospitalizations for rotavirus and norovirus infection among children in Hong Kong.pdf


SEMANTIC_SCHOLAR_API_KEY environment variable not set. Semantic Scholar API rate limits may apply.
 73%|███████▎  | 8/11 [02:18<00:52, 17.35s/it]

Correctly loaded Weather Variability and the Incidence of Cryptosporidiosis_ Comparison of Time Series Poisson Regression and SARIMA Models.pdf


Failed to parse all of title, DOI, and authors from the ParsingSettings.structured_citation_prompt's response {
    'title': title,
    'authors': authors,
    'doi': doi
}, consider using a manifest file or specifying a different citation prompt.
 82%|████████▏ | 9/11 [02:39<00:36, 18.19s/it]

Correctly loaded _i_Campylobacter__i_epidemiology_ a descriptive study reviewing 1 million cases in England and Wales between 1989 and 2011.pdf


 91%|█████████ | 10/11 [03:03<00:20, 20.03s/it]

Correctly loaded Weather and notified Campylobacter infections in temperate and sub-tropical regions of Australia_ An ecological study.pdf


Failed to parse all of title, DOI, and authors from the ParsingSettings.structured_citation_prompt's response {"title": None, "authors": [], "doi": None}, consider using a manifest file or specifying a different citation prompt.
100%|██████████| 11/11 [03:21<00:00, 18.32s/it]

Correctly loaded Are hospitalizations for rotavirus gastroenteritis associated with meteorologic factors_.pdf
Docs object saved to my-embeddings.pkl
Embeddings created and saved to my-embeddings.pkl





### Inspect Documents Object

In [7]:
detail_df = embeddings.get_docs_details()
detail_df

Unnamed: 0,doc_key,key,docname,title,authors,year,journal,volume,pages,doi,url,file_location,citation_count,source_quality
0,ef5fb575eb743e9e2ddf54e5e0c9743e,rosenbergaUnknownyearambienttemperatureand,Ambient temperature and age-related notified C...,Ambient temperature and age-related notified C...,"[Alina Rosenberga, Miriam Weinberger, Shlomit ...",,,,,,,,,
1,5e1061700ab7312396d7b7fd503e8c2f,,A comparison of weather variables linked to in...,,,,,,,,,,,
2,82429582fe1ae21b,s.2009campylobactermonitoringin,s.2009campylobactermonitoringin,Campylobacter Monitoring in German Broiler Flo...,"[Hartnack, S., Doherr, M. G., Alter, T., Touto...",2009.0,Zoonoses and Public Health,56.0,,10.1111/j.1863-2378.2008.01184.x,,,31.0,2.0
3,bf02d614f82ce026fa308eeeaf192053,,A time series analysis of the relationship of ...,,,,,,,,,,,
4,b924171c933be72d,aparna2018abayesianspatiotemporal,aparna2018abayesianspatiotemporal,A Bayesian spatio-temporal framework to identi...,"[Lal, Aparna, Marshall, Jonathan, Benschop, Ja...",2018.0,Spatial and Spatio-temporal Epidemiology,25.0,39-48,10.1016/j.sste.2017.10.004,https://doi.org/10.1016/j.sste.2017.10.004,,13.0,1.0
5,b6a6137315ccac14,pin2018atimeseriesstudy,pin2018atimeseriesstudy,A time-series study of the association of rain...,"[Wang, Pin, William B. Goggins, Emily Y.Y. Chan]",2018.0,The Science of the total environment,643.0,\n414-422\n,10.1016/j.scitotenv.2018.06.189,,,69.0,-1.0
6,bed463ca3b0dd66e,hu2007weathervariabilityand,hu2007weathervariabilityand,Weather Variability and the Incidence of Crypt...,"[Wenbiao Hu, Shilu Tong, Kerrie Mengeresen, De...",2007.0,Annals of Epidemiology,17.0,679-688,10.1016/j.annepidem.2007.03.020,https://doi.org/10.1016/j.annepidem.2007.03.020,,82.0,1.0
7,aff6d96d1f88ea6d15ef934239c39e02,,_i_Campylobacter__i_epidemiology_ a descriptiv...,,,,,,,,,,,
8,e2e754915d56cacf7780abef4f1fdf7b,,Weather and notified Campylobacter infections ...,,,,,,,,,,,
9,b696c2a7a6e6c7df3d1d59c628c1b725,,Are hospitalizations for rotavirus gastroenter...,,,,,,,,,,,


In [8]:
print(len(embeddings.docs.texts))
for doc in embeddings.docs.texts:
    print(len(doc.text))

219
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2160
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2351
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2026
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2059
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2094
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
1637
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
1340
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
1296
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2500
2