# Embeddings Module

### Install necessary modules

In [1]:
%pip install -U -e ..
%pip install -e ..
%pip install pydantic==2.9.2
%pip install nest_asyncio

Obtaining file:///Users/jose/Repos/MINE-DD
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Collecting pydantic>=2.10.1 (from minedd==0.1.0)
  Using cached pydantic-2.11.2-py3-none-any.whl.metadata (64 kB)
Collecting pydantic-core==2.33.1 (from pydantic>=2.10.1->minedd==0.1.0)
  Using cached pydantic_core-2.33.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.8 kB)
Using cached pydantic-2.11.2-py3-none-any.whl (443 kB)
Using cached pydantic_core-2.33.1-cp312-cp312-macosx_11_0_arm64.whl (1.9 MB)
Building wheels for collected packages: minedd
  Building editable for minedd (pyproject.toml) ... [?25ldone
[?25h  Created wheel for minedd: filename=minedd-0.1.0-0.editable-py3-none-any.whl size=13035 sha256=c474645a96702995f1de92b6d8d63bdb39bd77a2659205d44712d5e5102c395e
  Stored in directory: /

In [2]:
# This is needed to run asyncio code in Jupyter notebooks
# without getting "RuntimeError: This event loop is already running"
import nest_asyncio
nest_asyncio.apply()

### Load Embeddings object

In [3]:
from minedd.embeddings import Embeddings

embeddings = Embeddings(
    output_embeddings_path="my-embeddings.pkl",
)
embeddings

Embeddings(output_embeddings_path=my-embeddings.pkl, docs=None)

### Load papers for processing

In [4]:
from pathlib import Path
PAPERS_DIRECTORY = Path.home() / "papers_minedd/"
pdf_file_list = embeddings.prepare_papers(PAPERS_DIRECTORY)
print(len(pdf_file_list))
# Check the first 10 files
pdf_file_list[:10]

11


['Ambient temperature and age-related notified Campylobacter infection in Israel_ A 12-year time series study.pdf',
 'A comparison of weather variables linked to infectious disease patterns using laboratory addresses and patient residence addresses.pdf',
 '.DS_Store',
 '_i_Campylobacter__i_ Monitoring in German Broiler Flocks_ An Explorative Time Series Analysis.pdf',
 'A time series analysis of the relationship of ambient temperature and common bacterial enteric infections in two Canadian provinces.pdf',
 'A Bayesian spatio-temporal framework to identify outbreaks and examine environmental and social risk factors for infectious diseases monitored by routine surveillance.pdf',
 'A time-series study of the association of rainfall_ relative humidity and ambient temperature with hospitalizations for rotavirus and norovirus infection among children in Hong Kong.pdf',
 'Weather Variability and the Incidence of Cryptosporidiosis_ Comparison of Time Series Poisson Regression and SARIMA Models

### Define paperQA settings

In [5]:
from paperqa.settings import Settings, AgentSettings

MODEL = "ollama/llama3.2"
EMBEDDING = "ollama/mxbai-embed-large:latest"

"""Configure settings for paperqa."""
local_llm_config = {
    "model_list": [
        {
            "model_name": MODEL,
            "litellm_params": {
                "model": MODEL,
                "api_base": "http://0.0.0.0:11434",
            },
            "answer": {
                "evidence_k": 40,
                "evidence_detailed_citations": True,
                "evidence_summary_length": "about 100 words",
                "answer_max_sources": 10,
                "answer_length": "about 600 words, but can be longer",
                "max_concurrent_requests": 10,
                "answer_filter_extra_background": False
            },
            "parsing": {
                "use_doc_details": True
            }
        }
    ]
}

settings = Settings(
    llm=MODEL,
    llm_config=local_llm_config,
    summary_llm=MODEL,
    summary_llm_config=local_llm_config,
    paper_directory=str(PAPERS_DIRECTORY),
    embedding=EMBEDDING,
    agent=AgentSettings(
        agent_llm=MODEL,
        agent_llm_config=local_llm_config,
        return_paper_metadata=True
    )
)

### Create Embeddings and save in PKL

In [6]:
embeddings.process_papers(settings, PAPERS_DIRECTORY, pdf_file_list)
print("Embeddings created and saved to my-embeddings.pkl")

Creating new Docs object.


  from .autonotebook import tqdm as notebook_tqdm
Discarding list of DOIs [None, None, None, None] due to it not having one value, full data was {'doi': [None, None, None, None], 'authors': [['Weinberger, Miriam', 'Shlomit Paz', 'Lea Valinskye', 'Vered Agmon', 'Chava Peretz', 'others'], None, None, None], 'title': ['Ambient Temperature and Age-Related Notiﬁed Campylobacter Infection in TIsrael: A 12-Year Time Series Study.', 'Campylobacter species and human infection', 'EFSA Scientific Opinion on the Risk Assessment of Campylobacter in poultry and laying hens in relation to human health', 'Campylobacter']}.
  9%|▉         | 1/11 [00:16<02:48, 16.86s/it]

Could not read Ambient temperature and age-related notified Campylobacter infection in Israel_ A 12-year time series study.pdf: 'list' object has no attribute 'lower'


Failed to parse all of title, DOI, and authors from the ParsingSettings.structured_citation_prompt's response {
    'title': title,
    'authors': authors,
    'doi': doi
}, consider using a manifest file or specifying a different citation prompt.
 18%|█▊        | 2/11 [00:32<02:26, 16.27s/it]

Correctly loaded A comparison of weather variables linked to infectious disease patterns using laboratory addresses and patient residence addresses.pdf
Could not read .DS_Store: ParsedText.content must be a `list`, not <class 'str'>.


Failed to parse all of title, DOI, and authors from the ParsingSettings.structured_citation_prompt's response {
        'title': title,
        'authors': authors,
        'doi': doi
    }, consider using a manifest file or specifying a different citation prompt.
 36%|███▋      | 4/11 [00:51<01:23, 11.95s/it]

Correctly loaded _i_Campylobacter__i_ Monitoring in German Broiler Flocks_ An Explorative Time Series Analysis.pdf


Failed to parse all of title, DOI, and authors from the ParsingSettings.structured_citation_prompt's response {
        "title": title,
        "authors": authors,
        "doi": doi
    }, consider using a manifest file or specifying a different citation prompt.
 45%|████▌     | 5/11 [01:04<01:14, 12.35s/it]

Correctly loaded A time series analysis of the relationship of ambient temperature and common bacterial enteric infections in two Canadian provinces.pdf


Failed to parse all of title, DOI, and authors from the ParsingSettings.structured_citation_prompt's response {4,9}, consider using a manifest file or specifying a different citation prompt.
 55%|█████▍    | 6/11 [01:28<01:19, 15.93s/it]

Correctly loaded A Bayesian spatio-temporal framework to identify outbreaks and examine environmental and social risk factors for infectious diseases monitored by routine surveillance.pdf


Failed to parse all of title, DOI, and authors from the ParsingSettings.structured_citation_prompt's response {'title': None, 'authors': [], 'doi': None}, consider using a manifest file or specifying a different citation prompt.
 64%|██████▎   | 7/11 [01:57<01:19, 19.89s/it]

Correctly loaded A time-series study of the association of rainfall_ relative humidity and ambient temperature with hospitalizations for rotavirus and norovirus infection among children in Hong Kong.pdf


Failed to parse all of title, DOI, and authors from the ParsingSettings.structured_citation_prompt's response {
        "title": title,
        "authors": authors,
        "doi": doi
    }, consider using a manifest file or specifying a different citation prompt.
 73%|███████▎  | 8/11 [02:16<00:58, 19.47s/it]

Correctly loaded Weather Variability and the Incidence of Cryptosporidiosis_ Comparison of Time Series Poisson Regression and SARIMA Models.pdf


Failed to parse all of title, DOI, and authors from the ParsingSettings.structured_citation_prompt's response {4,9}, consider using a manifest file or specifying a different citation prompt.
 82%|████████▏ | 9/11 [02:35<00:38, 19.41s/it]

Correctly loaded _i_Campylobacter__i_epidemiology_ a descriptive study reviewing 1 million cases in England and Wales between 1989 and 2011.pdf


CROSSREF_MAILTO environment variable not set. Crossref API rate limits may apply.
CROSSREF_API_KEY environment variable not set. Crossref API rate limits may apply.
SEMANTIC_SCHOLAR_API_KEY environment variable not set. Semantic Scholar API rate limits may apply.
 91%|█████████ | 10/11 [02:47<00:17, 17.14s/it]

Correctly loaded Weather and notified Campylobacter infections in temperate and sub-tropical regions of Australia_ An ecological study.pdf


SEMANTIC_SCHOLAR_API_KEY environment variable not set. Semantic Scholar API rate limits may apply.
100%|██████████| 11/11 [03:06<00:00, 16.95s/it]

Correctly loaded Are hospitalizations for rotavirus gastroenteritis associated with meteorologic factors_.pdf
Docs object saved to my-embeddings.pkl
Embeddings created and saved to my-embeddings.pkl





### Inspect Documents Object

In [7]:
detail_df = embeddings.get_docs_details()
detail_df

Unnamed: 0,doc_key,key,docname,title,authors,year,journal,volume,pages,doi,url,file_location,citation_count,source_quality
0,5e1061700ab7312396d7b7fd503e8c2f,,A comparison of weather variables linked to in...,,,,,,,,,,,
1,3bc9bfea106e4f86c21c162e72c5e9f0,,_i_Campylobacter__i_ Monitoring in German Broi...,,,,,,,,,,,
2,bf02d614f82ce026fa308eeeaf192053,,A time series analysis of the relationship of ...,,,,,,,,,,,
3,140bd9764073390a4631a93ce7ffe0ec,,A Bayesian spatio-temporal framework to identi...,,,,,,,,,,,
4,16eda9d66c9bfaa5a431fe765cc27318,,A time-series study of the association of rain...,,,,,,,,,,,
5,9e26ccef2cfc3b32fa4c2f143c32c296,,Weather Variability and the Incidence of Crypt...,,,,,,,,,,,
6,aff6d96d1f88ea6d15ef934239c39e02,,_i_Campylobacter__i_epidemiology_ a descriptiv...,,,,,,,,,,,
7,e1fb9238ba24b9e1,bi2008weatherandnotified,bi2008weatherandnotified,Weather and notified Campylobacter infections ...,"[Peng Bi, A. Scott Cameron, Ying Zhang, Kevin ...",2008.0,The Journal of infection,57 4,\n317-23\n,10.1016/j.jinf.2008.08.004,,,61.0,-1.0
8,2de90bd8cde7df74,d.2014arehospitalizationsfor,d.2014arehospitalizationsfor,Are hospitalizations for rotavirus gastroenter...,"[Hervás, D., Hervás-Masip, J., Rosell, A., Men...",2014.0,European Journal of Clinical Microbiology & In...,33,1547-1553,10.1007/s10096-014-2106-y,,,22.0,-1.0
