# Embeddings Module

### Install necessary modules

In [1]:
%pip install -U -e ..
%pip install -e ..
%pip install pydantic==2.9.2
%pip install nest_asyncio

Obtaining file:///Users/jose/Repos/MINE-DD
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Collecting pydantic>=2.10.1 (from minedd==0.1.0)
  Using cached pydantic-2.11.2-py3-none-any.whl.metadata (64 kB)
Collecting pydantic-core==2.33.1 (from pydantic>=2.10.1->minedd==0.1.0)
  Using cached pydantic_core-2.33.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.8 kB)
Using cached pydantic-2.11.2-py3-none-any.whl (443 kB)
Using cached pydantic_core-2.33.1-cp312-cp312-macosx_11_0_arm64.whl (1.9 MB)
Building wheels for collected packages: minedd
  Building editable for minedd (pyproject.toml) ... [?25ldone
[?25h  Created wheel for minedd: filename=minedd-0.1.0-0.editable-py3-none-any.whl size=13035 sha256=53f0dec50bb79fae464e305f85d95cec35538eec517446c155f7cac9cd2dd9ce
  Stored in directory: /

In [2]:
import nest_asyncio
nest_asyncio.apply()

### Load Embeddings object

In [1]:
from minedd.embeddings import Embeddings

embeddings = Embeddings(
    output_embeddings_path="my-embeddings.pkl",
)
embeddings

Embeddings(output_embeddings_path=my-embeddings.pkl, docs=None)

### Load papers for processing

In [3]:
from pathlib import Path
PAPERS_DIRECTORY = Path.home() / "papers_minedd/"
pdf_file_list = embeddings.prepare_papers(PAPERS_DIRECTORY)
print(len(pdf_file_list))
# Check the first 10 files
pdf_file_list[:10]

11


['Ambient temperature and age-related notified Campylobacter infection in Israel_ A 12-year time series study.pdf',
 'A comparison of weather variables linked to infectious disease patterns using laboratory addresses and patient residence addresses.pdf',
 '.DS_Store',
 '_i_Campylobacter__i_ Monitoring in German Broiler Flocks_ An Explorative Time Series Analysis.pdf',
 'A time series analysis of the relationship of ambient temperature and common bacterial enteric infections in two Canadian provinces.pdf',
 'A Bayesian spatio-temporal framework to identify outbreaks and examine environmental and social risk factors for infectious diseases monitored by routine surveillance.pdf',
 'A time-series study of the association of rainfall_ relative humidity and ambient temperature with hospitalizations for rotavirus and norovirus infection among children in Hong Kong.pdf',
 'Weather Variability and the Incidence of Cryptosporidiosis_ Comparison of Time Series Poisson Regression and SARIMA Models

### Define paperQA settings

In [None]:
from paperqa.settings import Settings, AgentSettings

MODEL = "ollama/llama3.2"
EMBEDDING = "ollama/mxbai-embed-large:latest"

"""Configure settings for paperqa."""
local_llm_config = {
    "model_list": [
        {
            "model_name": MODEL,
            "litellm_params": {
                "model": MODEL,
                "api_base": "http://0.0.0.0:11434",
            },
            "answer": {
                "evidence_k": 40,
                "evidence_detailed_citations": True,
                "evidence_summary_length": "about 100 words",
                "answer_max_sources": 10,
                "answer_length": "about 600 words, but can be longer",
                "max_concurrent_requests": 10,
                "answer_filter_extra_background": False
            },
            "parsing": {
                "use_doc_details": True
            }
        }
    ]
}

settings = Settings(
    llm=MODEL,
    llm_config=local_llm_config,
    summary_llm=MODEL,
    summary_llm_config=local_llm_config,
    paper_directory=str(PAPERS_DIRECTORY),
    embedding=EMBEDDING,
    agent=AgentSettings(
        agent_llm=MODEL,
        agent_llm_config=local_llm_config,
        return_paper_metadata=True
    )
)

### Create Embeddings and save in PKL

In [None]:
embeddings.process_papers(settings, PAPERS_DIRECTORY, pdf_file_list)
print("Embeddings created and saved to my-embeddings.pkl")

### Inspect Documents Object

In [None]:
def get_doc_details(doc_detail, verbose=False):
    def get_safe_key(doc_detail, key):
        try:
            return getattr(doc_detail, key)
        except AttributeError:
            return None

    details_dict = {
        "key": get_safe_key(doc_detail, "key"),
        "docname":get_safe_key(doc_detail, "docname"),
        "title": get_safe_key(doc_detail, "title"),
        "authors": get_safe_key(doc_detail, "authors"),
        "year": get_safe_key(doc_detail, "year"),
        "journal": get_safe_key(doc_detail, "journal"),
        "volume": get_safe_key(doc_detail, "volume"),
        "pages": get_safe_key(doc_detail, "pages"),
        "doi": get_safe_key(doc_detail, "doi"),
        "url": get_safe_key(doc_detail, "url"),
    }

    # Print for informative purposes
    if verbose:
        for k, v in details_dict.items():
            print(f"{k}: {v}")

    return details_dict

for doc_key in embeddings.docs.docs.keys():
    doc_detail = embeddings.docs.docs.get(doc_key)
    print(doc_detail)
    get_doc_details(doc_detail, verbose=True)
    print("-" * 80)