# Embeddings Module

## Install necessary modules

In [1]:
%pip install -U -e ..
%pip install -e ..

Obtaining file:///Users/jose/Repos/MINE-DD
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: minedd
  Building editable for minedd (pyproject.toml) ... [?25ldone
[?25h  Created wheel for minedd: filename=minedd-0.1.0-0.editable-py3-none-any.whl size=14034 sha256=15317f8fa21aec87792e16f5bde4774f19878dd4cef20c23eec315e0945e1131
  Stored in directory: /private/var/folders/79/zf67ls7520x9m4mj7nx6q07w0000gp/T/pip-ephem-wheel-cache-3woj_ipx/wheels/c7/bf/1e/e7790fb2ba7cdeeb0fd2dc5eb82103f3cd4a31d3c897656e15
Successfully built minedd
Installing collected packages: minedd
  Attempting uninstall: minedd
    Found existing installation: minedd 0.1.0
    Uninstalling minedd-0.1.0:
      Successfully uninstalled minedd-0.1.0
Successfully installed minedd-0.1.0


In [2]:
# Needed to run asyncio code inside a Jupyter notebook, where an event loop
# is already running; without it such code fails with
# "RuntimeError: This event loop is already running".
# NOTE(review): presumably this must run before any cell that triggers async
# code — keep it near the top of the notebook.
import nest_asyncio
nest_asyncio.apply()

## Load Embeddings object

In [3]:
from minedd.embeddings import Embeddings
from pathlib import Path

# --- Configuration: everything a reader might want to tune lives here ---
# Model identifiers handed to Embeddings (both carry the "ollama/" prefix).
MODEL = "ollama/llama3.2"
EMBEDDING = "ollama/mxbai-embed-large:latest"
# Folder containing the PDFs to process, resolved under the user's home directory.
PAPERS_DIRECTORY = Path.home() / "papers_minedd/"
# Pickle file path passed to Embeddings as output_embeddings_path
# (relative to the notebook's working directory). Kept as a plain string so
# the value passed to Embeddings is unchanged.
OUTPUT_EMBEDDINGS_PATH = "outputs/my-embeddings.pkl"

embeddings = Embeddings(
    model=MODEL,
    embedding_model=EMBEDDING,
    paper_directory=PAPERS_DIRECTORY,
    output_embeddings_path=OUTPUT_EMBEDDINGS_PATH,
)
# Bare last expression: display the object's repr as the cell output.
embeddings

Embeddings(output_embeddings_path=outputs/my-embeddings.pkl, total_docs=0, total_chunks=0)

## Load papers for processing

In [4]:
# Ask the Embeddings object for the papers it will process.
pdf_file_list = embeddings.prepare_papers()

# Report how many files were found ...
print(len(pdf_file_list))

# ... and preview at most the first ten entries as the cell output.
pdf_file_list[0:10]

5


['Seasonality of Rotavirus Hospitalizations at Costa Rica_s National Children_s Hospital in 2010_2015.pdf',
 'Seasonality of rotavirus disease in the tropics_ a systematic review and meta-analysis.pdf',
 'Seasonality of Rotavirus in South Asia_ A Meta-Analysis Approach Assessing Associations with Temperature_ Precipitation_ and Vegetation Index.pdf',
 'Global Seasonality of Rotavirus Disease.pdf',
 'Are hospitalizations for rotavirus gastroenteritis associated with meteorologic factors_.pdf']

## Create or load embeddings (cached as a pickle file)

In [5]:
import os
# Path of the pickled embeddings file (a file, despite the variable's name).
embeddings_dir = "outputs/my-embeddings.pkl"

# Only process the papers when no pickle exists yet; otherwise reuse the
# stored embeddings.
if not os.path.exists(embeddings_dir):
    embeddings.process_papers(pdf_file_list)
    # NOTE(review): this message assumes process_papers writes to the same
    # path that was configured on the Embeddings object — confirm.
    print(f"Embeddings created and saved to {embeddings_dir}")
else:
    embeddings.load_existing_embeddings(embeddings_dir)

Loaded existing embeddings from outputs/my-embeddings.pkl


## Inspect Documents Object

In [6]:
# Fetch the details of the loaded documents and show them as the cell output
# (bare last expression, so the notebook's rich display is used).
# NOTE(review): presumably a pandas DataFrame, judging by the variable name — confirm.
detail_df = embeddings.get_docs_details()
detail_df

Unnamed: 0,doc_key,key,docname,title,authors,year,journal,volume,pages,doi,url,file_location,citation_count,source_quality
0,8c73cc305a626c84f88e03fe5563b2ce,,Seasonality of Rotavirus Hospitalizations at C...,,,,,,,,,,,
1,f57b64122eb55199,karen2009seasonalityofrotavirus,karen2009seasonalityofrotavirus,Seasonality of rotavirus disease in the tropic...,"[Levy, Karen, Alan E Hubbard, Joseph NS Eisenb...",2009.0,International journal of epidemiology,38 6,\n1487-96\n,10.1093/ije/dyn260,https://academic.oup.com/ije/article-pdf/38/6/...,,267.0,3.0
2,c127a43a8472a75f908ffbb2617c1af2,jsUnknownyearseasonalityofrotavirus,Seasonality of Rotavirus in South Asia A Meta ...,Seasonality of Rotavirus in South Asia: A Meta...,"[Jagai JS, Sarkar R, Castronovo D, Kattula D, ...",,,,,,,,,
3,8f1ccb503494a254,patel2013globalseasonalityof,patel2013globalseasonalityof,Global Seasonality of Rotavirus Disease,"[Manish M. Patel, Virginia E. Pitzer, Wladimir...",2013.0,Pediatric Infectious Disease Journal,32,e134-e147,10.1097/inf.0b013e31827d3b68,https://doi.org/10.1097/inf.0b013e31827d3b68,,155.0,1.0
4,2de90bd8cde7df74,hervas2014arehospitalizationsfor,hervas2014arehospitalizationsfor,Are hospitalizations for rotavirus gastroenter...,"[D. Hervás, J. Hervás-Masip, A. Rosell, A. Men...",2014.0,European Journal of Clinical Microbiology &amp...,33,1547-1553,10.1007/s10096-014-2106-y,https://doi.org/10.1007/s10096-014-2106-y,,22.0,-1.0


In [7]:
# Report the overall size of the embedding store: number of chunks and the
# length of the first chunk's embedding vector.
n_chunks = len(embeddings.docs.texts)

# Take just the first chunk (None when nothing has been loaded) instead of
# entering a loop only to break out of it immediately.
first_chunk = next(iter(embeddings.docs.texts), None)

if first_chunk is None:
    # No chunks available: define emb_dims so the summary line below still
    # works instead of failing with a NameError.
    emb_dims = 0
else:
    emb_dims = len(first_chunk.embedding)
    print(len(first_chunk.text))      # character length of the chunk text
    print(first_chunk.doc.docname)    # name of the document the chunk belongs to

print(f"Embeddings Dimensions [{n_chunks}, {emb_dims}]")

2500
Seasonality of Rotavirus Hospitalizations at Costa Rica s National Children s Hospital in 2010 2015
Embeddings Dimsenions [193, 1024]


## Query Specific Doc

In [11]:
# Document used for all the chunk queries in this cell.
QUERY_DOCNAME = "karen2009seasonalityofrotavirus"

# 1) No page filter: list the page range of every chunk of the document.
chunks = embeddings.get_document_chunks(docname=QUERY_DOCNAME)
print(len(chunks))
for chunk in chunks:
    print(chunk.pages)
    print("\n")

assert len(chunks) == 64, f"expected 64 chunks, got {len(chunks)}"

# 2) Restrict the query to pages 1 and 2.
chunks = embeddings.get_document_chunks(docname=QUERY_DOCNAME, pages=[1, 2])
assert len(chunks) == 7, f"expected 7 chunks on pages 1-2, got {len(chunks)}"

# 3) A page number that matches nothing plus a non-numeric entry: no chunks
#    should come back.
chunks = embeddings.get_document_chunks(docname=QUERY_DOCNAME, pages=[1000, "-"])
assert len(chunks) == 0, f"expected no chunks, got {len(chunks)}"

# 4) Print the text of every chunk returned for page 6, one quoted line at a time.
chunks = embeddings.get_document_chunks(docname=QUERY_DOCNAME, pages=[6])
for ch in chunks:
    print(f"---- Pages {ch.pages} ----")
    # Plain loop rather than a list comprehension: the prints are side
    # effects and no list of None values needs to be built.
    for line in ch.text.split("\n"):
        print("> ", line)
    print("\n")

64
1-1


1-1


1-2


2-2


2-2


2-2


2-3


3-3


3-3


3-3


3-3


3-4


4-4


4-4


4-4


4-4


4-4


4-4


4-4


4-4


4-4


4-4


4-5


5-5


5-5


5-5


5-5


5-5


5-5


5-5


5-6


6-6


6-6


6-6


6-7


7-7


7-7


7-7


7-7


7-7


7-7


7-7


7-7


7-7


7-7


7-7


7-7


7-7


7-7


7-7


7-7


7-7


7-7


7-8


8-8


8-8


8-8


8-9


9-9


9-9


9-9


9-10


10-10


10-10


---- Pages 5-6 ----
>                                                                                   regression        month.     dence of rotavirus.                                                                         Samoa                      of
>                                                                                                                                          every    Discussion                                                                                                                            slope        The results of this review suggest that numbers of        