# Loading Data

In [1]:
# pip install sentence-transformers
# pip install langchain

In [2]:
from pathlib import Path
from minedd.document import DocumentPDF, DocumentMarkdown
from IPython.display import Markdown, display

# Directory (under the user's home folder) that holds the PDFs used in this notebook.
PAPERS_DIR = Path.home().joinpath("papers_minedd")

# The single paper this notebook works on.
test_paper = PAPERS_DIR.joinpath(
    "Seasonality of rotavirus disease in the tropics_ a systematic review and meta-analysis.pdf"
)

# DocumentPDF is given the path as a plain string.
pdf_paper = DocumentPDF(pdf_path=str(test_paper))

  from .autonotebook import tqdm as notebook_tqdm


Loaded layout model s3://layout/2025_02_18 on device mps with dtype torch.float16
Loaded texify model s3://texify/2025_02_18 on device mps with dtype torch.float16
Loaded recognition model s3://text_recognition/2025_02_18 on device mps with dtype torch.float16
Loaded table recognition model s3://table_recognition/2025_02_18 on device mps with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_28 on device mps with dtype torch.float16
Loaded detection model s3://inline_math_detection/2025_02_24 on device mps with dtype torch.float16


## PDF to Markdown

In [3]:
# Location of the cached Markdown conversion of the paper. Kept as a plain string
# because it is passed to DocumentMarkdown as `md_path`.
MARKDOWN_PATH = "outputs/paper_text.md"

try:
    # Reuse the cached conversion when it is already on disk.
    markdown_text = DocumentMarkdown(md_path=MARKDOWN_PATH).get_markdown()
except FileNotFoundError:
    # No cache yet: convert the PDF and store the result for later runs.
    markdown_text = pdf_paper.get_markdown()
    # The output folder may not exist on a first run; without this, the open()
    # below would raise a second FileNotFoundError from inside this handler.
    Path(MARKDOWN_PATH).parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding so non-ASCII characters in the paper do not depend on the
    # platform default. NOTE(review): assumes DocumentMarkdown reads UTF-8 - confirm.
    with open(MARKDOWN_PATH, "w", encoding="utf-8") as f:
        f.write(markdown_text)

markdown_paper = DocumentMarkdown(md_content=markdown_text, md_path=MARKDOWN_PATH)
# display(Markdown(markdown_text))



In [4]:
# Not satisfactory; we will create our own Markdown chunker instead.
# # pip install langchain-community
# # pip install unstructured
# # pip install markdown

# from langchain_community.document_loaders import UnstructuredMarkdownLoader

# loader = UnstructuredMarkdownLoader(
#     "outputs/paper_text.md",
#     mode="elements",
#     strategy="fast",
# )

# docs = loader.load()
# for doc in docs:
#     print(doc.metadata)
#     print(len(doc.page_content))
#     print(doc.page_content[:100])  # Print the first 100 characters of the content
#     print(doc.page_content[-100:])  # Print the last 100 characters of the content
#     print("\n---\n")  # Separator for clarity

In [5]:
# Show the last 1000 characters of the text-only Markdown with references removed.
text_without_references = markdown_paper.get_markdown(only_text=True, remove_references=True)
text_without_references[-1000:]

'cal variables provide a better predictor of seasonality in the tropics, where weather patterns differ from those in the temperate zones. This review reveals a trend for rotavirus to occur in the cool, dry seasons in tropical countries, as observed in temperate zones. These results suggest that paying close attention to local climatic conditions will improve our understanding of the transmission and epidemiology of rotavirus disease.  # Funding  National Institutes of Allergy and Infectious Diseases (R01AI050038).  Conflict of interest: None declared.  # KEY MESSAGES  - Incidence of rotavirus disease responds to changes in climate in the tropics, with the highest number of infections found at the colder and drier times of the year. - Monthly rotavirus incidence is significantly negatively correlated with temperature, rainfall, and relative humidity in the majority of studies reviewed. - Rotavirus previously was not thought to respond to seasonal changes in tropical regions of the world

### Get Chunks from Markdown

In [6]:
# TODO: paginate markdown before passing it to the splitter (just pass each page independently) and keep the chunk metadata
chunks = markdown_paper.convert_to_chunks(mode="chars", chunk_size=1500, overlap=100)
print(len(chunks))

# Dump every chunk, with a numbered header (1-based) and its length, for manual inspection.
with open("outputs/paper_chunks.txt", "w") as chunk_file:
    chunk_file.writelines(
        f"\n----- Chunk {number} (Size {len(text)} chars) -----\n{text}\n"
        for number, text in enumerate(chunks, start=1)
    )

35


### Quick RAG

In [None]:
# pip install chromadb
# pip install sentence-transformers
import chromadb
from sentence_transformers import SentenceTransformer

# Create the client BEFORE touching the collection. Previously the delete ran while
# `client` was still undefined on a fresh kernel; the resulting NameError was silently
# swallowed, the persisted collection survived, and create_collection then failed.
client = chromadb.PersistentClient(path="outputs/chroma_db")

# Always start from an empty collection. get_or_create_collection ensures the
# collection exists, so the delete that follows works on both first and repeated runs
# without a catch-all exception handler.
client.get_or_create_collection(name="paper_chunks")
client.delete_collection(name="paper_chunks")
paper_collection = client.create_collection(name="paper_chunks")
text_embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# These values are the same for every chunk, so compute them once.
# NOTE(review): assumes get_title() returns the same value on every call - confirm.
chunk_source = str(markdown_paper.md_path)
paper_title = markdown_paper.get_title()

for i, chunk in enumerate(chunks):
    # Chunk ids are 1-based, both in the Chroma id and in the metadata.
    chunk_metadata = {"chunk_id": i + 1, "source": chunk_source, "title": paper_title}
    embedding = text_embedding_model.encode(chunk)
    paper_collection.add(
        ids=[f"chunk_{i + 1}"],
        documents=[chunk],
        embeddings=[embedding.tolist()],
        metadatas=[chunk_metadata],
    )
    # Print exactly the metadata that was stored.
    print(chunk_metadata)

{'chunk_id': 1, 'source': 'outputs/paper_text.md', 'title': 'SYSTEMATIC REVIEWS Seasonality of rotavirus disease in the tropics: a systematic review and meta-analysis'}
{'chunk_id': 2, 'source': 'outputs/paper_text.md', 'title': 'SYSTEMATIC REVIEWS Seasonality of rotavirus disease in the tropics: a systematic review and meta-analysis'}
{'chunk_id': 3, 'source': 'outputs/paper_text.md', 'title': 'SYSTEMATIC REVIEWS Seasonality of rotavirus disease in the tropics: a systematic review and meta-analysis'}
{'chunk_id': 4, 'source': 'outputs/paper_text.md', 'title': 'SYSTEMATIC REVIEWS Seasonality of rotavirus disease in the tropics: a systematic review and meta-analysis'}
{'chunk_id': 5, 'source': 'outputs/paper_text.md', 'title': 'SYSTEMATIC REVIEWS Seasonality of rotavirus disease in the tropics: a systematic review and meta-analysis'}
{'chunk_id': 6, 'source': 'outputs/paper_text.md', 'title': 'SYSTEMATIC REVIEWS Seasonality of rotavirus disease in the tropics: a systematic review and me

In [10]:
def semantic_search(query, collection, top_k=3):
    """Query ``collection`` with the embedding of ``query``.

    The query is encoded with the notebook-level ``text_embedding_model``; the
    resulting vector is converted to a list and passed to ``collection.query``
    together with ``n_results=top_k``.

    Args:
        query: Text to encode.
        collection: Object exposing ``query(query_embeddings=..., n_results=...)``.
        top_k: Number of results to request (default 3).

    Returns:
        Whatever ``collection.query`` returns, unchanged.
    """
    query_vector = text_embedding_model.encode(query).tolist()
    return collection.query(query_embeddings=[query_vector], n_results=top_k)
 
# def generate_response(query, context):
#     prompt = f"Query: {query}\nContext: {context}\nAnswer:"
#     response = completion(
#         model="gemini/gemini-1.5-flash",
#         messages=[{"content": prompt, "role": "user"}],
#         api_key=gemini_api_key
#     )
#     return response['choices'][0]['message']['content']

# Example usage
# NOTE(review): "waterbourne" looks like a misspelling of "waterborne"; it is left
# unchanged so the query matches the recorded output.
query = "Is rotavirus in waterbourne surfaces?"
results = semantic_search(query, paper_collection, top_k=3)

# The result lists are indexed by query first; only one query was sent, so take entry 0.
top_documents = results['documents'][0]
top_metadatas = results['metadatas'][0]

for rank, (doc, metadata) in enumerate(zip(top_documents, top_metadatas), start=1):
    print(f"Result {rank}:")
    print(f"Chunk ID: {metadata['chunk_id']}")
    print(f"Source: {metadata['source']}")
    print(f"Title: {metadata['title']}")
    print(f"Content: {doc}\n")

Result 1:
Chunk ID: 10
Source: outputs/paper_text.md
Title: SYSTEMATIC REVIEWS Seasonality of rotavirus disease in the tropics: a systematic review and meta-analysis
Content: | | Image: Background information on rotavirus seasonality                                    |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |                                                                                                                                                                                                                   