# Loading Data

In [None]:
# pip install sentence-transformers
# pip install langchain

In [None]:
from pathlib import Path
from minedd.document import DocumentPDF, DocumentMarkdown
from IPython.display import Markdown, display

# Folder under the user's home directory that holds the PDFs for this notebook.
PAPERS_DIR = Path.home().joinpath("papers_minedd")

# The single paper that the rest of the notebook works on.
test_paper = PAPERS_DIR.joinpath(
    "Seasonality of rotavirus disease in the tropics_ a systematic review and meta-analysis.pdf"
)

# The path is handed to DocumentPDF as a plain string rather than a Path.
pdf_paper = DocumentPDF(pdf_path=str(test_paper))

In [None]:
# Extract chunks with `get_grobid_chunks` and, if any came back, preview them:
# how many there are, which metadata keys the first one carries, and the
# first chunk itself.
docs = pdf_paper.get_grobid_chunks()
chunk_count = len(docs)
if chunk_count > 0:
    first_chunk = docs[0]
    print(chunk_count)
    print(first_chunk.metadata.keys())
    print(first_chunk)

## PDF to Chunks

### JSON Format = REQUIRES GROBID!

In [None]:
import json
# Request the chunks as plain dictionaries (presumably grouped by paper
# section, going by the flag name) so they can be serialised with `json`.
docs = pdf_paper.get_grobid_chunks(return_as_dict=True, group_dict_by_section=True)

# Make sure the output folder exists; otherwise the write below fails with
# FileNotFoundError on a fresh checkout.
grobid_json_path = Path("outputs/test_paper_grobid.json")
grobid_json_path.parent.mkdir(parents=True, exist_ok=True)
with grobid_json_path.open("w", encoding="utf-8") as f:
    json.dump(docs, f, indent=4)

### Text or LangChain Docs

In [None]:
# docs = pdf_paper.get_chunks(as_langchain_docs=True)
# print(len(docs))
# print(docs[0])

## PDF Tables

In [None]:
# Collect the tables that `get_document_tables` finds in the PDF.
tables = pdf_paper.get_document_tables()
# Bare expression on purpose: as the last line of the cell, the number of
# tables is shown as the cell output.
len(tables)

In [None]:
import pandas as pd
# Printed after every table so consecutive tables are easy to tell apart.
TABLE_SEPARATOR = "\n========================================\n"

# For each extracted table: render it as Markdown, then round-trip it through
# a plain dict back into a DataFrame to check that nothing is lost on the way.
for table in tables:
    table_markdown = table.to_markdown()
    table_as_dict = table.to_dict()
    rebuilt_table = pd.DataFrame(table_as_dict)

    display(Markdown(table_markdown))
    print(table_as_dict)
    print(rebuilt_table.head(10))
    print(TABLE_SEPARATOR)

## PDF to Markdown

In [None]:
# Single source of truth for the cached Markdown file. It is kept as the same
# string literal as before because it is also passed on as `md_path`.
MARKDOWN_PATH = "outputs/paper_text.md"

# Reuse the cached Markdown when the file can be loaded. The code relies on
# DocumentMarkdown raising FileNotFoundError when it is missing; in that case
# convert the PDF and cache the result for later runs.
try:
    markdown_text = DocumentMarkdown(md_path=MARKDOWN_PATH).get_markdown()
except FileNotFoundError:
    markdown_text = pdf_paper.get_markdown()
    # The output folder may not exist yet on a fresh checkout; without this the
    # cache write would raise a second FileNotFoundError inside the handler.
    Path(MARKDOWN_PATH).parent.mkdir(parents=True, exist_ok=True)
    # NOTE(review): written with the platform default encoding; confirm which
    # encoding DocumentMarkdown reads with before pinning one here.
    with open(MARKDOWN_PATH, "w") as f:
        f.write(markdown_text)

markdown_paper = DocumentMarkdown(md_content=markdown_text, md_path=MARKDOWN_PATH)
# display(Markdown(markdown_text))

In [None]:
# Not satisfactory: the UnstructuredMarkdownLoader output below was not good enough, so we will write our own Markdown chunker.
# # pip install langchain-community
# # pip install unstructured
# # pip install markdown

# from langchain_community.document_loaders import UnstructuredMarkdownLoader

# loader = UnstructuredMarkdownLoader(
#     "outputs/paper_text.md",
#     mode="elements",
#     strategy="fast",
# )

# docs = loader.load()
# for doc in docs:
#     print(doc.metadata)
#     print(len(doc.page_content))
#     print(doc.page_content[:100])  # Print the first 100 characters of the content
#     print(doc.page_content[-100:])  # Print the last 100 characters of the content
#     print("\n---\n")  # Separator for clarity

In [None]:
# Show the last 1000 characters of the Markdown requested with
# `only_text=True, remove_references=True` (presumably to eyeball that the
# reference list is gone from the tail; confirm against DocumentMarkdown).
# Bare expression on purpose: it is the cell output.
markdown_paper.get_markdown(only_text=True, remove_references=True)[-1000:]

### Get Chunks from Markdown

In [None]:
# TODO: paginate markdown before passing it to the splitter (just pass each page independently) and keep the chunk metadata
# Split the Markdown paper into chunks; `chunks` is reused by the RAG cells below.
chunks = markdown_paper.convert_to_chunks(mode="chars", chunk_size=1500, overlap=100)
print(len(chunks))

# Dump every chunk to a text file for manual inspection.
chunks_path = Path("outputs/paper_chunks.txt")
# Create the output folder if needed so the write does not fail on a fresh checkout.
chunks_path.parent.mkdir(parents=True, exist_ok=True)
# Explicit UTF-8: paper text routinely contains non-ASCII characters, which the
# platform default encoding (e.g. cp1252 on Windows) may be unable to write.
with chunks_path.open("w", encoding="utf-8") as f:
    f.writelines(
        f"\n----- Chunk {i} (Size {len(chunk)} chars) -----\n{chunk}\n"
        for i, chunk in enumerate(chunks, start=1)
    )

### Quick RAG

In [None]:
# pip install chromadb
# pip install sentence-transformers
import chromadb
from sentence_transformers import SentenceTransformer

# Open the on-disk store FIRST. The collection is persisted under
# outputs/chroma_db and therefore survives kernel restarts, so a client must
# exist before anything left over from an earlier run can be removed.
client = chromadb.PersistentClient(path="outputs/chroma_db")

# Best-effort reset so that re-running this cell starts from an empty
# collection. The exception raised for a missing collection has varied across
# chromadb releases, hence the broad catch around this single call; any real
# storage problem still surfaces in `create_collection` right below.
try:
    client.delete_collection(name="paper_chunks")
except Exception:
    pass

paper_collection = client.create_collection(name="paper_chunks")
text_embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Source and title are the same for every chunk, so look them up once.
chunk_source = str(markdown_paper.md_path)
chunk_title = markdown_paper.get_title()

# One metadata record per chunk; chunk ids are 1-based.
chunk_texts = list(chunks)
chunk_metadatas = [
    {"chunk_id": chunk_number, "source": chunk_source, "title": chunk_title}
    for chunk_number in range(1, len(chunk_texts) + 1)
]

# Embed and insert all chunks in one batch. Skipped when there are no chunks,
# so nothing is added to the collection in that case.
if chunk_texts:
    chunk_embeddings = text_embedding_model.encode(chunk_texts)
    paper_collection.add(
        ids=[f"chunk_{metadata['chunk_id']}" for metadata in chunk_metadatas],
        documents=chunk_texts,
        embeddings=chunk_embeddings.tolist(),
        metadatas=chunk_metadatas,
    )

# Echo exactly the metadata that was stored.
for metadata in chunk_metadatas:
    print(metadata)

In [None]:
def semantic_search(query, collection, top_k=3, *, model=None):
    """Return the ``top_k`` entries of ``collection`` closest to ``query``.

    Args:
        query: Text to search for.
        collection: Object exposing ``query(query_embeddings=..., n_results=...)``,
            such as the Chroma collection built in this notebook.
        top_k: Number of results to request.
        model: Embedding model exposing ``encode``. Defaults to the
            notebook-level ``text_embedding_model``. Pass the same model that
            was used to index ``collection``, otherwise the query vector is
            not comparable with the stored ones.

    Returns:
        Whatever ``collection.query`` returns, unmodified.
    """
    if model is None:
        # Resolved at call time so existing three-argument calls keep working.
        model = text_embedding_model
    query_embedding = model.encode(query)
    # The query API takes a batch of embeddings; wrap the single vector in a list.
    return collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=top_k,
    )
 
# def generate_response(query, context):
#     prompt = f"Query: {query}\nContext: {context}\nAnswer:"
#     response = completion(
#         model="gemini/gemini-1.5-flash",
#         messages=[{"content": prompt, "role": "user"}],
#         api_key=gemini_api_key
#     )
#     return response['choices'][0]['message']['content']

# Example usage: run one query and print the three best-matching chunks.
query = "Is rotavirus in waterbourne surfaces?"
results = semantic_search(query, paper_collection, top_k=3)

# Only one query was sent, so the hits are in the first (and only) inner list
# of both 'documents' and 'metadatas'.
for rank, content in enumerate(results['documents'][0], start=1):
    print(f"Result {rank}:")
    hit_metadata = results['metadatas'][0][rank - 1]
    print(f"Chunk ID: {hit_metadata['chunk_id']}")
    print(f"Source: {hit_metadata['source']}")
    print(f"Title: {hit_metadata['title']}")
    print(f"Content: {content}\n")