## Using PaperQA 

In [None]:
from pathlib import Path
from paperqa.readers import parse_pdf_to_pages

# Folder under the user's home directory that holds the PDFs for this notebook.
PAPERS_DIR = Path.home().joinpath("papers_minedd")

# The single paper that the sections below parse.
test_paper = PAPERS_DIR.joinpath(
    "Seasonality_of_rotavirus_disease_in_the_tropics_a_systematic_review_and_meta-analysis.pdf"
)

# The path is handed over as a string; the bare name on the last line makes the
# notebook display the parsed result.
parsed_text = parse_pdf_to_pages(str(test_paper))
parsed_text

In [None]:
# Show the first 100 characters of every parsed page on a single line.
for page_key, page_text in parsed_text.content.items():
    preview = page_text[:100].replace('\n', ' ')
    print(f"Page {page_key}:", preview, "--------", sep="\n")

## Using PyMuPDF Directly

In [None]:
import pymupdf4llm

# Table strategies: https://pymupdf.readthedocs.io/en/latest/page.html#Page.find_tables
# The path is passed as a plain string, matching how the other parsers in this
# notebook are called (they all receive str(test_paper)).
md_text = pymupdf4llm.to_markdown(
    str(test_paper),
    page_chunks=False,
    table_strategy="lines",
    embed_images=False,
)
# Persist the markdown as UTF-8 bytes; the number of bytes written is the
# value this cell displays.
Path("output_pymupdf.md").write_bytes(md_text.encode("utf-8"))

In [None]:
# Display the character count together with the full markdown text.
(len(md_text), md_text)

# Extract PDF Tables with GMFT

In [None]:
from gmft.auto import AutoTableDetector, AutoTableFormatter
from gmft.pdf_bindings import PyPDFium2Document

def save_tables_in_multiple_formats(tables, output_dir="output_tables"):
    """Write each extracted table to its own CSV file.

    Only CSV is written at present; the JSON export is disabled (see the
    comment in the loop).

    Args:
        tables: Iterable of table objects exposing ``df()``, whose result
            offers a pandas-style ``to_csv(path, index=...)`` method.
        output_dir: Directory that receives the files. It is created
            (including parents) when missing. Defaults to ``"output_tables"``.

    Files are named ``output_table_<index>.csv`` following iteration order.
    """
    target_dir = Path(output_dir)
    # Without this, the first to_csv call fails on a fresh checkout because the
    # directory does not exist yet.
    target_dir.mkdir(parents=True, exist_ok=True)
    for index, table in enumerate(tables):
        table.df().to_csv(target_dir / f"output_table_{index}.csv", index=False)
        # JSON export (disabled):
        # table.df().to_json(target_dir / f"output_table_{index}.json", orient='records')

def extract_tables(pdf_path):
    """Detect and format every table found in a PDF.

    Args:
        pdf_path: Location of the PDF; passed straight to ``PyPDFium2Document``.

    Returns:
        A list with one ``AutoTableFormatter.extract`` result per detected
        table, in page order.
    """
    table_detector = AutoTableDetector()
    table_formatter = AutoTableFormatter()
    document = PyPDFium2Document(pdf_path)

    # First gather the detections of all pages, then format them in one pass.
    # NOTE(review): the document is never closed here; presumably the returned
    # tables still need it when df() is called later - confirm before closing.
    detected = []
    for pdf_page in document:
        detected.extend(table_detector.extract(pdf_page))
    return list(map(table_formatter.extract, detected))

# Extract the tables from the example paper and write them to disk.
# The path is converted to str, matching how the other parsers in this
# notebook receive it.
tables = extract_tables(str(test_paper))
save_tables_in_multiple_formats(tables)

# Use Marker

## Full PDF into Markdown

In [None]:
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from marker.config.parser import ConfigParser

# Configure Marker to use Ollama as the LLM service
config = {
    "output_format": "markdown",
    "use_llm": True,
    "llm_service": "marker.services.ollama.OllamaService",
    "ollama_model": "llama3.2:latest",  # Specify which model you want to use
    "ollama_base_url": "http://localhost:11434"  # Default Ollama URL
}

# Create config parser
config_parser = ConfigParser(config)

# Initialize the PDF converter; every component is derived from the same
# config parser so they agree with the settings above.
converter = PdfConverter(
    config=config_parser.generate_config_dict(),
    artifact_dict=create_model_dict(),
    processor_list=config_parser.get_processors(),
    renderer=config_parser.get_renderer(),
    llm_service=config_parser.get_llm_service()
)

# Convert PDF to markdown
pdf_path = str(test_paper)
rendered = converter(pdf_path)

# Extract the markdown text and images (the middle element is not used here)
text, _, images = text_from_rendered(rendered)

# Report the size of the generated markdown
print(len(text))

# Save the markdown. The encoding is explicit so non-ASCII characters are
# written identically on every platform instead of depending on the locale.
with open("output_marker.md", "w", encoding="utf-8") as f:
    f.write(text)


# Use MistralAI
(Paid API — requires a `MISTRAL_API_KEY` environment variable.)

In [None]:
## Source: https://github.com/amayuelas/corpus-automation/blob/main/parse_pdf_mistral.ipynb
import os
import argparse
import base64
from pathlib import Path
from mistralai import Mistral
from mistralai import DocumentURLChunk
import json
import time

def pdf2markdown(pdf_file: Path, output_dir: Path, client: Mistral):
    """Run Mistral OCR on a single PDF and save the results to ``output_dir``.

    The following files are written:
      * ``response.json``  - the full OCR response serialised as JSON
      * ``mistral_images/`` - one file per extracted image, named by image id
      * ``text.txt``       - the markdown of all pages, separated by blank lines

    Args:
        pdf_file: Path to the PDF file to process
        output_dir: Directory where results will be saved (created if missing)
        client: Mistral client instance

    Returns:
        The OCR response object returned by ``client.ocr.process``.
    """
    print(f"Processing {pdf_file} ...")

    # Upload PDF file to Mistral's OCR service
    uploaded_file = client.files.upload(
        file={
            "file_name": pdf_file.name,
            "content": pdf_file.read_bytes(),
        },
        purpose="ocr",
    )

    # Get URL for the uploaded file
    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

    # Process PDF with OCR, including embedded images
    pdf_response = client.ocr.process(
        document=DocumentURLChunk(document_url=signed_url.url),
        model="mistral-ocr-latest",
        include_image_base64=True
    )

    # Round-trip through the model's own JSON dump to get a plain dict
    response_dict = json.loads(pdf_response.model_dump_json())

    # Save response to JSON file
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "response.json", "w", encoding="utf-8") as f:
        json.dump(response_dict, f)

    # Save images to files named after their ids
    images_dir = output_dir / "mistral_images"
    images_dir.mkdir(exist_ok=True)

    for page in pdf_response.pages:
        for img in page.images:
            # Keep only the payload after the first comma (the data-URI
            # prefix); a string without a comma is used as-is instead of
            # raising IndexError.
            _, separator, payload = img.image_base64.partition(',')
            img_data = payload if separator else img.image_base64
            # Decode and save image
            img_bytes = base64.b64decode(img_data)
            with open(images_dir / img.id, "wb") as f:
                f.write(img_bytes)

    # Save raw text. Pages are separated by a blank line so the last line of
    # one page does not run into the first line of the next.
    with open(output_dir / "text.txt", "w", encoding="utf-8") as f:
        f.write("\n\n".join(page.markdown for page in pdf_response.pages))

    return pdf_response

In [None]:
from mistralai import ImageURLChunk, TextChunk
def extract_page_struct(image_ocr_markdown: str, page_index: int, output_dir: Path, client: Mistral):
    """Ask a Mistral chat model to turn one page of OCR markdown into JSON.

    On success the parsed result is also written to
    ``output_dir / response_page_<page_index>.json``.

    Args:
        image_ocr_markdown: OCR markdown of a single page.
        page_index: Index of the page; used in the output file name and in
            the error message.
        output_dir: Directory receiving the JSON file (created if missing).
        client: Mistral client instance.

    Returns:
        The parsed JSON response, or an empty dict when the model's answer is
        not valid JSON.
    """
    # The sentences are separate string literals joined by Python, so each one
    # must end with its own punctuation and trailing space.
    instructions = (
        f"This is image's OCR in markdown:\n\n{image_ocr_markdown}\n.\n"
        "Convert this into a sensible structured json response. "
        "Pay special attention to table content. "
        "The output should strictly be json with no extra commentary."
    )

    # Get structured response from model
    chat_response = client.chat.complete(
        model="mistral-large-latest",  # alternative: "ministral-8b-latest"
        messages=[
            {
                "role": "user",
                "content": [TextChunk(text=instructions)],
            }
        ],
        response_format={"type": "json_object"},
        temperature=0,
    )

    # Parse the JSON response; a malformed answer is reported, not raised
    try:
        response_dict = json.loads(chat_response.choices[0].message.content)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for page {page_index}: {e}")
        return {}

    # Make sure the target directory exists even when this function is used
    # without a preceding pdf2markdown call.
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / f"response_page_{page_index}.json", "w", encoding="utf-8") as f:
        json.dump(response_dict, f, indent=4)
    return response_dict

In [None]:
# Switch for the Mistral OCR call; while False nothing below the guard runs.
RUN_MISTRAL_OCR = False

output_dir = Path('mistral_ocr/')
pdf_response = None

if RUN_MISTRAL_OCR:
    from dotenv import load_dotenv

    # Values from a .env file take precedence over the existing environment.
    load_dotenv(override=True)
    mistral_api_key = os.environ.get('MISTRAL_API_KEY')

    # Guard clause: stop early when no key is configured.
    if not mistral_api_key:
        raise ValueError("Please set the MISTRAL_API_KEY environment variable.")

    client = Mistral(api_key=mistral_api_key)
    pdf_response = pdf2markdown(test_paper, output_dir, client)

In [None]:
from mistralai.models import OCRResponse
from IPython.display import Markdown, display

def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """
    Point every image placeholder at its base64 data instead of its id.

    A placeholder has the form ``![<id>](<id>)``; only the part in
    parentheses is replaced, and only for ids present in ``images_dict``.

    Args:
        markdown_str: Markdown text containing image placeholders
        images_dict: Dictionary mapping image IDs to base64 strings

    Returns:
        The markdown with each known placeholder's target replaced by the
        corresponding base64 string
    """
    result = markdown_str
    for image_id in images_dict:
        placeholder = f"![{image_id}]({image_id})"
        embedded = f"![{image_id}]({images_dict[image_id]})"
        result = result.replace(placeholder, embedded)
    return result

def get_combined_markdown(ocr_response: OCRResponse) -> list[str]:
    """
    Build the markdown of every page with its images embedded.

    Args:
        ocr_response: Response from OCR processing containing text and images

    Returns:
        One markdown string per page, in page order, where each image
        placeholder points at that page's base64 image data
    """
    # For each page: map image id -> base64 data, then substitute the
    # placeholders found in that page's markdown.
    return [
        replace_images_in_markdown(
            page.markdown,
            {img.id: img.image_base64 for img in page.images},
        )
        for page in ocr_response.pages
    ]

# Display the combined per-page markdown (with embedded images) when an OCR
# response is available; otherwise leave an empty list.
if pdf_response:
    markdowns = get_combined_markdown(pdf_response)
    markdown_str = "\n\n".join(markdowns)
    display(Markdown(markdown_str))
else:
    markdowns = []

In [None]:
# Feed each page's own markdown to the structuring model. The combined
# `markdowns` are deliberately NOT used: their image placeholders were replaced
# by base64 payloads, which are useless for this step.
if pdf_response:
    for page_index, page in enumerate(pdf_response.pages):
        print(f"Processing page {page_index} ...")
        extract_page_struct(page.markdown, page_index, output_dir, client)

In [None]:
# Inspect the OCR response saved by pdf2markdown. The file only exists after
# an OCR run, so guard the read to keep "Restart & Run All" working while
# RUN_MISTRAL_OCR is False.
response_json_path = output_dir / "response.json"
if response_json_path.exists():
    mistral_json = json.loads(response_json_path.read_text(encoding="utf-8"))
    for page_json in mistral_json['pages']:
        print(page_json.keys())
    print(len(mistral_json['pages']))
else:
    mistral_json = None
    print(f"{response_json_path} not found; set RUN_MISTRAL_OCR = True to generate it.")

# Use LLM to Extract Claims

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

# Prompt + model pipeline used by the cells below to extract claims.
# `{text_chunk}` is the only placeholder; it is filled via
# chain.invoke({"text_chunk": ...}).
template = """Here is a paragraph with some information : {text_chunk}
This paragraph has one or more claims inside it. Provide me with a list of the claims in the paragraph.
The response should only be one claim per line, no other text.
Each claim should be a precise sentence pointing to a fact. 
Stick as much as possible to the literal text.
Do not infer claims that are not explicitly stated in the text.
Each claim in the list should be separated by a new line and not contain any other text or number.

Claims: 

"""    

# Pipe the formatted prompt into the Ollama model; `chain` is what later cells invoke.
prompt = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model="llama3.2:latest")
chain = prompt | model


In [None]:
def get_paragraphs(text):
    """
    Split text into paragraphs, one per non-trivial line.

    Each line is stripped of surrounding whitespace (including a trailing
    carriage return) before it is examined, so blank-looking lines are never
    returned as paragraphs. Lines of at most one character and lines starting
    with "Question" are skipped. Collection stops at the first line starting
    with "References".

    Args:
        text: The text to split.

    Returns:
        List of stripped paragraph strings, in their original order.
    """
    paragraphs = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if len(line) <= 1 or line.startswith("Question"):
            continue
        if line.startswith("References"):
            # Everything after the reference heading is bibliography.
            break
        paragraphs.append(line)
    return paragraphs


# Sample PaperQA answer used to exercise the claim extraction below: a question
# line, four answer paragraphs, then a "References" section.
example_paperQA_output = """
Question: How does the seasonality of rotavirus differ between tropical and temperate climates?

The seasonality of rotavirus differs between tropical and temperate climates. In temperate zones, rotavirus is more common in cooler months, with a strong winter peak observed primarily in the Americas (Seasonality of Rotavirus in South Asia_ A Meta-Analysis Approach Assessing Associations with Temperature_ Precipitation_ and Vegetation Index.pdf pages 1-2). However, in tropical regions, the pattern is less defined, and autumn/spring peaks are more common.

In tropical climates, rotavirus incidence responds to changes in climate, with the highest number of infections found at the colder and drier times of the year (levy2009seasonalityofrotavirus pages 1-1). Monthly rotavirus incidence is significantly negatively correlated with temperature, rainfall, and relative humidity in the majority of studies reviewed (levy2009seasonalityofrotavirus pages 8-8).

In contrast to temperate areas, where rotavirus incidence often goes to zero in some months, tropical regions experience year-round rotavirus activity with peaks and valleys (levy2009seasonalityofrotavirus pages 6-6). The effect of seasonal changes on rotavirus incidence is not as extreme in the tropics as it is in temperate areas. Less climatic variability exists in tropical climates, which may explain why variations in climatological variables are not large enough to cause the observed effect (levy2009seasonalityofrotavirus pages 6-6).

Overall, the seasonality of rotavirus disease in tropical countries differs from that observed in temperate zones, with tropical regions experiencing year-round activity and responding to changes in climate (levy2009seasonalityofrotavirus pages 8-8).

References

1. (Seasonality of Rotavirus in South Asia_ A Meta-Analysis Approach Assessing Associations with Temperature_ Precipitation_ and Vegetation Index.pdf pages 1-2): Jagai, Jyotsna S., et al. "Seasonality of Rotavirus in South Asia: A Meta-Analysis Approach Assessing Associations with Temperature, Precipitation, and Vegetation Index." PLoS ONE, vol. 7, no. 5, 2012, doi:10.1371/journal.pone.0038168.

2. (levy2009seasonalityofrotavirus pages 1-1): K. Levy, A. E Hubbard, and J. N. Eisenberg. Seasonality of rotavirus disease in the tropics: a systematic review and meta-analysis. International journal of epidemiology, 38 6:1487-96, Dec 2009. URL: https://doi.org/10.1093/ije/dyn260, doi:10.1093/ije/dyn260. This article has 265 citations and is from a highest quality peer-reviewed journal.

3. (levy2009seasonalityofrotavirus pages 6-6): K. Levy, A. E Hubbard, and J. N. Eisenberg. Seasonality of rotavirus disease in the tropics: a systematic review and meta-analysis. International journal of epidemiology, 38 6:1487-96, Dec 2009. URL: https://doi.org/10.1093/ije/dyn260, doi:10.1093/ije/dyn260. This article has 265 citations and is from a highest quality peer-reviewed journal.

4. (levy2009seasonalityofrotavirus pages 8-8): K. Levy, A. E Hubbard, and J. N. Eisenberg. Seasonality of rotavirus disease in the tropics: a systematic review and meta-analysis. International journal of epidemiology, 38 6:1487-96, Dec 2009. URL: https://doi.org/10.1093/ije/dyn260, doi:10.1093/ije/dyn260. This article has 265 citations and is from a highest quality peer-reviewed journal.

"""
# Only the four answer paragraphs should survive: the question line is skipped
# and everything from "References" onwards is cut off.
text_chunks = get_paragraphs(example_paperQA_output)
assert len(text_chunks) == 4

In [None]:
# One raw LLM answer per paragraph. A comprehension with its own variable name
# is used so the global `text` (the Marker markdown from the earlier cell) is
# not overwritten by a loop variable.
claims = [chain.invoke({"text_chunk": text_chunk}) for text_chunk in text_chunks]
claims

In [None]:
def format_claim(claim_str):
    """
    Turn the raw LLM answer into a list of claims, one per non-empty line.

    The prompt asks for one claim per line, so the answer is split on every
    line break (blank lines between claims are tolerated). Surrounding
    whitespace is stripped and empty entries are dropped.

    Args:
        claim_str: Raw text returned by the model.

    Returns:
        List of claim strings; an empty list when ``claim_str`` is not a
        string or contains no claims.
    """
    if not isinstance(claim_str, str):
        # Best-effort behaviour: report the problem and carry on with no claims.
        print(f"Error splitting claims: expected a string, got {type(claim_str).__name__}")
        return []
    return [line.strip() for line in claim_str.splitlines() if line.strip()]
    

# Parse every raw answer, then print each paragraph followed by its claims.
# The loop variables use their own names so the raw `claims` list from the
# previous cell is not overwritten and this cell can be re-run safely.
claim_list = [format_claim(raw_claims) for raw_claims in claims]
for chunk, chunk_claims in zip(text_chunks, claim_list):
    print(f"Text: {chunk}\n")
    for claim in chunk_claims:
        print(claim)
    print("--------")

In [None]:
import json
# Maps "<page number>_<paragraph index>" to the paragraph preview and its claims.
paper_claims = {}

# Switch: set to True to run the LLM over every paragraph of the parsed PDF.
OBTAIN_PDF_CLAIMS = False

if OBTAIN_PDF_CLAIMS:
    for page_number, page_content in parsed_text.content.items():
        page_paragraphs = get_paragraphs(page_content)
        print(f"----- Found {len(page_paragraphs)} paragraphs in page {page_number} -----")
        for pi, paragraph in enumerate(page_paragraphs):
            # Paragraphs with fewer than 10 words are recorded without
            # querying the model.
            # Dedicated names keep the `claims` / `claim_list` results of the
            # earlier cells intact.
            if len(paragraph.split()) < 10:
                paragraph_claims = []
            else:
                raw_claims = chain.invoke({"text_chunk": paragraph})
                paragraph_claims = format_claim(raw_claims)
            print(f"Page {page_number} - Paragraph {pi} has {len(paragraph_claims)} claims")
            # Only the first 500 characters of the paragraph are stored.
            paper_claims[f"{page_number}_{pi}"] = {"text": paragraph[:500], "claims": paragraph_claims}

    # Save the claims to a JSON file
    with open("paper_claims.json", "w", encoding="utf-8") as f:
        json.dump(paper_claims, f, indent=4)
