## Using PaperQA 

In [18]:
from pathlib import Path
from paperqa.readers import parse_pdf_to_pages

# Folder holding the PDFs to analyse, resolved under the current user's home directory.
PAPERS_DIR = Path.home().joinpath("papers_minedd")

# Paper used as the running example in the cells below.
test_paper = PAPERS_DIR.joinpath(
    "Seasonality of rotavirus disease in the tropics_ a systematic review and meta-analysis.pdf"
)

# The path is handed to the paperqa reader as a plain string; the bare name on
# the last line makes the notebook display the parsed result.
parsed_text = parse_pdf_to_pages(str(test_paper))
parsed_text

ParsedText(content={'1': 'Published by Oxford University Press on behalf of the International Epidemiological Association          International Journal of Epidemiology 2009;38:1487–1496\n\n  The Author 2008; all rights reserved. Advance Access publication 4 December 2008                                                  doi:10.1093/ije/dyn260\n\n\nSYSTEMATIC REVIEWS\nSeasonality of rotavirus disease in the tropics:\na systematic review and meta-analysis\n\nKaren Levy,1,2* Alan E Hubbard3 and Joseph NS Eisenberg2\n\n\n\n                   Accepted   4 November 2008\n\n                  Background To date little conclusive evidence exists on the seasonality of rotavirus\n                                     incidence in the tropics. We present a systematic review and meta-\n                                       analysis on the seasonal epidemiology of rotavirus in the tropics,\n                                     including 26 studies reporting continuous monthly rotavirus inci-\n      

In [19]:
# Show the first 100 characters of every parsed page on a single line.
for page_number, page_text in parsed_text.content.items():
    preview = page_text[:100].replace('\n', ' ')
    print(f"Page {page_number}:")
    print(preview)
    print("--------")

Page 1:
Published by Oxford University Press on behalf of the International Epidemiological Association     
--------
Page 2:
1488   INTERNATIONAL JOURNAL OF EPIDEMIOLOGY     Rotavirus  is believed to be spread predominantly  
--------
Page 3:
                                                      SEASONALITY OF ROTAVIRUS DISEASE   1489   posi
--------
Page 4:
  1490   INTERNATIONAL JOURNAL OF EPIDEMIOLOGY                                in                    
--------
Page 5:
                                                      SEASONALITY OF ROTAVIRUS DISEASE   1491       
--------
Page 6:
1492   INTERNATIONAL JOURNAL OF EPIDEMIOLOGY   Table 2 Results of pooled analyses                   
--------
Page 7:
                                                      SEASONALITY OF ROTAVIRUS DISEASE   1493       
--------
Page 8:
1494   INTERNATIONAL JOURNAL OF EPIDEMIOLOGY   symptoms of upper respiratory tract infection before 
--------
Page 9:
                                                

## Using PyMuPDF Directly

In [20]:
import pymupdf4llm

# Convert the example paper to Markdown with PyMuPDF4LLM.
# Table Strategies: https://pymupdf.readthedocs.io/en/latest/page.html#Page.find_tables
to_markdown_options = {
    "page_chunks": False,
    "table_strategy": "lines",
    "embed_images": False,
}
md_text = pymupdf4llm.to_markdown(test_paper, **to_markdown_options)

# Store the Markdown on disk as UTF-8 bytes; being the cell's last expression,
# the number of bytes written is what the notebook displays.
output_path = Path("output.md")
output_path.write_bytes(md_text.encode("utf-8"))

36245

In [21]:
# Character count next to the Markdown itself. In the recorded run this count
# is lower than the number of bytes written to output.md, because the text was
# encoded to UTF-8 bytes before writing.
(len(md_text), md_text)

(36044,
 'Published by Oxford University Press on behalf of the International Epidemiological Association International Journal of Epidemiology 2009;38:1487–1496\n\n� The Author 2008; all rights reserved. Advance Access publication 4 December 2008 doi:10.1093/ije/dyn260\n### SYSTEMATIC REVIEWS\n# Seasonality of rotavirus disease in the tropics: a systematic review and meta-analysis\n\nKaren Levy, [1,2] - Alan E Hubbard [3] and Joseph NS Eisenberg [2]\n\nAccepted 4 November 2008\n\nBackground To date little conclusive evidence exists on the seasonality of rotavirus\nincidence in the tropics. We present a systematic review and metaanalysis on the seasonal epidemiology of rotavirus in the tropics,\nincluding 26 studies reporting continuous monthly rotavirus incidence for which corresponding climatological data was available.\n\nMethods Using linear regression models that account for serial correlation\nbetween months, monthly rotavirus incidence was significantly\nnegatively correlated wi

## Using an LLM to Extract Claims

In [57]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

# Prompt used to extract claims from one chunk of text. The chunk is injected
# through the `text_chunk` placeholder and the model is asked to answer with
# one literal claim per line and nothing else.
template = """Here is a paragraph with some information : {text_chunk}
This paragraph has one or more claims inside it. Provide me with a list of the claims in the paragraph.
The response should only be one claim per line, no other text.
Each claim should be a precise sentence pointing to a fact. 
Stick as much as possible to the literal text.
Do not infer claims that are not explicitly stated in the text.
Each claim in the list should be separated by a new line and not contain any other text or number.

Claims: 

"""    

# Build the chain: the filled-in prompt is piped into the Ollama-served
# "llama3.2:latest" model, so `chain.invoke({"text_chunk": ...})` runs both steps.
# NOTE(review): no temperature or seed is configured here, so the extracted
# claims may differ between runs — confirm whether reproducibility is required.
prompt = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model="llama3.2:latest")
chain = prompt | model


In [58]:
def get_paragraphs(text):
    """
    Split text into paragraphs, treating every newline-separated line as one paragraph.

    Leading and trailing whitespace is ignored when classifying a line, so
    blank or whitespace-only lines (common in padded PDF text) are dropped and
    indented "Question" / "References" lines are still recognised.

    Args:
        text (str): Raw text, e.g. a PaperQA answer or the text of a PDF page.

    Returns:
        list[str]: The stripped lines that are longer than one character, in
        their original order. Lines starting with "Question" are skipped, and
        the first line starting with "References" ends the scan, so that line
        and everything after it are left out.
    """
    paragraphs = []
    for line in text.split('\n'):
        paragraph = line.strip()
        # Measure the stripped line: a run of spaces is not a paragraph.
        if len(paragraph) <= 1 or paragraph.startswith("Question"):
            continue
        # The reference list marks the end of the useful content.
        if paragraph.startswith("References"):
            break
        paragraphs.append(paragraph)
    return paragraphs


# Sample PaperQA answer used to exercise get_paragraphs: a "Question" line,
# four answer paragraphs with inline citations, and a "References" section.
example_paperQA_output = """
Question: How does the seasonality of rotavirus differ between tropical and temperate climates?

The seasonality of rotavirus differs between tropical and temperate climates. In temperate zones, rotavirus is more common in cooler months, with a strong winter peak observed primarily in the Americas (Seasonality of Rotavirus in South Asia_ A Meta-Analysis Approach Assessing Associations with Temperature_ Precipitation_ and Vegetation Index.pdf pages 1-2). However, in tropical regions, the pattern is less defined, and autumn/spring peaks are more common.

In tropical climates, rotavirus incidence responds to changes in climate, with the highest number of infections found at the colder and drier times of the year (levy2009seasonalityofrotavirus pages 1-1). Monthly rotavirus incidence is significantly negatively correlated with temperature, rainfall, and relative humidity in the majority of studies reviewed (levy2009seasonalityofrotavirus pages 8-8).

In contrast to temperate areas, where rotavirus incidence often goes to zero in some months, tropical regions experience year-round rotavirus activity with peaks and valleys (levy2009seasonalityofrotavirus pages 6-6). The effect of seasonal changes on rotavirus incidence is not as extreme in the tropics as it is in temperate areas. Less climatic variability exists in tropical climates, which may explain why variations in climatological variables are not large enough to cause the observed effect (levy2009seasonalityofrotavirus pages 6-6).

Overall, the seasonality of rotavirus disease in tropical countries differs from that observed in temperate zones, with tropical regions experiencing year-round activity and responding to changes in climate (levy2009seasonalityofrotavirus pages 8-8).

References

1. (Seasonality of Rotavirus in South Asia_ A Meta-Analysis Approach Assessing Associations with Temperature_ Precipitation_ and Vegetation Index.pdf pages 1-2): Jagai, Jyotsna S., et al. "Seasonality of Rotavirus in South Asia: A Meta-Analysis Approach Assessing Associations with Temperature, Precipitation, and Vegetation Index." PLoS ONE, vol. 7, no. 5, 2012, doi:10.1371/journal.pone.0038168.

2. (levy2009seasonalityofrotavirus pages 1-1): K. Levy, A. E Hubbard, and J. N. Eisenberg. Seasonality of rotavirus disease in the tropics: a systematic review and meta-analysis. International journal of epidemiology, 38 6:1487-96, Dec 2009. URL: https://doi.org/10.1093/ije/dyn260, doi:10.1093/ije/dyn260. This article has 265 citations and is from a highest quality peer-reviewed journal.

3. (levy2009seasonalityofrotavirus pages 6-6): K. Levy, A. E Hubbard, and J. N. Eisenberg. Seasonality of rotavirus disease in the tropics: a systematic review and meta-analysis. International journal of epidemiology, 38 6:1487-96, Dec 2009. URL: https://doi.org/10.1093/ije/dyn260, doi:10.1093/ije/dyn260. This article has 265 citations and is from a highest quality peer-reviewed journal.

4. (levy2009seasonalityofrotavirus pages 8-8): K. Levy, A. E Hubbard, and J. N. Eisenberg. Seasonality of rotavirus disease in the tropics: a systematic review and meta-analysis. International journal of epidemiology, 38 6:1487-96, Dec 2009. URL: https://doi.org/10.1093/ije/dyn260, doi:10.1093/ije/dyn260. This article has 265 citations and is from a highest quality peer-reviewed journal.

"""
# Split the sample answer into its paragraphs.
text_chunks = get_paragraphs(example_paperQA_output)
# Sanity check: the question line and the reference list must be excluded,
# leaving exactly the four answer paragraphs.
assert len(text_chunks) == 4

In [59]:
# Ask the model for the claims in each answer paragraph; one raw response
# string is collected per chunk, in the same order as `text_chunks`.
claims = []
for text in text_chunks:
    raw_response = chain.invoke({"text_chunk": text})
    claims.append(raw_response)
claims

['In temperate zones, rotavirus is more common in cooler months.\n\nThe seasonality of rotavirus differs between tropical and temperate climates.\n\nRotavirus is more common primarily in the Americas during winter.\n\nIn tropical regions, the pattern of rotavirus seasonality is less defined.\n\nAutumn/spring peaks are more common in tropical regions than in temperate zones.',
 'In tropical climates, rotavirus incidence responds to changes in climate.\n\nThe highest number of infections found at the colder and drier times of the year.\n\nMonthly rotavirus incidence is significantly negatively correlated with temperature.\n\nMonthly rotavirus incidence is significantly negatively correlated with rainfall.\n\nMonthly rotavirus incidence is significantly negatively correlated with relative humidity.',
 'In tropical regions, rotavirus incidence occurs year-round.\n\nRotavirus incidence peaks and valleys occur in tropical regions.\n\nThe effect of seasonal changes on rotavirus incidence is n

In [60]:
def format_claim(claim_str):
    """
    Split a raw model response into a list of individual claims.

    The prompt asks for one claim per line, so the response is split on line
    boundaries (single or blank-line separated alike). Each claim is stripped
    of surrounding whitespace and empty lines are discarded, so an empty
    response yields an empty list rather than a list with one empty claim.

    Args:
        claim_str (str): Raw text returned by the model.

    Returns:
        list[str]: The non-empty claims in their original order. An empty list
        is returned (and a message printed) when `claim_str` is not a string.
    """
    if not isinstance(claim_str, str):
        # Best-effort behaviour kept from the original: report and carry on.
        print(f"Error splitting claims: expected str, got {type(claim_str).__name__}")
        return []
    stripped_lines = (line.strip() for line in claim_str.splitlines())
    return [claim for claim in stripped_lines if claim]
    

# Turn every raw model response into a list of individual claims
# (one list per text chunk, in the same order as `text_chunks`).
claim_list = [format_claim(raw_claims) for raw_claims in claims]

# Print each chunk followed by the claims extracted from it. The loop variable
# is deliberately not called `claims`: reusing that name would overwrite the
# list of raw responses above and make this cell give different results when
# it is run a second time.
for chunk, chunk_claims in zip(text_chunks, claim_list):
    print(f"Text: {chunk}\n")
    for claim in chunk_claims:
        print(claim)
    print("--------")

Text: The seasonality of rotavirus differs between tropical and temperate climates. In temperate zones, rotavirus is more common in cooler months, with a strong winter peak observed primarily in the Americas (Seasonality of Rotavirus in South Asia_ A Meta-Analysis Approach Assessing Associations with Temperature_ Precipitation_ and Vegetation Index.pdf pages 1-2). However, in tropical regions, the pattern is less defined, and autumn/spring peaks are more common.

In temperate zones, rotavirus is more common in cooler months.
The seasonality of rotavirus differs between tropical and temperate climates.
Rotavirus is more common primarily in the Americas during winter.
In tropical regions, the pattern of rotavirus seasonality is less defined.
Autumn/spring peaks are more common in tropical regions than in temperate zones.
--------
Text: In tropical climates, rotavirus incidence responds to changes in climate, with the highest number of infections found at the colder and drier times of the

In [61]:
import json
# Maps "<page>_<paragraph index>" to the paragraph text (first 500 characters)
# and the claims extracted from it.
paper_claims = {}

for page_number, page_content in parsed_text.content.items():
    page_paragraphs = get_paragraphs(page_content)
    print(f"----- Found {len(page_paragraphs)} paragraphs in page {page_number} -----")
    for pi, paragraph in enumerate(page_paragraphs):
        if len(paragraph.split()) >= 10:
            claims = chain.invoke({"text_chunk": paragraph})
            claim_list = format_claim(claims)
        else:
            # Fewer than 10 words: recorded without calling the model.
            claim_list = []
        print(f"Page {page_number} - Paragraph {pi} has {len(claim_list)} claims")
        entry_key = f"{page_number}_{pi}"
        paper_claims[entry_key] = {"text": paragraph[:500], "claims": claim_list}

# Save the claims to a JSON file
with open("paper_claims.json", "w") as f:
    f.write(json.dumps(paper_claims, indent=4))


----- Found 48 paragraphs in page 1 -----
Page 1 - Paragraph 0 has 1 claims
Page 1 - Paragraph 1 has 1 claims
Page 1 - Paragraph 2 has 1 claims
Page 1 - Paragraph 3 has 1 claims
Page 1 - Paragraph 4 has 1 claims
Page 1 - Paragraph 5 has 3 claims
Page 1 - Paragraph 6 has 1 claims
Page 1 - Paragraph 7 has 2 claims
Page 1 - Paragraph 8 has 1 claims
Page 1 - Paragraph 9 has 1 claims
Page 1 - Paragraph 10 has 1 claims
Page 1 - Paragraph 11 has 2 claims
Page 1 - Paragraph 12 has 3 claims
Page 1 - Paragraph 13 has 1 claims
Page 1 - Paragraph 14 has 2 claims
Page 1 - Paragraph 15 has 1 claims
Page 1 - Paragraph 16 has 1 claims
Page 1 - Paragraph 17 has 1 claims
Page 1 - Paragraph 18 has 1 claims
Page 1 - Paragraph 19 has 4 claims
Page 1 - Paragraph 20 has 1 claims
Page 1 - Paragraph 21 has 2 claims
Page 1 - Paragraph 22 has 2 claims
Page 1 - Paragraph 23 has 1 claims
Page 1 - Paragraph 24 has 2 claims
Page 1 - Paragraph 25 has 1 claims
Page 1 - Paragraph 26 has 1 claims
Page 1 - Paragraph 27 h