In [1]:
import os
import requests

pdf_path = "path-book.pdf"

if not os.path.exists(pdf_path):
  print(f"[INFO] File Doesnt Exist, Downloading....")

  url = 'https://pressbooks.oer.hawaii.edu/humannutrition2/'
  filename = pdf_path

  response = requests.get(url)

  if response.status_code == 200:
    with open(filename, "wb") as file:
          file.write(response.content)
    print(f"The file has been downloaded and saved as {filename}")
  else:
      print(f"Failed to download the file. Status code: {response.status_code}")

else:
  print(f"File {pdf_path} exists.")

[INFO] File Doesnt Exist, Downloading....
The file has been downloaded and saved as path-book.pdf


In [None]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip()

    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries, each containing the page number
        (adjusted), character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = fitz.open(pdf_path)  
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  
        text = page.get_text()  
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 41,  
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

  from .autonotebook import tqdm as notebook_tqdm
32it [00:00, 1030.49it/s]


[{'page_number': -41,
  'page_char_count': 100,
  'page_word_count': 23,
  'page_sentence_count_raw': 1,
  'page_token_count': 25.0,
  'text': 'Skip to content [image] Toggle Menu Primary Navigation •  Home •  Read •  Sign in •  Search in book:'},
 {'page_number': -40,
  'page_char_count': 145,
  'page_word_count': 23,
  'page_sentence_count_raw': 2,
  'page_token_count': 36.25,
  'text': 'Search Want to create or adapt books like this? Learn more about how Pressbooks supports open publishing practices.  Book Title: Human Nutrition:'}]

In [3]:
import random

random.sample(pages_and_texts, k=3)

[{'page_number': -26,
  'page_char_count': 907,
  'page_word_count': 134,
  'page_sentence_count_raw': 11,
  'page_token_count': 226.75,
  'text': '2. The Atom University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 3. Weight Management University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 4. Factors Affecting Energy Intake University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 5. Factors Affecting Energy Expenditure University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 6. Dietary, Behavioral, and Physical Activity Recommendations for Weight Management University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 12. IX. Chapter 9. Vitamins 1. Introduction University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 2. Fat-Soluble V

In [4]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,100,23,1,25.0,Skip to content [image] Toggle Menu Primary Na...
1,-40,145,23,2,36.25,Search Want to create or adapt books like this...
2,-39,574,93,3,143.5,2020 Edition Authors: University of Hawai‘i at...
3,-38,0,1,1,0.0,
4,-37,38,5,1,9.5,Creative Commons Attribution Read Book


In [5]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,32.0,32.0,32.0,32.0,32.0
mean,-25.5,618.81,93.91,7.09,154.7
std,9.38,371.78,56.18,4.62,92.94
min,-41.0,0.0,1.0,1.0,0.0
25%,-33.25,133.75,23.0,1.75,33.44
50%,-25.5,838.5,126.5,9.5,209.62
75%,-17.75,907.0,137.25,11.0,226.75
max,-10.0,961.0,150.0,13.0,240.25


In [6]:
from spacy.lang.en import English 

nlp = English()

nlp.add_pipe("sentencizer")

# Create a document instance as an example
doc = nlp("This is a sentence. This another sentence.")
assert len(list(doc.sents)) == 2


list(doc.sents)

[This is a sentence., This another sentence.]

In [7]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

100%|██████████| 32/32 [00:00<00:00, 1483.54it/s]


In [8]:
random.sample(pages_and_texts, k=1)

[{'page_number': -21,
  'page_char_count': 880,
  'page_word_count': 131,
  'page_sentence_count_raw': 13,
  'page_token_count': 220.0,
  'text': 'Program 7. Understanding the Bigger Picture of Dietary Guidelines University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 16. XIII. Chapter 13. Lifespan Nutrition From Pregnancy to the Toddler Years 1. Introduction University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 2. Pregnancy University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 3. Infancy University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 4. Toddler Years University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 17. XIV. Chapter 14. Lifespan Nutrition During Childhood and Adolescence 1. Introduction University of Hawai‘i at Mānoa Food Science and Human Nutrition 

In [9]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,32.0,32.0,32.0,32.0,32.0,32.0
mean,-25.5,618.81,93.91,7.09,154.7,7.03
std,9.38,371.78,56.18,4.62,92.94,4.6
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,-33.25,133.75,23.0,1.75,33.44,1.75
50%,-25.5,838.5,126.5,9.5,209.62,9.0
75%,-17.75,907.0,137.25,11.0,226.75,11.0
max,-10.0,961.0,150.0,13.0,240.25,13.0
