In [1]:
import os
import requests

pdf_path = "human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
    print("[INFO] file doesn't exists and downloading starts!")

    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    response = requests.get(url)
    if response.status_code == 200:
        with open(pdf_path, "wb") as file:
            file.write(response.content)

        print(f"[INFO] the file has been downloaded and saved as {pdf_path}")
    else:
        print(f"[INFO] Failed to download the file. Status code: {response.status_code}")

In [2]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    """Performs minor formating on the text."""
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 41, 
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4, 
                                "text": text})
    return pages_and_texts
pages_and_texts = open_and_read_pdf(pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [3]:
import random

random.sample(pages_and_texts, k=1)

[{'page_number': 961,
  'page_char_count': 1384,
  'page_word_count': 233,
  'page_sentence_count_raw': 11,
  'page_token_count': 346.0,
  'text': 'Macronutrient Needs  The composition of macronutrients in the diet is a key factor in  maximizing performance for athletes. Carbohydrates are an  important fuel source for the brain and muscle during exercise.  \xa0Carbohydrate storage in the liver and muscle cells are relatively  limited and therefore it is important for athletes to consume enough  carbohydrates from their diet. Carbohydrate needs should increase  about 3-10 g/kg/day depending on the type of training or  competition.3 See Table 16.1 “Daily Needs for Carbohydrate Fuel” for  carbohydrate needs for athletes depending on the intensity of the  exercise.  Table 16.1 Daily Needs for Carbohydrate Fuel  Activity  Level  Example of Exercise  Increase of  Carbohydrate (g/kg of  athlete’s body weight/ day)  Light  Low intensity or skill based  activities  3-5  Moderate Moderate exerci

In [4]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [5]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0
std,348.86,560.38,95.76,6.19,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,4.0,190.5
50%,562.5,1231.5,214.5,10.0,307.88
75%,864.25,1603.5,271.0,14.0,400.88
max,1166.0,2308.0,429.0,32.0,577.0


In [7]:
from spacy.lang.en import English

nlp = English()

nlp.add_pipe("sentencizer")

doc = nlp("this is the first sentence. this is the second one.")

assert len(list(doc.sents)) == 2
list(doc.sents)

[this is the first sentence., this is the second one.]

In [8]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)

    item["sentences"] =  [str(sentence) for sentence in item["sentences"]]
    item["page_sentence_count_space"] = len(item["sentences"])
    

  0%|          | 0/1208 [00:00<?, ?it/s]

In [9]:
random.sample(pages_and_texts, k=1)

[{'page_number': 951,
  'page_char_count': 1274,
  'page_word_count': 239,
  'page_sentence_count_raw': 10,
  'page_token_count': 318.5,
  'text': 'Image by  Allison  Calabrese /  CC BY 4.0  Physical Activity Duration and Fuel Use  The respiratory system plays a vital role in the uptake and delivery  of oxygen to muscle cells throughout the body. Oxygen is inhaled  by the lungs and transferred from the lungs to the blood where  the cardiovascular system circulates the oxygen-rich blood to the  muscles. \xa0The oxygen is then taken up by the muscles and can be  used to generate ATP. When the body is at rest, the heart and  lungs are able to supply the muscles with adequate amounts of  oxygen to meet the aerobic metabolism energy needs. However,  during physical activity your muscles energy and oxygen needs are  increased. In order to provide more oxygen to the muscle cells, your  heart rate and breathing rate will increase. The amount of oxygen  that is delivered to the tissues via the 

In [10]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_space
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32
std,348.86,560.38,95.76,6.19,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0


In [13]:
num_sentence_chunk_size = 10

def split_list(input_list: list, 
               slice_size: int=num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i: i+slice_size] for i in range(0, len(input_list), slice_size)]

split_list(list(range(25)))

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [None]:
for item in tqdm(pages_and_texts):
    item['sentences_chunks'] = split_list(item["sentences"], 
                                          num_sentence_chunk_size)
    item['num_chunks'] = len(item["sentences_chunks"])
    