In [17]:
import torch

### Import PDF Document.

In [18]:
import os
import requests

# Get PDF Document Path
pdf_path = "human-nutrition-text.pdf"

# Download PDF
if not os.path.exists(pdf_path):
    print("[INFO] File doesn't exist, downloading...")

    # Enter URL of the pdf
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    # The local file name to save the downloaded file
    filename = pdf_path

    # Send a GET request to url
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Open the file and save it
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"[INFO] The file has been downloaded and saved as {filename}")
    else:
        print(f"[INFO] Failed to downlaod the file. Status code: {response.status_code}")


else:
    print(f"File {pdf_path} exists.")

File human-nutrition-text.pdf exists.


In [19]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text:str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip()

    # Potentially more text formatting functions can go here.
    return cleaned_text


def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({"page_number": page_number - 41,
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4, # 1 token = ~4 characters
                                "text": text})
        
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

1208it [00:02, 499.60it/s]


[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [20]:
import random

random.sample(pages_and_texts, k=3)

[{'page_number': 76,
  'page_char_count': 1222,
  'page_word_count': 205,
  'page_sentence_count_raw': 10,
  'page_token_count': 305.5,
  'text': '“Histology  Small  Intestines” by  OpenStax  College / CC  BY 3.0  area to maximize nutrient absorption. The surface area is increased  by folds, villi, and microvilli. Digested nutrients are absorbed into  either capillaries or lymphatic vessels contained within each  microvillus.  The small intestine is perfectly structured for maximizing  nutrient absorption. Its surface area is greater than 200 square  meters, which is about the size of a tennis court. The large surface  area is due to the multiple levels of folding. The internal tissue  of the small intestine is covered in villi, which are tiny finger-like  projections that are covered with even smaller projections, called  microvilli (Figure 2.8 “Structure of the Small Intestine”). The  digested nutrients pass through the absorptive cells of the intestine  via diffusion or special tran

In [21]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [22]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0
std,348.86,560.38,95.76,6.19,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,4.0,190.5
50%,562.5,1231.5,214.5,10.0,307.88
75%,864.25,1603.5,271.0,14.0,400.88
max,1166.0,2308.0,429.0,32.0,577.0


### Further text processing (splitting pages into sentances)
Two ways to do this:

1. We've done this by splitting on `. `.
2. We can do this with a NLP library such as spaCy and nltk.

In [23]:
from spacy.lang.en import English

nlp = English()

# Add a sentencizer pipeline
nlp.add_pipe("sentencizer")

# Create documemt instance as an example.
doc = nlp("This is a sentence. This another sentence. I like elephants.")
assert len(list(doc.sents)) == 3

# Print out our sentences split
list(doc.sents)

[This is a sentence., This another sentence., I like elephants.]

In [24]:
pages_and_texts[600]

{'page_number': 559,
 'page_char_count': 863,
 'page_word_count': 136,
 'page_sentence_count_raw': 8,
 'page_token_count': 215.75,
 'text': 'Image by  Allison  Calabrese /  CC BY 4.0  Korsakoff syndrome can cause similar symptoms as beriberi such  as confusion, loss of coordination, vision changes, hallucinations,  and may progress to coma and death. This condition is specific  to alcoholics as diets high in alcohol can cause thiamin deficiency.  Other individuals at risk include individuals who also consume diets  typically low in micronutrients such as those with eating disorders,  elderly, and individuals who have gone through gastric bypass  surgery.5  Figure 9.10 The Role of Thiamin  Figure 9.11 Beriberi, Thiamin Deficiency  5.\xa0Fact Sheets for Health Professionals: Thiamin. National  Institute of Health, Office of Dietary Supplements.  \xa0https://ods.od.nih.gov/factsheets/Thiamin- HealthProfessional/. Updated Feburary 11, 2016.  Accessed October 22, 2017.  Water-Soluble Vitami

In [25]:
for item in tqdm(pages_and_texts):
    item['sentences'] = list(nlp(item['text']).sents)

    # Make sure all sentences are strings (the default type is spacy datatype)
    item['sentences'] = [str(sentence) for sentence in item['sentences']]

    # Count the sentences
    item['page_sentence_count_spacy'] = len(item['sentences'])

100%|██████████| 1208/1208 [00:03<00:00, 320.98it/s]


In [26]:
random.sample(pages_and_texts, k=1)

[{'page_number': 631,
  'page_char_count': 1458,
  'page_word_count': 251,
  'page_sentence_count_raw': 10,
  'page_token_count': 364.5,
  'text': 'Phosphorus  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  Phosphorus’s Functional Role  Phosphorus is present in our bodies as part of a chemical group  called a phosphate group. These phosphate groups are essential  as a structural component of cell membranes (as phospholipids),  DNA and RNA, energy production (ATP), and regulation of acid- base homeostasis. Phosphorus however is mostly associated with  calcium as a part of the mineral structure of bones and teeth. \xa0Blood  phosphorus levels are not controlled as strictly as calcium so the  PTH stimulates renal excretion of phosphate so that it does not  accumulate to toxic levels.  Dietary Reference Intakes for Phosphorus  In comparison to calcium, most Americans are not at risk for having  a phosphate deficiency. Phosphate is pres

In [27]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32
std,348.86,560.38,95.76,6.19,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0


In [30]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10

# Create a funtion to split lists of texts recursively into chunk size
# e.g. [20] -> [10, 10] or [25] -> [10, 10, 5]
def split_list(input_list: list[str],
               splice_size: int=num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i+splice_size] for i in range(0, len(input_list), splice_size)]

test_list = list(range(25))
split_list(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [31]:
# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         splice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 1208/1208 [00:00<00:00, 99886.04it/s]


In [41]:
random.sample(pages_and_texts, k=1)

[{'page_number': 837,
  'page_char_count': 1854,
  'page_word_count': 328,
  'page_sentence_count_raw': 21,
  'page_token_count': 463.5,
  'text': 'Breast Milk  Bottle Formula  Antibodies and lactoferrin in breast  milk protect infants.  Formula does not contain  immunoprotective factors.  The iron in breast milk is absorbed  more easily. Because the iron is bound  to lactoferrin, it is not available for  bacteria in the gut\xa0 to use as a growth  factor.  Formula contains more iron  than breast milk, but it is not  absorbed as easily, and the iron  is a growth factor for pathogenic  microbes.  The feces that babies produce do not  smell because breastfed infants have  different bacteria in the gut.  The feces that bottle-fed infants  produce tends to have a  foul-smelling odor.  Breast milk is always available and is  always at the correct temperature.  Formula must be prepared,  refrigerated for storage, and  warmed before it is given to an  infant.  Breastfed infants are less likel

In [42]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32,1.53
std,348.86,560.38,95.76,6.19,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0,1.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0,1.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


In [47]:
import re

# Split each into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # Join the sentences together into a paragraph like structure, aka join the list of sentences into one paragraph
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" => ". A" (will work for any capital letter)

        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get some stats on our chunks
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4   # 1 token = ~4 chars

        pages_and_chunks.append(chunk_dict)


len(pages_and_chunks)


100%|██████████| 1208/1208 [00:00<00:00, 17681.92it/s]


1843

In [50]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 671,
  'sentence_chunk': '2003). Zinc deficiency.\xa0British Medical Journal,\xa0326(7386), 409–410.doi: 10.1136/ bmj.326.7386.409. Accessed October 2, 2011. http://www.ncbi.nlm.nih.gov/pmc/articles/ PMC1125304/?tool=pmcentrez. Zinc | 671',
  'chunk_char_count': 206,
  'chunk_word_count': 17,
  'chunk_token_count': 51.5}]

In [51]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.44,112.33,183.61
std,347.79,447.54,71.22,111.89
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,44.0,78.75
50%,586.0,746.0,114.0,186.5
75%,890.0,1118.5,173.0,279.62
max,1166.0,1831.0,297.0,457.75


In [52]:
df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,-41,Human Nutrition: 2020 Edition,29,4,7.25
1,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0
2,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5
3,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.5
4,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25


### Filter chunks of text for short chunks

These chunks may not contain much useful information.

In [54]:
# Show random chunks with under 30 tokens in length
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 24.25 | Text: These activities are available in the web-based textbook and not available in the Magnesium | 643
Chunk token count: 16.5 | Text: Table 4.6 Sweeteners Carbohydrates and Personal Diet Choices | 281
Chunk token count: 11.25 | Text: Accessed March 17, 2011. 212 | Water Concerns
Chunk token count: 28.75 | Text: Bouayed, J. and T. Bohn. (2010). Exogenous Antioxidants—Double-Edged Swords in Cellular Redox MyPlate Planner | 753
Chunk token count: 13.25 | Text: https://doi.org/10.1186/ 1743-7075-4-24. Sulfur | 637


In [55]:
# Filter out dataframe for rows with under 30 tokens
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

In [57]:
random.sample(pages_and_chunks_over_min_token_len, k=1)

[{'page_number': 1073,
  'sentence_chunk': 'Image by Jennifer Draper / CC BY 4.0 Image by CDC / Unsplash License Intuitive Eating Intuitive eating is a non-diet approach to eating that promotes a Calories In Versus Calories Out | 1073',
  'chunk_char_count': 190,
  'chunk_word_count': 34,
  'chunk_token_count': 47.5}]

In [64]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cpu")


# Create a list of sentences
sentences = ["The sentence transformer library provides an easy way to create embeddings.",
             "Sentences can be embedded one by one or in a list.",
             "I like dogs!"]

# Sentences are encoded/embedded by calling model.encode()
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print(f"Sentence: {sentence}")
    print(f"Embedding: {embedding}")
    print("")

RuntimeError: Numpy is not available

In [69]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())


2.5.1+cpu
False


In [70]:
import numpy as np
import torch

print("NumPy Version:", np.__version__)
print("Torch Version:", torch.__version__)


NumPy Version: 2.1.3
Torch Version: 2.5.1+cpu


In [74]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-mpnet-base-v2", device="cpu")
sentences = ["This is a test sentence."]
embedding = model.encode(sentences)
print("Embedding generated successfully:", embedding)


RuntimeError: Numpy is not available