In [1]:
import torch

In [2]:
!nvidia-smi

Mon Sep 23 17:27:12 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 552.44                 Driver Version: 552.44         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1650      WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   46C    P8              1W /   30W |       0MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import os 
import requests 

pdf_path = "human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
    print(f"[INFO] File doesn't exist downloading")

    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
    filename = pdf_path
    
    response = requests.get(url)

    if response.status_code == 200:
        with open(filename,'wb') as file:
            file.write(response.content)
        
        print(f"[INFO] the file has been downloaded and saved as {filename}")
    
    else:
        print(f"[INFO] failed to download the file.Status code : {response.status_code}" )

else:
    print("File exists")

File exists


In [4]:
import fitz 
from tqdm import tqdm

def text_formatter(text : str) -> str:
    """Performs minor formatiing on text"""
    cleaned_text = text.replace("\n", " ").strip()
    
    return cleaned_text

def open_and_read_pdf(pdf_path : str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number , page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({"page_number" : page_number - 41,
                                "page_char_count" : len(text),
                                "page_word_count" : len(text.split(" ")),
                                "page_sentence_count" : len(text.split(" ")),
                                "page_sentence_count_raw" : len(text.split(". ")),
                                "page_token_counts" : len(text)/4,
                                "text" : text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]
        

1208it [00:01, 736.30it/s]


[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_counts': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_counts': 0.0,
  'text': ''}]

In [5]:
import random 
random.sample(pages_and_texts,k=3)

[{'page_number': 1077,
  'page_char_count': 1949,
  'page_word_count': 322,
  'page_sentence_count': 322,
  'page_sentence_count_raw': 20,
  'page_token_counts': 487.25,
  'text': 'esophagus and cause irritation. It is estimated that GERD affects 25  to 35 percent of the US population. An analysis of several studies  published in the August 2005 issue of Annals of Internal Medicine  concludes that GERD is much more prevalent in people who are  obese.1 The most common GERD symptom is heartburn, but people  with GERD may also experience regurgitation (flow of the stomach’s  acidic contents into the mouth), frequent coughing, and trouble  swallowing.  There are other causative factors of GERD that may be separate  from or intertwined with obesity. The sphincter that separates the  stomach’s internal contents from the esophagus often does not  function properly and acidic gastric contents seep upward.  Sometimes the peristaltic contractions of the esophagus are also  sluggish and compromis

In [6]:
import pandas as pd 
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_sentence_count_raw,page_token_counts,text
0,-41,29,4,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,1,0.0,
2,-39,320,54,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,147,147,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [7]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_sentence_count_raw,page_token_counts
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,199.5,10.52,287.0
std,348.86,560.38,95.83,95.83,6.55,140.1
min,-41.0,0.0,1.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,134.0,5.0,190.5
50%,562.5,1231.5,216.0,216.0,10.0,307.88
75%,864.25,1603.5,272.0,272.0,15.0,400.88
max,1166.0,2308.0,430.0,430.0,39.0,577.0


In [8]:
from spacy.lang.en import English

nlp = English()

# Add a sentencizer
nlp.add_pipe("sentencizer")

doc = nlp("This is a sentence. How are you doing. I like the open space")
assert len(list(doc.sents)) == 3

list(doc.sents)

[This is a sentence., How are you doing., I like the open space]

In [9]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    item["page_sentence_count_spacy"] = len(item["sentences"])
    

  0%|          | 0/1208 [00:00<?, ?it/s]

100%|██████████| 1208/1208 [00:03<00:00, 370.97it/s]


In [10]:
random.sample(pages_and_texts,k=1)

[{'page_number': -35,
  'page_char_count': 1037,
  'page_word_count': 191,
  'page_sentence_count': 191,
  'page_sentence_count_raw': 1,
  'page_token_counts': 259.25,
  'text': 'The Cardiovascular System  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  82  Central Nervous System  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  94  The Respiratory System  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  99  The Endocrine System  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  106  The Urinary System  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  110  The Muscular System  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  117  The Skeletal System  University of Haw

In [11]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_sentence_count_raw,page_token_counts,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,199.5,10.52,287.0,10.32
std,348.86,560.38,95.83,95.83,6.55,140.1,6.3
min,-41.0,0.0,1.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,134.0,5.0,190.5,5.0
50%,562.5,1231.5,216.0,216.0,10.0,307.88,10.0
75%,864.25,1603.5,272.0,272.0,15.0,400.88,15.0
max,1166.0,2308.0,430.0,430.0,39.0,577.0,28.0


In [12]:
num_sentence_chunk_size = 10

# create a function to split lists of texts recursively into chunk size

def split_list(input_list : list[str], slice_size : int = num_sentence_chunk_size) -> list[list[str]]:
    
    return [input_list[i:i+slice_size] for i in range(0,len(input_list),slice_size)]

test_list = list(range(25))
split_list(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [13]:
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 1208/1208 [00:00<00:00, 403465.46it/s]


In [14]:
random.sample(pages_and_texts,k=3)

[{'page_number': 935,
  'page_char_count': 956,
  'page_word_count': 166,
  'page_sentence_count': 166,
  'page_sentence_count_raw': 7,
  'page_token_counts': 239.0,
  'text': 'due to the overload principle that our bodies will adapt to with  continuous repetition. For example, if you run a mile everyday for  a week, in a few weeks you would be able to run further and likely  faster.  Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.  These activities are  available in the web-based textbook and not available in the  downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.    An interactive or media element has been  excluded from this

In [15]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(1)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_sentence_count_raw,page_token_counts,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,199.5,10.5,287.0,10.3,1.5
std,348.9,560.4,95.8,95.8,6.5,140.1,6.3,0.6
min,-41.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,260.8,762.0,134.0,134.0,5.0,190.5,5.0,1.0
50%,562.5,1231.5,216.0,216.0,10.0,307.9,10.0,1.0
75%,864.2,1603.5,272.0,272.0,15.0,400.9,15.0,2.0
max,1166.0,2308.0,430.0,430.0,39.0,577.0,28.0,3.0


In [29]:
import re 

pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        joined_sentence_chunk = "".join(sentence_chunk).replace("  "," ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])',r'. \1', joined_sentence_chunk)
        # joined_sentence_chunk = 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4
        
        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

100%|██████████| 1208/1208 [00:00<00:00, 21141.46it/s]


1843

In [30]:
random.sample(pages_and_chunks,k=1)

[{'page_number': 602,
  'sentence_chunk': 'Cruciferous Vegetables and Human Cancer Risk: Epidemiologic Evidence and Mechanistic Basis. Pharmacological Research : The Official Journal of the Italian Pharmacological Society, 55(3), 224–236. https://doi.org/10.1016/j.phrs.2007.01.009 Kozłowska, A., & Szostak-Wegierek, D. (2014). Flavonoids—Food sources and health benefits. Roczniki Panstwowego Zakladu Higieny, 65(2), 79–85. Patisaul, H. B., & Jefferson, W. (2010). The pros and cons of phytoestrogens. Frontiers in Neuroendocrinology, 31(4), 400–419. https://doi.org/10.1016/j.yfrne.2010.03.003 Phenolic Acids—An overview | ScienceDirect Topics. (n.d.).',
  'chunk_char_count': 611,
  'chunk_word_count': 68,
  'chunk_token_count': 152.75}]

In [31]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(1)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.4,734.1,112.7,183.5
std,347.8,447.5,71.2,111.9
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,45.0,78.8
50%,586.0,745.0,115.0,186.2
75%,890.0,1118.0,173.0,279.5
max,1166.0,1830.0,297.0,457.5


In [32]:
df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,-41,Human Nutrition: 2020 Edition,29,4,7.25
1,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0
2,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5
3,-37,Contents Preface University of Hawai‘i at Māno...,766,116,191.5
4,-36,Lifestyles and Nutrition University of Hawai‘i...,941,144,235.25


In [35]:
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f"Chunks token count : {row[1]["chunk_token_count"]} | Text : {row[1]["sentence_chunk"]}")

Chunks token count : 9.0 | Text : 1088 | Nutrition, Health and Disease
Chunks token count : 18.0 | Text : Updated July 24, 2017. Accessed April 15, 2018. 1112 | Threats to Health
Chunks token count : 21.0 | Text : PART XII CHAPTER 12. NUTRITION APPLICATIONS Chapter 12. Nutrition Applications | 705
Chunks token count : 10.5 | Text : The Major Types of Foodborne Illness | 993
Chunks token count : 16.0 | Text : Table 14.2 Micronutrient Levels during Puberty 886 | Adolescence


In [37]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient = "records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

In [40]:
random.sample(pages_and_chunks_over_min_token_len,k=1)

[{'page_number': 1144,
  'sentence_chunk': 'that regulate this profession.1 Go to https://www.cdrnet.org/ certifications to learn more. Working in Nutrition Registered dietitians (RDs)/registered dietitians nutritionist (RDNs) and nutritionists plan food and nutrition programs, promote healthy eating habits, and recommend dietary modifications based on the needs of individuals or groups. For example, an RD/RDN might teach a patient with hypertension how to follow the DASH diet and reduce their sodium intake. Nutrition-related careers can be extremely varied. Some individuals work in the government, while others are solely in the private sectors (i.e., private practice, worksite wellness, hospitals, outpatient clinics, etc). Some jobs in nutrition focus on working with elite athletes, while others provide guidance to patients with long-term, life-threatening diseases. But no matter the circumstance or the clientele, working in the field of diet and nutrition focuses on helping people im