In [1]:
###Import PDF Document

import os
import requests

#Getting Path of the PDF doc
pdf_path = "human-nutrition-text.pdf"

#Download PDF
if not os.path.exists(pdf_path):
    print(f"[INFO] File doesn't exist, let me download for You..")

    #Enter the URL of the PDF
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    # The Local filename to save the downloaded file
    filename = pdf_path

    # Sending a Get Request to the URL
    response = requests.get(url)

    #Check if the request was successful
    if response.status_code == 200:
        # Open the file and save it 
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"[INFO] The file has been downloaded ans saved as {filename}")
    else:
        print(f"[INFO]  Failed to download the file. Status code: {response.status_code}")

else:
    print(f"File {pdf_path} exists.") 

File human-nutrition-text.pdf exists.


In [2]:
import fitz #requires: pip install PyMuPDF, 
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
     """Performs minor formatting on text."""
     cleaned_text = text.replace("\n", " ").strip()

     return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 41,
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4, # 1 token = ~4 character
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [3]:
import random

random.sample(pages_and_texts, k=5)

[{'page_number': 387,
  'page_char_count': 1328,
  'page_word_count': 248,
  'page_sentence_count_raw': 11,
  'page_token_count': 332.0,
  'text': 'PDB 1o9x EBI  by Jawahar  Swaminatha n and MSD  staff at the  European  Bioinformati cs Institute /  Public  Domain The  butterfly-sha ped protein,  albumin, has  many  functions in  the body  including  maintaining  fluid and  acid-base  balance and  transporting  molecules.  If too much water in the blood suddenly moves into a tissue, the  results are swelling and, potentially, cell death. Water always flows  from an area of high concentration to one of a low concentration. As  a result, water moves toward areas that have higher concentrations  of other solutes, such as proteins and glucose. To keep the water  evenly distributed between blood and cells, proteins continuously  circulate at high concentrations in the blood. The most abundant  protein in blood is the butterfly-shaped protein known as albumin.  Albumin’s presence in the blood

In [4]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)   #A DataFrame is a data structure that organizes data into a 2-dimensional table of rows and columns, much like a spreadsheet.
df.head()

#df.head(): This line calls the head() method on the DataFrame df. The head() method is used to display the first few rows of the DataFrame.
#By default, it displays the first 5 rows

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [5]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15
std,348.86,560.44,95.75,6.19,140.11
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.75,134.0,4.0,190.69
50%,562.5,1232.5,215.0,10.0,308.12
75%,864.25,1605.25,271.25,14.0,401.31
max,1166.0,2308.0,429.0,32.0,577.0


In [6]:
### Splitting pages into sentences.

In [7]:
from spacy.lang.en import English

nlp = English()

# Add a sentencizer pipeliner, 
nlp.add_pipe("sentencizer")

# create a document instance as an example
doc = nlp("Sentence 1. sentence 2. sentence 3")
assert len(list(doc.sents)) == 3

# Print out our sentences split
list(doc.sents)

[Sentence 1., sentence 2., sentence 3]

In [8]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [9]:
# Inspect an example
random.sample(pages_and_texts, k=1)

[{'page_number': 985,
  'page_char_count': 67,
  'page_word_count': 15,
  'page_sentence_count_raw': 3,
  'page_token_count': 16.75,
  'text': 'PART\xa0XVII  CHAPTER 17. FOOD SAFETY  Chapter 17. Food Safety  |  985',
  'sentences': ['PART\xa0XVII  CHAPTER 17.',
   'FOOD SAFETY  Chapter 17.',
   'Food Safety  |  985'],
  'page_sentence_count_spacy': 3}]

In [10]:
### listing list of dictionaries into a DataFrame for getting some stats.

df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15,10.32
std,348.86,560.44,95.75,6.19,140.11,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.75,134.0,4.0,190.69,5.0
50%,562.5,1232.5,215.0,10.0,308.12,10.0
75%,864.25,1605.25,271.25,14.0,401.31,15.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0


In [11]:
### Chunking our sentences together

#We will split into groups of 10 sentences.

In [12]:
#Define split size for turning grps of sentences into chunks.

num_sentence_chunk_size = 10

# Creating a function that recursively splits a list into desired sizes..
# e.g. list of 30 -> [10, 10, 10]

def split_list(input_list: list, 
               slice_size: int=num_sentence_chunk_size) -> list[list[str]]:

    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
split_list(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [13]:
### loop through pages and texts and split them into chunks

for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [14]:
random.sample(pages_and_texts, k=1)

[{'page_number': 239,
  'page_char_count': 476,
  'page_word_count': 79,
  'page_sentence_count_raw': 4,
  'page_token_count': 119.0,
  'text': 'recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.  \xa0 An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http:/ /pressbooks.oer.hawaii.edu/ humannutrition2/?p=175  \xa0 An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http:/ /pressbooks.oer.hawaii.edu/ humannutrition2/?p=175  Introduction  |  239',
  'sentences': ['recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.',
   ' \xa0 An interactive or media element has been  excluded from this version of the text.',
   'You can  view it online here:  http:/ /pressbooks.oer.hawaii.edu/ humannutrition2/?p=175  \xa0 An interactive or media element has been  excluded 

In [15]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15,10.32,1.53
std,348.86,560.44,95.75,6.19,140.11,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.75,134.0,4.0,190.69,5.0,1.0
50%,562.5,1232.5,215.0,10.0,308.12,10.0,1.0
75%,864.25,1605.25,271.25,14.0,401.31,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


In [16]:
### Splitting each chunk into its own item

In [17]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}   #empty dictionry
        chunk_dict["page_number"] = item["page_number"] ### from which page chunk came from. 
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [18]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 1091,
  'sentence_chunk': 'Image by Tomasz Sienick / CC BY- SA 3.0 Risk Factors for Osteoporosis A risk factor is defined as a variable that is linked to an increased probability of developing a disease or adverse outcome. Recall that advanced age and being female increases the likelihood for developing osteoporosis. These factors present risks that should signal doctors and individuals to focus more attention on bone health, especially when the risk factors exist in combination. This is because not all risk factors for osteoporosis are out of your control. Risk factors such as age, gender, and race are biological risk factors, and are based on genetics that cannot be changed. By contrast, there are other risk factors that can be modified, such as physical activity, alcohol intake, and diet. The changeable risk factors for osteoporosis provide a mechanism to improve bone health even though some people may be genetically predisposed to the disease. \xa0 Physical Activit

In [19]:
# Get stats about our chunks
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.83,112.72,183.71
std,347.79,447.43,71.07,111.86
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,45.0,78.75
50%,586.0,746.0,114.0,186.5
75%,890.0,1118.5,173.0,279.62
max,1166.0,1831.0,297.0,457.75


In [20]:
# Show random chunks with under 30 tokens in length
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 29.0 | Text: Journal of Nutrition, 138(6), 1250S–4S. http:/ /jn.nutrition.org/content/138/6/ 1250S.long The Digestive System | 71
Chunk token count: 24.5 | Text: http:/ /pressbooks.oer.hawaii.edu/ humannutrition2/?p=225 330 | Digestion and Absorption of Lipids
Chunk token count: 24.25 | Text: There are several lecithin supplements on the market Nonessential and Essential Fatty Acids | 315
Chunk token count: 27.5 | Text: Iron Status and Exercise. The American Journal of Clinical Nutrition, 72(2), 594S–597S. Sports Nutrition | 967
Chunk token count: 17.75 | Text: Table 6.1 Essential and Nonessential Amino Acids Defining Protein | 365


In [21]:
### Embedding text chunks

from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path = "all-mpnet-base-v2",
                                    device="cpu")

# Create a list of sentences

sentences = ["Today is a sunny day","The cat chased the mouse.",
             "The sun sets behind the mountains, casting a warm glow.",
             " I love Sunny day!"]

# Sentences are encoded by calling model.encode()
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print(f"Sentence: {sentence}")
    print(f"Embeddings: {embedding}")
    print("")



Sentence: Today is a sunny day
Embeddings: [-7.46822264e-03 -2.32481654e-03 -1.30726798e-02 -1.36856707e-02
  4.48430190e-03 -4.14167754e-02 -1.46913854e-02 -2.07767244e-02
  1.99957639e-02  1.39832990e-02 -2.80092433e-02  9.16396156e-02
  4.51908866e-03 -5.64476512e-02  2.48426665e-02 -1.04852274e-01
  2.34691426e-02 -2.50755381e-02 -1.45190340e-02  1.23618701e-02
 -7.39326421e-03  1.92255399e-03  2.95559727e-02 -7.34867062e-03
 -1.98724191e-03 -4.99101169e-02  1.39010958e-02  1.45524191e-02
 -2.83754081e-03  5.04287286e-03 -5.37461340e-02 -2.75936183e-02
 -5.75160887e-03 -5.65932952e-02  1.75024616e-06  3.50039081e-05
  1.20414263e-02  7.00148381e-03  3.49790193e-02 -5.43604465e-03
 -4.61494811e-02 -4.55604047e-02  8.90954211e-03  7.36638345e-03
  3.99158709e-03  9.99210030e-03 -1.81916484e-03  5.24166180e-03
 -2.72025764e-02  3.39604504e-02 -5.05397050e-03  1.12712001e-02
 -1.32356631e-02  9.24892910e-03 -6.39619753e-02  2.92609278e-02
  1.92868989e-03 -2.16285326e-02 -1.14451135e-0

In [22]:
embeddings[0].shape

(768,)

In [23]:
%%time

embedding_model.to("cpu")

#Embed each chunk one by one
for item in tqdm(pages_and_chunks):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/1843 [00:00<?, ?it/s]

CPU times: total: 24min 34s
Wall time: 10min 35s


In [24]:
# Turn text chunks into a single list
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks]

In [27]:
%%time

# Embed all texts in batches
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=64,
                                               convert_to_tensor=True) 

text_chunk_embeddings

CPU times: total: 53min 16s
Wall time: 17min 48s


tensor([[ 0.0441,  0.0924,  0.0033,  ..., -0.0139, -0.0256,  0.0086],
        [ 0.0674,  0.0902, -0.0051,  ..., -0.0221, -0.0232,  0.0126],
        [ 0.0552,  0.0592, -0.0166,  ..., -0.0120, -0.0103,  0.0227],
        ...,
        [ 0.0771,  0.0098, -0.0122,  ..., -0.0409, -0.0752, -0.0241],
        [ 0.1030, -0.0165,  0.0083,  ..., -0.0574, -0.0283, -0.0295],
        [ 0.0864, -0.0125, -0.0113,  ..., -0.0522, -0.0337, -0.0299]])

In [28]:
# Save embeddings to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [29]:
# Import saved file and view
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-41,Human Nutrition: 2020 Edition,29,4,7.25,[ 4.41241972e-02 9.24155116e-02 3.28842155e-...
1,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,[ 6.74242601e-02 9.02281031e-02 -5.09549491e-...
2,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,[ 5.52156381e-02 5.92138581e-02 -1.66167654e-...
3,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.5,[ 2.79802009e-02 3.39813679e-02 -2.06426568e-...
4,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25,[ 6.82566762e-02 3.81274708e-02 -8.46854504e-...


In [32]:
import random

import torch
import numpy as np
import pandas as pd

device = "cpu"

#Import text and embedding df
text_chunks_and_embeddings_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

#Convert texts and embedding df to list of dicts
text_chunks_and_embeddings_df["embedding"] = text_chunks_and_embeddings_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep= " "))

# Convert our embeddings into torch.tensor
embeddings = torch.tensor(np.stack(text_chunks_and_embeddings_df["embedding"].tolist(), axis=0))

# Converting texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embeddings_df.to_dict(orient="records")

text_chunks_and_embeddings_df

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-41,Human Nutrition: 2020 Edition,29,4,7.25,"[0.0441241972, 0.0924155116, 0.00328842155, -0..."
1,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.00,"[0.0674242601, 0.0902281031, -0.00509549491, -..."
2,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.50,"[0.0552156381, 0.0592138581, -0.0166167654, -0..."
3,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.50,"[0.0279802009, 0.0339813679, -0.0206426568, 0...."
4,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25,"[0.0682566762, 0.0381274708, -0.00846854504, -..."
...,...,...,...,...,...,...
1838,1164,Flashcard Images Note: Most images in the flas...,1305,176,326.25,"[0.0185622517, -0.0164277963, -0.0127045568, -..."
1839,1164,Hazard Analysis Critical Control Points reused...,375,51,93.75,"[0.0334720686, -0.0570441112, 0.015148947, -0...."
1840,1165,ShareAlike 11. Organs reused “Pancreas Organ A...,1286,173,321.50,"[0.0770515501, 0.00978557672, -0.0121817188, 0..."
1841,1165,Sucrose reused “Figure 03 02 05” by OpenStax B...,410,59,102.50,"[0.103045195, -0.0164702553, 0.00826845132, 0...."


In [33]:
embeddings.shape

torch.Size([1843, 768])

In [38]:
# create model
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path = "all-mpnet-base-v2",device=device)


In [39]:
##Embedding model is ready. 

## Now we will create a semantic search pipeline.
