In [1]:
import fitz # pip install pymupdf
from tqdm.auto import tqdm

In [3]:
def text_formatter(text: str) -> str:
    """Flatten a block of extracted text onto a single line.

    Every newline is replaced by a single space and leading/trailing
    whitespace is stripped. Runs of spaces inside the text are left as-is.
    """
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

In [4]:
def pdf_to_text(pdf_path: str, page_offset: int = 41) -> list[dict]:
    """Read a PDF and return one record of text plus simple statistics per page.

    Args:
        pdf_path: Path of the PDF file to open.
        page_offset: Number subtracted from the zero-based page index to get
            the reported ``page_number``. The default of 41 is the value that
            was previously hard-coded in this function.

    Returns:
        A list with one dict per page, holding the keys ``page_number``,
        ``page_char_count``, ``page_word_count``, ``page_line_count``,
        ``page_token_count`` and ``text``.
    """
    pages_and_texts = []
    # The context manager closes the document once every page has been read.
    with fitz.open(pdf_path) as doc:
        # enumerate() hides the length from tqdm, so pass the total explicitly;
        # otherwise the bar can only show "0it" with no percentage.
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):
            text = text_formatter(page.get_text())
            pages_and_texts.append({
                "page_number": page_number - page_offset,
                "page_char_count": len(text),
                # Splitting on a single space counts empty strings produced by
                # consecutive spaces (and yields 1 for an empty page).
                "page_word_count": len(text.split(" ")),
                # Despite the key name this counts ". "-separated pieces, i.e.
                # a rough sentence count rather than physical lines.
                "page_line_count": len(text.split(". ")),
                # Rough estimate: 1 token ~= 4 characters.
                "page_token_count": len(text) / 4,
                "text": text
            })
    return pages_and_texts

In [5]:
pages_and_texts = pdf_to_text(pdf_path="./human-nutrition-text.pdf")
pages_and_texts[:10]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_line_count': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_line_count': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': -39,
  'page_char_count': 320,
  'page_word_count': 54,
  'page_line_count': 1,
  'page_token_count': 80.0,
  'text': 'Human Nutrition: 2020  Edition  UNIVERSITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCHENAL, SKYLAR HARA,  NOEMI ARCEO CAACBAY, WILLIAM  MEINKE-LAU, YA-YUN YANG, MARIE  KAINOA FIALKOWSKI REVILLA,  JENNIFER DRAPER, GEMADY  LANGFELDER, CHERYL GIBBY, CHYNA  NICOLE CHUN, AND ALLISON  CALABRESE'},
 {'page_number': -38,
  'page_char_count': 212,
  'page_word_count': 32,
  'page_line_count': 1,
  'page_token_count': 53.0,
  'text': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and  Human Nutrition Pro

In [6]:
import random

random.sample(pages_and_texts, k=2)

[{'page_number': 164,
  'page_char_count': 859,
  'page_word_count': 146,
  'page_line_count': 14,
  'page_token_count': 214.75,
  'text': 'are higher than the average intake of 2.2 liters. It is important to  note that the AI for water includes water from all dietary sources;  that is, water coming from food as well as beverages. People are  not expected to consume 15.6 or 11 cups of pure water per day.  In America, approximately 20 percent of dietary water comes from  solid foods. See Table 3.1 “Water Content in Foods” for the range of  water contents for selected food items. Beverages includes water,  tea, coffee, sodas, and juices.  Table 3.1 Water Content in Foods  for Electrolytes and Water. (2005). Dietary Reference  Intakes for Water, Potassium, Sodium, Chloride, and  Sulfate. The National Academies of Science, Engineering,  and Medicine. Washington D.C. http://www.nap.edu/ openbook.php?record_id=10925&page=73. Accessed  September 22, 2017.  164  |  Regulation of Water Balance'

In [2]:
import pandas as pd
import numpy as np

In [8]:
df = pd.DataFrame(pages_and_texts)

df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_line_count,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [9]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_line_count,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0
std,348.86,560.38,95.83,6.55,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,5.0,190.5
50%,562.5,1231.5,216.0,10.0,307.88
75%,864.25,1603.5,272.0,15.0,400.88
max,1166.0,2308.0,430.0,39.0,577.0


### Further Text Processing

In [9]:
from spacy.lang.en import English

In [10]:
nlp = English()

# Add a sentencizer pipeline
nlp.add_pipe("sentencizer")

# Create a document Instance
doc = nlp("This is a sentence. I like Elephants. That is another example")

assert len(list(doc.sents)) == 3

list(doc.sents)

[This is a sentence., I like Elephants., That is another example]

In [11]:
# Split every page into sentences with the spaCy pipeline and record the count.
for item in tqdm(pages_and_texts):
    # spaCy yields span objects; store plain strings instead.
    item["sentences"] = [str(sentence) for sentence in nlp(item["text"]).sents]
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [12]:
pages_and_texts[50]

{'page_number': 9,
 'page_char_count': 1320,
 'page_word_count': 215,
 'page_line_count': 4,
 'page_token_count': 330.0,
 'text': 'Minerals  Major Functions  Macro  Sodium  Fluid balance, nerve transmission, muscle contraction  Chloride  Fluid balance, stomach acid production  Potassium  Fluid balance, nerve transmission, muscle contraction  Calcium  Bone and teeth health maintenance, nerve transmission,  muscle contraction, blood clotting  Phosphorus  Bone and teeth health maintenance, acid-base balance  Magnesium  Protein production, nerve transmission, muscle  contraction  Sulfur  Protein production  Trace  Iron  Carries oxygen, assists in energy production  Zinc  Protein and DNA production, wound healing, growth,  immune system function  Iodine  Thyroid hormone production, growth, metabolism  Selenium  Antioxidant  Copper  Coenzyme, iron metabolism  Manganese  Coenzyme  Fluoride  Bone and teeth health maintenance, tooth decay  prevention  Chromium  Assists insulin in glucose metabo

In [13]:
df = pd.DataFrame(pages_and_texts)

df.describe().round(0)

Unnamed: 0,page_number,page_char_count,page_word_count,page_line_count,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.0,1148.0,199.0,11.0,287.0,10.0
std,349.0,560.0,96.0,7.0,140.0,6.0
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,261.0,762.0,134.0,5.0,190.0,5.0
50%,562.0,1232.0,216.0,10.0,308.0,10.0
75%,864.0,1604.0,272.0,15.0,401.0,15.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0


In [14]:
def split_list(input_list: list[str], split_size: int = 10) -> list[list[str]]:
    """Cut ``input_list`` into consecutive slices of at most ``split_size`` items.

    The final slice holds whatever is left over and may be shorter. An empty
    input produces an empty list.
    """
    pieces = []
    for start in range(0, len(input_list), split_size):
        pieces.append(input_list[start:start + split_size])
    return pieces

In [15]:
# Group each page's sentences into chunks and record how many chunks resulted.
for item in tqdm(pages_and_texts):
    sentence_groups = split_list(input_list=item["sentences"])
    item["sentence_chunks"] = sentence_groups
    item["num_chunks"] = len(sentence_groups)

  0%|          | 0/1208 [00:00<?, ?it/s]

In [16]:
random.sample(pages_and_texts, k=1)

[{'page_number': 751,
  'page_char_count': 1266,
  'page_word_count': 240,
  'page_line_count': 16,
  'page_token_count': 316.5,
  'text': 'with 1 tsp. olive oil,  40     with 1 tsp. sesame seeds  18  ½ c. cooked wild rice  83     with ½ c. chopped kale  18  1 whole-wheat dinner roll  4     with 1 tsp. almond butter  33  691  (Total calories from all meals and  snacks = 1,814)  Discretionary calorie allowance: 186  (Total calories from all meals and snacks = 1,814)  Discretionary calorie allowance: 186  Healthy Eating Index  To assess whether the American diet is conforming to the Dietary  Guidelines, the Center for Nutrition Policy and Promotion (CNPP),  a division of the USDA, uses a standardized tool called the Healthy  Eating Index (HEI)2.  The first HEI was developed in 1995 and revised in 2006. This  tool is a simple scoring system of dietary components. The data for  scoring diets is taken from national surveys of particular population  subgroups, such as children from low-incom

In [17]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_line_count,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32,1.53
std,348.86,560.38,95.83,6.55,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0,1.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0,1.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0,2.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0,3.0


#### Splitting each chunk into its own item

In [16]:
import re

In [19]:
# Flatten the per-page sentence chunks into one record per chunk, carrying the
# page number and simple size statistics along with the chunk text.

pages_and_chunks = []

for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # Sentences are concatenated with no separator.
        # NOTE(review): as written this replace() swaps a space for a space and
        # looks like a no-op; it was presumably meant to collapse double spaces
        # ("  " -> " "). Confirm the intended characters before changing it,
        # since that would alter the chunk statistics and embeddings.
        joined_sentence_chunk = "".join(sentence_chunk).replace(" ", " ").strip()
        # Re-insert a space after a full stop that is directly followed by a
        # capital letter (".A" -> ". A"), which the separator-less join produces.
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)

        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        # Splitting on a single space also counts empty strings between
        # consecutive spaces.
        chunk_dict["chunk_word_count"] = len(joined_sentence_chunk.split(" "))
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token ~= 4 chars

        pages_and_chunks.append(chunk_dict)


  0%|          | 0/1208 [00:00<?, ?it/s]

In [20]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 1083,
  'sentence_chunk': 'Oral Disease  Oral health refers not only to healthy teeth and gums, but also to the  health of all the supporting tissues in the mouth such as ligaments,  nerves, jawbone, chewing muscles, and salivary glands. Over ten  years ago the Surgeon General produced its first report dedicated  to oral health, stating that oral health and health in general are not  separate entities.2  Instead, oral health is an integral part of overall health and well- being. Soft drinks, sports drinks, candies, desserts, and fruit juices  are the main sources of “fermentable sugars” in the American diet. (Fermentable sugars are those that are easily metabolized by  bacteria in a process known as fermentation. Glucose, fructose,  and maltose are three examples.)Bacteria that inhabit the mouth  metabolize fermentable sugars and starches in refined grains to  acids that erode tooth enamel and deeper bone tissues. The acid  creates holes (cavities) in the teeth that ca

In [21]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,749.91,128.55,187.48
std,347.79,455.68,80.01,113.92
min,-41.0,14.0,4.0,3.5
25%,280.5,321.5,53.0,80.38
50%,586.0,762.0,132.0,190.5
75%,890.0,1137.5,195.0,284.38
max,1166.0,1870.0,415.0,467.5


In [22]:
df.sample(5)

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
1612,1031,Genomics. Energy.gov.http://www.ornl.gov/sci/ ...,322,30,80.5
745,468,"In a much less severe example, a person is als...",558,93,139.5
1149,748,Nutrition Facts label provides the information...,1361,223,340.25
1128,734,Qualified Health Claims While health claims m...,1095,176,273.75
295,177,"The symptoms of hyponatremia, also called wat...",469,80,117.25


In [23]:
# Keep only the chunks (rows) whose estimated token count is strictly greater
# than the minimum; very short chunks are mostly headers, footers and page
# numbers. Chunks with exactly `min_token_length` tokens are dropped too.
min_token_length = 30
pages_and_chunks_over_min_token_length = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")

In [24]:
pages_and_chunks_over_min_token_length[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020  Edition  UNIVERSITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCHENAL, SKYLAR HARA,  NOEMI ARCEO CAACBAY, WILLIAM  MEINKE-LAU, YA-YUN YANG, MARIE  KAINOA FIALKOWSKI REVILLA,  JENNIFER DRAPER, GEMADY  LANGFELDER, CHERYL GIBBY, CHYNA  NICOLE CHUN, AND ALLISON  CALABRESE',
  'chunk_char_count': 320,
  'chunk_word_count': 54,
  'chunk_token_count': 80.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program is licensed under a Creative Commons Attribution 4.0  International License, except where otherwise noted.',
  'chunk_char_count': 212,
  'chunk_word_count': 32,
  'chunk_token_count': 53.0}]

In [25]:
random.sample(pages_and_chunks_over_min_token_length, k=1)

[{'page_number': 1139,
  'sentence_chunk': 'Accessed April 15, 2018. 12. Coleman-Jensen A. Household Food Security in the  United States in 2010. US Department of Agriculture,  Economic Research Report, no. ERR-125.2011.  https://www.ers.usda.gov/publications/pub- details/?pubid=44909. Accessed April 15, 2018. Food Insecurity  |  1139',
  'chunk_char_count': 292,
  'chunk_word_count': 40,
  'chunk_token_count': 73.0}]

### Embedding Our Text Chunks

In [3]:
from sentence_transformers import SentenceTransformer, util

In [10]:
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2")

embedding_model.to("cpu")

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [4]:
embedding_model.encode("I Love Cows")

array([-1.08290492e-02,  1.12156361e-01, -7.06325704e-03, -3.53761278e-02,
        4.84256819e-02,  5.62884174e-02, -6.60703555e-02,  1.73046757e-02,
        4.05186862e-02, -1.29254879e-02, -1.53067810e-02, -2.24408973e-02,
        8.54996499e-03,  5.26127592e-02, -3.21709812e-02,  8.86928383e-03,
        3.35634649e-02,  2.51500569e-02, -3.70987738e-03,  2.89219860e-02,
        4.84897580e-04,  3.32138687e-02, -3.54068130e-02, -4.67082411e-02,
       -4.73706760e-02, -4.07523429e-03,  3.44878121e-04, -1.16349505e-02,
        3.46870832e-02,  5.12307622e-02, -4.99993861e-02, -5.89180216e-02,
       -1.78299833e-03, -5.80947064e-02,  1.51594065e-06,  1.54183758e-02,
       -6.19132593e-02,  2.53136531e-02, -1.65514706e-03,  8.85225236e-02,
       -6.62912289e-03, -5.03774509e-02, -2.84088571e-02, -2.78458446e-02,
        2.11715419e-03,  1.53253138e-01,  4.99914847e-02, -4.00474183e-02,
       -3.34881917e-02,  1.21943038e-02, -4.91741719e-03,  1.05919968e-03,
       -5.70174158e-02, -

In [None]:
# Use the GPU when one is present and fall back to the CPU otherwise, instead
# of failing on machines without CUDA.
import torch

embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model.to(embedding_device)

# Attach an embedding to every chunk record; the "embedding" key is what gets
# written to (and later read back from) the CSV file.
for chunk in tqdm(pages_and_chunks_over_min_token_length):
    chunk["embedding"] = embedding_model.encode(chunk["sentence_chunk"])

In [30]:
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_length]

In [31]:
text_chunks[450]

'fat is critical to our survival and good health, in large quantities it  can be a deterrent to maintaining good health. Regulating and Signaling  Triglycerides control the body’s internal climate, maintaining  constant temperature. Those who don’t have enough fat in their  bodies tend to feel cold sooner, are often fatigued, and have  pressure sores on their skin from fatty acid deficiency. Triglycerides  also help the body produce and regulate hormones. For example,  adipose tissue secretes the hormone leptin, which regulates  appetite. In the reproductive system, fatty acids are required for  proper reproductive health. Women who lack proper amounts may  stop menstruating and become infertile. Omega-3 and omega-6  essential fatty acids help regulate cholesterol and blood clotting  and control inflammation in the joints, tissues, and bloodstream. Fats also play important functional roles in sustaining nerve impulse  transmission, memory storage, and tissue structure. More  specifical

In [32]:
len(text_chunks)

1685

In [None]:
%%time

text_chunk_embeddings = embedding_model.encode(text_chunks, batch_size=32, convert_to_tensor=True)

In [None]:
text_chunk_embeddings

In [None]:
embedding_df = pd.DataFrame(pages_and_chunks_over_min_token_length)

# Define the output path in the cell that first needs it, so the notebook
# works when run top to bottom on a fresh kernel.
embedding_df_save_path = "./text_chunks_and_embeddings.csv"

embedding_df.to_csv(embedding_df_save_path, index=False)

In [None]:
embedding_df.head()

In [56]:
embedding_df_save_path = "./text_chunks_and_embeddings.csv"


In [57]:
embedding_df_load = pd.read_csv(embedding_df_save_path)


embedding_df_load["embedding"] = embedding_df_load["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" ", dtype=np.float32))

embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF ...,320,54,80.0,"[0.06742426, 0.09022816, -0.005095494, -0.0317..."
1,-38,Human Nutrition: 2020 Edition by University of...,212,32,53.0,"[0.05521559, 0.059213936, -0.016616715, -0.020..."
2,-37,Contents Preface University of Hawai‘i at Mā...,797,147,199.25,"[0.027980188, 0.033981398, -0.02064267, 0.0019..."
3,-36,Lifestyles and Nutrition University of Hawai‘...,976,179,244.0,"[0.06825668, 0.038127493, -0.008468544, -0.018..."
4,-35,The Cardiovascular System University of Hawai...,1037,191,259.25,"[0.03302644, -0.008497676, 0.0095715895, -0.00..."


In [58]:
embedding_df_load["embedding"][0].dtype

dtype('float32')

In [59]:
pages_and_chunks = embedding_df_load[["sentence_chunk", "page_number", "chunk_word_count"]].to_dict(orient="records")

In [60]:
text_chunk_embeddings_arr = embedding_df_load["embedding"].tolist()

In [61]:
text_chunk_embeddings_arr[1]

array([ 5.52155897e-02,  5.92139363e-02, -1.66167151e-02, -2.04602703e-02,
        6.92422614e-02,  3.51345316e-02, -1.87620074e-02,  3.21568958e-02,
        7.78691173e-02, -8.06518644e-03,  2.60771941e-02,  1.17855081e-04,
        2.36336999e-02,  6.99443556e-03,  1.74458523e-06, -3.82585870e-03,
        3.45728314e-03,  1.16405217e-02,  1.01687647e-02,  4.95471284e-02,
       -5.18356860e-02,  1.88298151e-02,  4.51909862e-02,  4.23135236e-02,
       -4.12121639e-02,  4.93984995e-03,  3.25199701e-02, -1.81734543e-02,
        8.84532928e-03, -6.44744262e-02, -5.04508242e-03,  1.74673516e-02,
       -1.65691343e-03, -8.50823969e-02,  2.46762761e-06, -1.69053506e-02,
        1.09408535e-02,  3.01257893e-02, -6.66744709e-02,  6.21617325e-02,
        3.50563452e-02, -2.47929320e-02, -1.59021318e-02,  2.37372350e-02,
        3.93132940e-02,  4.06050496e-02,  4.51445878e-02, -5.83529472e-03,
       -1.52490763e-02,  8.62988550e-03, -1.96106615e-03, -3.10199708e-02,
       -3.25586908e-02,  

In [62]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

In [63]:
query = "good food for protein"

In [64]:
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

In [65]:
text_chunk_embeddings = torch.tensor(np.array(text_chunk_embeddings_arr)).to("cpu")

In [66]:
# To Detach variable from cpu
# text_chunk_embeddings = text_chunk_embeddings.detach().cpu().numpy()

In [67]:
# Get similarity score with dot product (use cosine similarity score if output model aren't normalized)

dot_scores = util.dot_score(a=query_embedding, b=text_chunk_embeddings)[0]

In [18]:
dot_scores

tensor([0.3239, 0.3371, 0.3416,  ..., 0.1652, 0.1184, 0.1118])

In [19]:
top_results = torch.topk(dot_scores, k=5)

In [20]:
top_results

torch.return_types.topk(
values=tensor([0.7691, 0.7649, 0.6903, 0.6845, 0.6834]),
indices=tensor([614, 619, 623, 618, 605]))

In [26]:
pages_and_chunks[47]

{'sentence_chunk': 'Water  There is one other nutrient that we must have in large quantities:  water. Water does not contain carbon, but is composed of two  hydrogens and one oxygen per molecule of water. More than 60  percent of your total body weight is water. Without it, nothing could  be transported in or out of the body, chemical reactions would not  occur, organs would not be cushioned, and body temperature would  fluctuate widely. On average, an adult consumes just over two liters  of water per day from food and drink combined. Since water is so  critical for life’s basic processes, the amount of water input and  output is supremely important, a topic we will explore in detail in  Chapter 4. Micronutrients  Micronutrients are nutrients required by the body in lesser  amounts, but are still essential for carrying out bodily functions. Micronutrients include all the essential minerals and vitamins. There are sixteen essential minerals and thirteen vitamins (See  Table 1.1 “Mineral

In [27]:
import textwrap

def print_wrap(text, wrap_length=80):
    """Print ``text`` re-flowed so that no line is longer than ``wrap_length``."""
    print(textwrap.fill(text, width=wrap_length))

In [28]:
print(f"Query: {query}")
print("Results")

# top_results is the (values, indices) pair returned by torch.topk above.
for score, idx in zip(top_results[0], top_results[1]):
    print("\n")
    print(f"Score: {score:.4f}")
    print("Text")
    print(pages_and_chunks[idx]["sentence_chunk"])
    # Single quotes inside the replacement field: reusing the outer double
    # quotes is only accepted from Python 3.12 onwards and is a SyntaxError
    # on earlier versions.
    print(f"Page Number: {pages_and_chunks[idx]['page_number']}")

Query: good food for protein
Results


Score: 0.7691
Text
Dietary Sources of Protein  The protein food group consists of foods made from meat, seafood,  poultry, eggs, soy, dry beans, peas, and seeds. According to the  Harvard School of Public Health, “animal protein and vegetable  protein probably have the same effects on health. It’s the protein  package that’s likely to make a difference.”1  1. Protein: The Bottom Line. Harvard School of Public  Proteins, Diet, and Personal Choices  |  411
Page Number: 411


Score: 0.7649
Text
Additionally, a person should consume 8 ounces of cooked seafood  every week (typically as two 4-ounce servings) to assure they are  getting the healthy omega-3 fatty acids that have been linked to a  lower risk for heart disease. Another tip is choosing to eat dry beans,  peas, or soy products as a main dish. Some of the menu choices  include chili with kidney and pinto beans, hummus on pita bread,  and black bean enchiladas. You could also enjoy nuts in a va

### *Note:* We could rerank the results with a reranking model to improve their order

----

##### Functionizing our Semantic Search

In [22]:
from time import perf_counter as Timer

In [23]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.Tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True) -> tuple[torch.Tensor, torch.Tensor]:
    """Embed ``query`` and return the best-matching rows of ``embeddings``.

    Args:
        query: Free-text search query.
        embeddings: Embeddings to search, one row per chunk.
        model: Model used to embed the query. The default is bound to the
            module-level ``embedding_model`` at the time this function is
            defined.
        n_resources_to_return: Maximum number of results. When fewer rows are
            available, all of them are returned instead of raising.
        print_time: When true, print how long the dot-product scoring took.

    Returns:
        ``(scores, indices)`` as produced by ``torch.topk``, ordered from the
        highest score downwards.
    """
    query_embedding = model.encode(query, convert_to_tensor=True)

    # The model may live on a different device than the stored embeddings
    # (e.g. model on GPU, embeddings loaded from disk onto the CPU); bring the
    # query over so the dot product does not fail on a device mismatch.
    if isinstance(embeddings, torch.Tensor):
        query_embedding = query_embedding.to(embeddings.device)

    start_time = Timer()
    dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
    end_time = Timer()

    if print_time:
        print(f"[INFO] time taken for scores: {end_time - start_time:.5f}")

    # torch.topk raises when k exceeds the number of scores, so cap it.
    k = min(n_resources_to_return, dot_scores.shape[0])
    scores, indices = torch.topk(dot_scores, k=k)

    return scores, indices

In [24]:
def print_top_results(query: str,
                      embeddings: torch.Tensor,
                      pages_and_chunks: list[dict]=pages_and_chunks,
                      n_resources_to_return: int=5):
    """Run a semantic search for ``query`` and print each hit.

    Args:
        query: Free-text search query.
        embeddings: Embeddings to search, one row per entry of
            ``pages_and_chunks``.
        pages_and_chunks: Chunk records, each with ``"sentence_chunk"`` and
            ``"page_number"`` keys. The default is bound to the module-level
            list at the time this function is defined.
        n_resources_to_return: Maximum number of hits to print.
    """
    scores, indices = retrieve_relevant_resources(query=query, embeddings=embeddings, n_resources_to_return=n_resources_to_return)

    for score, idx in zip(scores, indices):
        print("\n")
        print(f"Score: {score:.4f}")
        print("Text")
        print(pages_and_chunks[idx]["sentence_chunk"])
        # Single quotes inside the replacement field: reusing the outer double
        # quotes is a SyntaxError before Python 3.12.
        print(f"Page Number: {pages_and_chunks[idx]['page_number']}")
            

In [25]:
retrieve_relevant_resources("foods high in fiber", embeddings=text_chunk_embeddings)

[INFO] time taken for scores: 0.00030


(tensor([0.6964, 0.6810, 0.5566, 0.5344, 0.5187]),
 tensor([ 420,  362,  360, 1051,  414]))

In [27]:
print_top_results("foods high in fiber", embeddings=text_chunk_embeddings)

[INFO] time taken for scores: 0.00030


Score: 0.6964
Text
• Change it up a bit and experience the taste and satisfaction of  other whole grains such as barley, quinoa, and bulgur. • Eat snacks high in fiber, such as almonds, pistachios, raisins,  and air-popped popcorn. Add an artichoke and green peas to your dinner plate more  276  |  Carbohydrates and Personal Diet Choices
Page Number: 276


Score: 0.6810
Text
Dietary fiber is categorized as either water-soluble or insoluble. Some examples of soluble fibers are inulin, pectin, and guar gum  and they are found in peas, beans, oats, barley, and rye. Cellulose  and lignin are insoluble fibers and a few dietary sources of them  are whole-grain foods, flax, cauliflower, and avocados. Cellulose is  the most abundant fiber in plants, making up the cell walls and  providing structure. Soluble fibers are more easily accessible to  bacterial enzymes in the large intestine so they can be broken down  to a greater extent than insoluble fibers, 

### Getting an LLM for Local Generation

In [34]:
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory

In [35]:
torch.cuda.get_device_capability(0)
# meaning is 7.5

(7, 5)

In [36]:
gpu_memory_gb = round(gpu_memory_bytes / (2**30))

In [37]:
gpu_memory_gb

2

In [28]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available 

In [29]:
# Bonus: Setup Flash Attention 2 for faster inference, default to "sdpa" or "scaled dot product attention" if it's not available
# Flash Attention 2 requires NVIDIA GPU compute capability of 8.0 or above, see: https://developer.nvidia.com/cuda-gpus
# Requires !pip install flash-attn, see: https://github.com/Dao-AILab/flash-attention 
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

[INFO] Using attention implementation: sdpa


In [30]:
# Create a Quantization Config
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                        bnb_4bit_compute_dtype=torch.float16)

In [None]:
# Load the model quantized to 4 bits. device_map="cuda" already places the
# weights on the GPU while loading, so no explicit .to("cuda") follows:
# moving a bitsandbytes-quantized model with .to() is redundant and is
# rejected with a ValueError by several transformers/bitsandbytes versions.
llm_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,  # NOTE(review): executes code from the model repo; confirm it is still required
    quantization_config=quantization_config
)

llm_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="microsoft/phi-2")

In [50]:
from huggingface_hub import InferenceClient

import os

# Never commit API tokens to a notebook. The token is read from the HF_TOKEN
# environment variable instead; the token that was previously hard-coded here
# must be treated as leaked and revoked.
# NOTE(review): when HF_TOKEN is unset this passes None; huggingface_hub is
# expected to fall back to a locally cached login in that case (confirm).
client = InferenceClient(api_key=os.environ.get("HF_TOKEN"))

messages = [
    {"role": "user", "content": "What is the capital of France?"}
]

# Despite the name, this is a single complete (non-streamed) response.
stream = client.chat.completions.create(
    model="microsoft/Phi-3.5-mini-instruct",
    messages=messages,
    max_tokens=500,
    temperature=0.7,
)


In [78]:
def prompt_formatter(query: str, 
                     context_items: list[dict]) -> list[dict]:
    """
    Augments query with text-based context from context_items.

    Args:
        query: The user's question.
        context_items: Records whose "sentence_chunk" text is inserted into
            the prompt as context, one bullet per record.

    Returns:
        A chat-style message list containing a single user message whose
        "content" is the fully formatted prompt (not a bare string).
    """
    # Join context items into one bulleted list ("- chunk" per line)
    context = "- " + "\n- ".join(item["sentence_chunk"] for item in context_items)

    # Create a base prompt with examples to help the model
    # Note: this is very customizable, I've chosen to use 3 examples of the answer style we'd like.
    # We could also write this in a txt file and import it in if we wanted.
    base_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
\nExample 2:
Query: What are the causes of type 2 diabetes?
Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
\nExample 3:
Query: What is the importance of hydration for physical performance?
Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""

    # Fill the context items and query into the template
    prompt_text = base_prompt.format(context=context, query=query)

    # Wrap the prompt as a single user turn for an instruction-tuned chat model
    dialogue_template = [
        {"role": "user",
         "content": prompt_text}
    ]

    return dialogue_template

In [108]:
def ask(query, 
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True, 
        return_answer_only=True):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.

    Args:
        query: The user's question.
        temperature: Sampling temperature forwarded to the chat completion.
        max_new_tokens: Upper bound on generated tokens, forwarded as
            ``max_tokens`` to the chat completion.
        format_answer_text: Currently unused; kept so existing calls keep working.
        return_answer_only: Currently unused; kept so existing calls keep working.

    The generated answer is printed; nothing is returned.
    """
    import os

    # Get just the scores and indices of top related results
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=text_chunk_embeddings)

    # Build the context items as copies with the score attached, so the shared
    # pages_and_chunks records are not modified by every call.
    context_items = [
        {**pages_and_chunks[i], "score": score.cpu()}  # return score back to CPU
        for i, score in zip(indices, scores)
    ]

    # Format the prompt with context items
    prompt = prompt_formatter(query=query,
                              context_items=context_items)

    # Read the API token from the environment rather than hard-coding it; the
    # token that was previously embedded here must be treated as leaked and
    # revoked.
    # NOTE(review): when HF_TOKEN is unset this passes None; huggingface_hub is
    # expected to fall back to a locally cached login in that case (confirm).
    client = InferenceClient(api_key=os.environ.get("HF_TOKEN"))

    # Forward the caller's generation settings instead of fixed literals.
    response = client.chat.completions.create(
        model="microsoft/Phi-3.5-mini-instruct",
        messages=prompt,
        max_tokens=max_new_tokens,
        temperature=temperature,
    )

    print(response["choices"][0]["message"]["content"])

    

In [111]:
ask("What role does fibre play in digestion? Name five fibre containing foods.")

[INFO] time taken for scores: 0.00268
Fiber plays a crucial role in digestion by promoting healthy bowel movements, reducing inflammation, and enhancing the immune system. It aids in the digestion process through the following mechanisms:

1. Bacterial breakdown products: The bacterial breakdown of fiber in the large intestine releases short-chain fatty acids, which nourish colonic cells, inhibit colonic inflammation, and stimulate the immune system. This provides protection of the colon from harmful substances.

2. Increased stool bulk and softness: Fiber, especially insoluble fiber, increases stool bulk and softness, increasing transit time in the large intestine and facilitating feces elimination.

3. Reduced blood glucose rise: Fiber is digested much less in the gastrointestinal tract than other carbohydrate types (simple sugars, many starches). This results in a slower and lesser rise in blood glucose levels, which is beneficial for overall health.

Five fiber-containing foods fro