<a href="https://colab.research.google.com/github/Krupa049/RAG/blob/main/RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Creating a local RAG Pipeline

In [3]:
import os
import requests

In [4]:
path = "/content/Human-Nutrition-2020-Edition-1598491699.pdf"

In [5]:
pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.2-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.1 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.8/30.8 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.2 PyMuPDFb-1.24.1


In [6]:
import fitz  # PyMuPDF
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
  # Performs minor formatting on text.
  cleaned_text = text.replace("\n", " ").strip()

  # More text formatting functions potentially
  return cleaned_text

def open_and_read_pdf(path: str) -> list[dict]:
  doc = fitz.open(path)
  pages_and_texts = []
  for page_number, page in tqdm(enumerate(doc)):
    text = page.get_text()
    text = text_formatter(text=text)
    pages_and_texts.append({"page_number": page_number - 41,
                            "page_char_count": len(text),
                            "page_word_count": len(text.split(" ")),
                            "page_sentence_count_raw": len(text.split(". ")),
                            "page_token_count": len(text) / 4, # 1 token = ~4 characters
                            "text": text})

  return pages_and_texts

pages_and_texts = open_and_read_pdf(path=path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [7]:
import random

random.sample(pages_and_texts, k=3)

[{'page_number': 299,
  'page_char_count': 763,
  'page_word_count': 126,
  'page_sentence_count_raw': 5,
  'page_token_count': 190.75,
  'text': 'n/a  Note that removing the lipid elements from food also takes away  the food’s fat-soluble vitamin content. When products such as grain  and dairy are processed, these essential nutrients are lost.  Manufacturers replace these nutrients through a process called  enrichment.  Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.\xa0 These activities are  available in the web-based textbook and not available in the  downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  The Functions of Lipids in the Body  |  299'},
 {'page_number': 908,
  'page_char_count': 1965,
  'page_word_count': 333,
  'page_

In [8]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [9]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15
std,348.86,560.44,95.75,6.19,140.11
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.75,134.0,4.0,190.69
50%,562.5,1232.5,215.0,10.0,308.12
75%,864.25,1605.25,271.25,14.0,401.31
max,1166.0,2308.0,429.0,32.0,577.0


In [10]:
# Further text processing (splitting pages into sentences)

In [11]:
pip install spacy



In [12]:
from spacy.lang.en import English

nlp = English()

# Add a sentencizer pipeline
nlp.add_pipe("sentencizer")

# Example
doc = nlp("This is a sentence. Another sentence. Another one.")
assert len(list(doc.sents)) == 3

# Print out the sentences split
list(doc.sents)

[This is a sentence., Another sentence., Another one.]

In [13]:
pages_and_texts[600]

{'page_number': 559,
 'page_char_count': 864,
 'page_word_count': 137,
 'page_sentence_count_raw': 8,
 'page_token_count': 216.0,
 'text': 'Image by  Allison  Calabrese /  CC BY 4.0  Korsakoff syndrome can cause similar symptoms as beriberi such  as confusion, loss of coordination, vision changes, hallucinations,  and may progress to coma and death. This condition is specific  to alcoholics as diets high in alcohol can cause thiamin deficiency.  Other individuals at risk include individuals who also consume diets  typically low in micronutrients such as those with eating disorders,  elderly, and individuals who have gone through gastric bypass  surgery.5  Figure 9.10 The Role of Thiamin  Figure 9.11 Beriberi, Thiamin Deficiency  5.\xa0Fact Sheets for Health Professionals: Thiamin. National  Institute of Health, Office of Dietary Supplements.  \xa0https:/ /ods.od.nih.gov/factsheets/Thiamin- HealthProfessional/. Updated Feburary 11, 2016.  Accessed October 22, 2017.  Water-Soluble Vitami

In [14]:
for item in tqdm(pages_and_texts):
  item["sentences"] = list(nlp(item["text"]).sents)

  # Make sure all sentences are strings (the default type is a spaCy datatype)
  item["sentences"] = [str(sentence) for sentence in item ["sentences"]]

  # Count the sentences
  item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [15]:
random.sample(pages_and_texts, k=1)

[{'page_number': 486,
  'page_char_count': 1283,
  'page_word_count': 232,
  'page_sentence_count_raw': 11,
  'page_token_count': 320.75,
  'text': 'Hypothalam us by  Methoxyroxy ~commonswi ki / Public  Domain  contains distinct centers of neural circuits that regulate hunger and  satiety (Figure 8.7).  Figure 8.7 Sagittal View of the Brain  This is a scan of a brain. The hypothalamus contains distinct centers  of neural circuits that regulate hunger and satiety.  Hunger pangs are real and so is a “growling” stomach. When the  stomach is empty it contracts, producing the characteristic pang  and “growl.” The stomach’s mechanical movements relay neural  signals to the hypothalamus, which relays other neural signals to  parts of the brain. This results in the conscious feeling of the need  to eat. Alternatively, after you eat a meal the stomach stretches and  sends a neural signal to the brain stimulating the sensation of satiety  and relaying the message to stop eating. The stomach also

In [16]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15,10.32
std,348.86,560.44,95.75,6.19,140.11,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.75,134.0,4.0,190.69,5.0
50%,562.5,1232.5,215.0,10.0,308.12,10.0
75%,864.25,1605.25,271.25,14.0,401.31,15.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0


In [17]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10

# Create a function to split lists of texts recursively into chunk size
# e.g. [20] -> [10, 10] or [25] -> [10, 10, 5]

def split_list(input_list: list[str],
               slice_size: int=num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
split_list(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [18]:
# loop through pages and texts and spliting sentences into small chunks

for item in tqdm(pages_and_texts):
  item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                       slice_size=num_sentence_chunk_size)
  item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [19]:
random.sample(pages_and_texts, k=1)

[{'page_number': 1126,
  'page_char_count': 1412,
  'page_word_count': 210,
  'page_sentence_count_raw': 18,
  'page_token_count': 353.0,
  'text': 'lifetime.4 Treatment often involves antidepressant medication as  well as nutritional and psychiatric counseling.  Orthorexia Nervosa  Orthorexia nervosa was coined in 1997 by physician Steven  Bratman.5 The term uses “ortho,” in its meaning as straight, correct  and true and refers to a fixation on eating proper food.6 Fixation  on ‘healthy eating’ by those with orthorexia nervosa often results in  behaviors that end up damaging one’s well-being such as extreme  weight loss or a refusal to dine out with friends. Orthorexia nervosa  like anorexia nervosa involves restriction of the amount and variety  of foods eaten, however those with orthorexia nervosa do not have  4.\xa0Eating Disorders. The National Institute of Mental  Health. https:/ /www.nimh.nih.gov/health/statistics/ eating-disorders.shtml#part_155061. Accessed April 15,  2018.  5

In [20]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15,10.32,1.53
std,348.86,560.44,95.75,6.19,140.11,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.75,134.0,4.0,190.69,5.0,1.0
50%,562.5,1232.5,215.0,10.0,308.12,10.0,1.0
75%,864.25,1605.25,271.25,14.0,401.31,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


In [21]:
import re

# Splitting each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
  for sentence_chunk in item["sentence_chunks"]:
    chunk_dict = {}
    chunk_dict["page_number"] = item["page_number"]

    # Joining the sentences together into a paragraph structure
    joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
    joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)  #

    chunk_dict["sentence_chunk"] = joined_sentence_chunk

    # Chunk stats
    chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
    chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
    chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4

    pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [22]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 89,
  'sentence_chunk': 'As with all other cells, the cells in the blood are surrounded by a plasma membrane, which is composed of mainly lipids. Blood health is also acutely sensitive to deficiencies in some vitamins and minerals more than others. What Can Blood Tests Tell You About Your Health? Figure 2.13 Blood Tests The Cardiovascular System | 89',
  'chunk_char_count': 327,
  'chunk_word_count': 56,
  'chunk_token_count': 81.75}]

In [23]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.83,112.72,183.71
std,347.79,447.43,71.07,111.86
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,45.0,78.75
50%,586.0,746.0,114.0,186.5
75%,890.0,1118.5,173.0,279.62
max,1166.0,1831.0,297.0,457.75


In [24]:
df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,-41,Human Nutrition: 2020 Edition,29,4,7.25
1,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0
2,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5
3,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.5
4,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25


In [25]:
# Show random chunks with under 30 tokens in length

min_token_length = 30

for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
  print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 9.75 | Text: Table 3.5 Salt Substitutes Sodium | 185
Chunk token count: 25.25 | Text: The Polynesian Family System in Ka-‘u. Rutland, Vermont: Charles E. Tuttle Company 780 | Introduction
Chunk token count: 21.0 | Text: http:/ /pressbooks.oer.hawaii.edu/ humannutrition2/?p=84   The Digestive System | 81
Chunk token count: 11.0 | Text: 978 | Food Supplements and Food Replacements
Chunk token count: 26.25 | Text: http:/ /www.ncbi.nlm.nih.gov/pubmed/20182023. Accessed September 22, 2017. 220 | Popular Beverage Choices


In [26]:
# Filter our DataFrame for rows with under 30 tokens

pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

In [27]:
random.sample(pages_and_chunks_over_min_token_len, k=1)

[{'page_number': 1018,
  'sentence_chunk': 'transforming raw ingredients into packaged food, from fresh-baked goods to frozen dinners. Although there are numerous benefits to both, preservation and processing also pose some concerns, in terms of both nutrition and sustainability. Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities.\xa0 These activities are available in the web-based textbook and not available in the downloadable versions (EPUB, Digital PDF, Print_PDF, or Open Document). Learning activities may be used across various mobile devices, however, for the best user experience it is strongly recommended that users complete these activities using a desktop or laptop computer and in Google Chrome. \xa0 An interactive or media element has been excluded from this version of the text. You can view it online here: http:/ /pressbooks.oer.hawaii.edu/ humannutrition2/?p=532 \xa0

In [28]:
# Embedding the text chunks

In [29]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-no

In [30]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2")

# Create a list of sentences
sentences = ["Here is a sentence.",
             "Another sentence.",
             "I am confused!"]

# Sentences are encoded/embedded by calling model.encode()
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

# Embeddings Lists
for sentence, embedding in embeddings_dict.items():
  print(f"Sentence: {sentence}")
  print(f"Embedding: {embedding}")
  print("")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Sentence: Here is a sentence.
Embedding: [ 3.89740360e-03 -1.77534707e-02 -3.02865244e-02 -2.57069664e-03
 -3.63935642e-02 -1.27866468e-03 -1.70469601e-02 -4.90054674e-03
 -1.28761902e-02  1.47956861e-02  4.12489735e-02  3.83685604e-02
  2.05121450e-02 -6.56485111e-02 -1.27984695e-02 -3.18547636e-02
  7.93199614e-02  1.05651570e-02 -9.44026373e-03  3.40160951e-02
  1.03330566e-03  1.10812075e-02 -3.53231677e-03  6.44044857e-03
 -1.53879458e-02 -2.03665495e-02  2.57618958e-03 -2.13848930e-02
  8.87307897e-03 -3.57284658e-02  1.76399816e-02 -1.02649964e-02
 -2.92293448e-02 -7.12109432e-02  1.94601967e-06  7.03552831e-03
  4.21524188e-03 -4.73057628e-02 -4.33856919e-02  2.79300213e-02
 -3.06536146e-02  5.37747219e-02 -1.33046955e-02  3.73299718e-02
  6.14583259e-03  6.49188682e-02  7.32067004e-02  7.05367774e-02
 -6.54531941e-02  2.88920198e-02 -6.61313441e-03  3.77911259e-03
 -2.98170559e-02 -5.20305969e-02  5.81153259e-02  1.23470286e-02
  1.88876092e-02 -1.56858657e-02 -1.05365841e-02 

In [31]:
embeddings[0].shape

(768,)

In [32]:
embedding = embedding_model.encode("I am looking for a sentence!")
embedding

array([ 1.67078432e-02,  1.21004879e-02, -3.91676649e-02, -3.67974304e-02,
       -7.24106953e-02, -9.19673126e-03, -1.03393635e-02,  1.25510367e-02,
       -1.87577270e-02,  2.82216091e-02,  6.92405552e-02,  1.23050306e-02,
        3.08259875e-02, -4.12915461e-02,  1.42212380e-02, -3.73743773e-02,
        5.31831235e-02,  7.73361744e-03, -2.00921251e-03,  4.07945290e-02,
        1.04231006e-02,  3.63206416e-02, -7.59815099e-03,  2.85440572e-02,
       -3.56274024e-02, -2.25593504e-02, -2.50923466e-02, -6.65900158e-03,
        5.47882430e-02,  6.44997647e-03,  6.64729103e-02, -1.33741554e-02,
       -3.25702429e-02, -2.21047513e-02,  1.65117001e-06,  1.29004652e-02,
       -5.41268056e-03, -5.21515571e-02, -6.64864779e-02,  1.30543709e-02,
        1.89365186e-02, -5.95540134e-03,  7.60598062e-03,  3.83773856e-02,
       -9.26832436e-04,  6.71942011e-02,  4.53703403e-02,  1.96024310e-02,
       -3.66201811e-02,  2.39488911e-02,  1.66002847e-02,  5.44944871e-03,
        3.06720263e-03, -

In [33]:
%%time

embedding_model.to("cpu")

# Embed each chunk one by one
for item in tqdm(pages_and_chunks_over_min_token_len):
  item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/1680 [00:00<?, ?it/s]

CPU times: user 19min 5s, sys: 19.1 s, total: 19min 25s
Wall time: 2min 27s


In [34]:
%%time

text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]
text_chunks[419]

CPU times: user 494 µs, sys: 0 ns, total: 494 µs
Wall time: 506 µs


'often. • Calm your “sweet tooth” by eating fruits, such as berries or an apple. • Replace sugary soft drinks with seltzer water, tea, or a small amount of 100 percent fruit juice added to water or soda water. The Food Industry: Functional Attributes of Carbohydrates and the Use of Sugar Substitutes In the food industry, both fast-releasing and slow-releasing carbohydrates are utilized to give foods a wide spectrum of functional attributes, including increased sweetness, viscosity, bulk, coating ability, solubility, consistency, texture, body, and browning capacity. The differences in chemical structure between the different carbohydrates confer their varied functional uses in foods. Starches, gums, and pectins are used as thickening agents in making jam, cakes, cookies, noodles, canned products, imitation cheeses, and a variety of other foods. Molecular gastronomists use slow- releasing carbohydrates, such as alginate, to give shape and texture to their fascinating food creations. Add

In [35]:
len(text_chunks)

1680

In [36]:
%%time

# Embed all texts in batches
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32,
                                               convert_to_tensor=True)
text_chunk_embeddings

CPU times: user 15min 54s, sys: 4min 42s, total: 20min 37s
Wall time: 2min 50s


tensor([[ 0.0674,  0.0902, -0.0051,  ..., -0.0221, -0.0232,  0.0126],
        [ 0.0552,  0.0592, -0.0166,  ..., -0.0120, -0.0103,  0.0227],
        [ 0.0280,  0.0340, -0.0206,  ..., -0.0054,  0.0213,  0.0313],
        ...,
        [ 0.0771,  0.0098, -0.0122,  ..., -0.0409, -0.0752, -0.0241],
        [ 0.1030, -0.0165,  0.0083,  ..., -0.0574, -0.0283, -0.0295],
        [ 0.0864, -0.0125, -0.0113,  ..., -0.0522, -0.0337, -0.0299]])

In [37]:
pages_and_chunks_over_min_token_len[419]

{'page_number': 277,
 'sentence_chunk': 'often. • Calm your “sweet tooth” by eating fruits, such as berries or an apple. • Replace sugary soft drinks with seltzer water, tea, or a small amount of 100 percent fruit juice added to water or soda water. The Food Industry: Functional Attributes of Carbohydrates and the Use of Sugar Substitutes In the food industry, both fast-releasing and slow-releasing carbohydrates are utilized to give foods a wide spectrum of functional attributes, including increased sweetness, viscosity, bulk, coating ability, solubility, consistency, texture, body, and browning capacity. The differences in chemical structure between the different carbohydrates confer their varied functional uses in foods. Starches, gums, and pectins are used as thickening agents in making jam, cakes, cookies, noodles, canned products, imitation cheeses, and a variety of other foods. Molecular gastronomists use slow- releasing carbohydrates, such as alginate, to give shape and texture 

In [38]:
# Save embeddings to file

text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [39]:
# Import saved file and view

text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,[ 6.74242228e-02 9.02280435e-02 -5.09549212e-...
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,[ 5.52156493e-02 5.92139289e-02 -1.66167356e-...
2,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.5,[ 2.79801767e-02 3.39813977e-02 -2.06426550e-...
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25,[ 6.82566762e-02 3.81275006e-02 -8.46854411e-...
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5,[ 3.30264196e-02 -8.49767122e-03 9.57159698e-...


In [40]:
# RAG Search and Answer

In [49]:
import random

import torch
import numpy as np
import pandas as pd

# Import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert embedding column back to np.array (it got converted to string when it is saved to csv)
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert our embeddings into a torch.tensor
embeddings = np.stack(text_chunks_and_embedding_df["embedding"].tolist(), axis=0)

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

text_chunks_and_embedding_df

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.00,"[0.0674242228, 0.0902280435, -0.00509549212, -..."
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.50,"[0.0552156493, 0.0592139289, -0.0166167356, -0..."
2,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.50,"[0.0279801767, 0.0339813977, -0.020642655, 0.0..."
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25,"[0.0682566762, 0.0381275006, -0.00846854411, -..."
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.50,"[0.0330264196, -0.00849767122, 0.00957159698, ..."
...,...,...,...,...,...,...
1675,1164,Flashcard Images Note: Most images in the flas...,1305,176,326.25,"[0.0185623113, -0.0164278485, -0.0127045661, -..."
1676,1164,Hazard Analysis Critical Control Points reused...,375,51,93.75,"[0.0334721729, -0.0570439622, 0.0151489042, -0..."
1677,1165,ShareAlike 11. Organs reused “Pancreas Organ A...,1286,173,321.50,"[0.077051498, 0.00978555437, -0.0121817235, 0...."
1678,1165,Sucrose reused “Figure 03 02 05” by OpenStax B...,410,59,102.50,"[0.103045076, -0.0164701883, 0.00826844852, 0...."


In [53]:
text_chunks_and_embedding_df["embedding"]

0       [0.0674242228, 0.0902280435, -0.00509549212, -...
1       [0.0552156493, 0.0592139289, -0.0166167356, -0...
2       [0.0279801767, 0.0339813977, -0.020642655, 0.0...
3       [0.0682566762, 0.0381275006, -0.00846854411, -...
4       [0.0330264196, -0.00849767122, 0.00957159698, ...
                              ...                        
1675    [0.0185623113, -0.0164278485, -0.0127045661, -...
1676    [0.0334721729, -0.0570439622, 0.0151489042, -0...
1677    [0.077051498, 0.00978555437, -0.0121817235, 0....
1678    [0.103045076, -0.0164701883, 0.00826844852, 0....
1679    [0.0863774195, -0.0125358328, -0.0112746591, 0...
Name: embedding, Length: 1680, dtype: object

In [54]:
embeddings.shape

(1680, 768)

In [55]:
# Create model

from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2")

In [56]:
# Embedding model is done