In [1]:
import torch 

## Create and run a local RAG pipeline from scratch

The goal is to take information and pass it to an LLM. 

* Retrival - Find relevant information given to a query. 
* Augmented - We want to take the relevant information and augment our input(prompt) to an LLM
* Generation - Take the first two steps and pass them to an LLM 



### Why RAG?

1. Prevent hallucination 
2. Work with custom Data



1. Open a PDF
2. Format the text of the PDF TExtbook ready
3. Embed all of the chunks of text in the textbook
4. Build a Retrieval system that uses vector serach
5. Create a prompt
6. Generate an answer

### 1. Document / Text procesing and embedding creation 


In [2]:
import os 
import requests

pdf_path = "human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
    print(f"[INFO] File doesn't exist, downloading...")

    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    filename = pdf_path

    response = requests.get(url)

    if response.status_code == 200:
        with open(filename, 'wb') as file:
            file.write(response.content)
        print("[INFO] the file has been download and saved")
    else:
        print(f"[INFO] failed to download the file. Status Code: {response.status_code}")
else:
    print(f"File {pdf_path} exists")

File human-nutrition-text.pdf exists


In [3]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text:str) -> str:
    cleaned_text = text.replace("\n" , " "). strip()

    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text= text_formatter(text=text)
        pages_and_texts.append({"Page_number": page_number - 41, "page_char_count": len(text), 
                                "page_word_count": len(text.split(" ")),
                                 "page_sentence_count_raw": len(text.split(". ")),
                                  "page_token_count": len(text) /4,
                                   "text": text })
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'Page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'Page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [4]:
import pandas as pd 
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,Page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [5]:
from spacy.lang.en import English

nlp = English()

#Add a sentencizer

nlp.add_pipe("sentencizer")




<spacy.pipeline.sentencizer.Sentencizer at 0x1f7306886d0>

In [6]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item['text']).sents)

    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [7]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,Page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32
std,348.86,560.38,95.83,6.55,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0


## Chunking the sentences together
THe concept of splitting larger pieces of text into smaller ones is often referred to as text splitting or chunking. 

There is no 100% correct way to do this. 

In [8]:
num_sentence_chunk_size = 10

def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

In [9]:
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"], slice_size=num_sentence_chunk_size)

    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [10]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,Page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32,1.53
std,348.86,560.38,95.83,6.55,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0,1.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0,1.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0,2.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0,3.0


In [11]:
import re

pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["Page_number"]

        joined_sentence_chunk = "".join(sentence_chunk).replace(" ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])',r'. \1', joined_sentence_chunk)

        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) /4 

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [12]:
df = pd.DataFrame(pages_and_chunks)

In [13]:
min_token_length = 30

for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count : {row[1]["chunk_token_count"]}| Text: {row[1]["sentence_chunk"]}')

Chunk token count : 12.5| Text: Polan EU, Taylor DR. (2003),  782  |  Introduction
Chunk token count : 24.75| Text: Unfortunately, a  mother’s intention alone may not be enough to make this practice  Infancy  |  821
Chunk token count : 23.5| Text: Building a protein involves three steps: transcription, translation,  Defining Protein  |  369
Chunk token count : 21.5| Text: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=283    Alcohol Metabolism  |  441
Chunk token count : 28.75| Text: Fluid  balance refers to maintaining the distribution of water in the body. 386  |  Protein’s Functions in the Body


In [14]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"]> min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020  Edition  UNIVERSITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCHENAL, SKYLAR HARA,  NOEMI ARCEO CAACBAY, WILLIAM  MEINKE-LAU, YA-YUN YANG, MARIE  KAINOA FIALKOWSKI REVILLA,  JENNIFER DRAPER, GEMADY  LANGFELDER, CHERYL GIBBY, CHYNA  NICOLE CHUN, AND ALLISON  CALABRESE',
  'chunk_char_count': 320,
  'chunk_word_count': 54,
  'chunk_token_count': 80.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program is licensed under a Creative Commons Attribution 4.0  International License, except where otherwise noted.',
  'chunk_char_count': 212,
  'chunk_word_count': 32,
  'chunk_token_count': 53.0},
 {'page_number': -37,
  'sentence_chunk': 'Contents  Preface  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  xxv  About the Contributors  University of Ha

### Embedding the chunks

In [15]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cuda")

sentences = ["The Sentence Transformer Library provides an easy way to create embeddings", "Sentences can be embeedded on by one or in a lost", "I like horses!"] 


embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

for sentence, embedding in embeddings_dict.items():
    print(f"Sentence: {sentence}")
    print(f"Embedding: {embedding}")
    print("")



Sentence: The Sentence Transformer Library provides an easy way to create embeddings
Embedding: [-3.17512564e-02  3.37267891e-02 -2.52437778e-02  5.22287637e-02
 -2.35248990e-02 -6.19114190e-03  1.35026118e-02 -6.25500977e-02
  7.50828627e-03 -2.29684431e-02  2.98147015e-02  4.57555018e-02
 -3.26700285e-02  1.39847510e-02  4.18013781e-02 -5.92969283e-02
  4.26309742e-02  5.04662190e-03 -2.44552735e-02  3.98593675e-03
  3.55897695e-02  2.78742835e-02  1.84098538e-02  3.67699824e-02
 -2.29960773e-02 -3.01796980e-02  5.99522900e-04 -3.64504121e-02
  5.69104627e-02 -7.49940984e-03 -3.70004140e-02 -3.04358406e-03
  4.64355052e-02  2.36151065e-03  9.06849777e-07  7.00037042e-03
 -3.92289422e-02 -5.95696177e-03  1.38653032e-02  1.87108153e-03
  5.34202456e-02 -6.18613772e-02  2.19613612e-02  4.86051142e-02
 -4.25697677e-02 -1.69858914e-02  5.04178368e-02  1.54734040e-02
  8.12859386e-02  5.07106148e-02 -2.27497090e-02 -4.35721092e-02
 -2.18388671e-03 -2.14091521e-02 -2.01758463e-02  3.0683273

In [16]:
%%time

embedding_model.to('cuda')

text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]


for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/1685 [00:00<?, ?it/s]

CPU times: total: 4.77 s
Wall time: 42.6 s


In [17]:
%%time 
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]
text_chunk_embeddings = embedding_model.encode(text_chunks, batch_size =32, convert_to_tensor=True )

CPU times: total: 3.38 s
Wall time: 14.7 s


In [18]:
## Save Embeddings 
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunk_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [19]:
text_chunks_and_embeddings_df_load = pd.read_csv("text_chunk_and_embeddings_df.csv")
