# document preprocessing

In [1]:
import fitz
from tqdm.auto import tqdm
import random
import pandas as pd
from spacy.lang.en import English
import re
import random

import torch
import numpy as np
import pandas as pd
from sentence_transformers import util, SentenceTransformer

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available
from transformers import BitsAndBytesConfig
from time import perf_counter as timer


helpers and text formatter

In [2]:
def text_formatter(text: str) -> str:
    """Flatten a block of text onto one line.

    Every newline character is replaced by a single space and the result is
    stripped of leading/trailing whitespace. No other characters are changed.
    """
    # Splitting on "\n" and re-joining with " " swaps each newline for a space.
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

def open_read_pdf(pdf_path: str, page_offset: int = 41) -> list[dict]:
    """Read a PDF and return one record of text and size statistics per page.

    Args:
        pdf_path: Path of the PDF file to open.
        page_offset: Value subtracted from the zero-based page index to produce
            ``page_number``. Defaults to 41, the value that used to be
            hard-coded here (presumably the number of front-matter pages in
            the PDF this notebook targets -- verify before reusing with
            another document).

    Returns:
        A list with one dict per page, in document order, with the keys
        ``page_number``, ``page_char_count``, ``page_word_count``,
        ``page_sent_count_raw``, ``page_token_count`` and ``text``.
    """
    pages_and_text = []

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        # Passing the page count gives tqdm a real progress bar rather than a bare counter.
        for i, page in tqdm(enumerate(doc), total=doc.page_count):
            text = text_formatter(text=page.get_text())
            pages_and_text.append(
                {
                    "page_number": i - page_offset,
                    "page_char_count": len(text),
                    "page_word_count": len(text.split(" ")),
                    # Raw estimate: counts the pieces produced by splitting on ". ".
                    "page_sent_count_raw": len(text.split(". ")),
                    # Rough estimate: one token per four characters.
                    "page_token_count": len(text) / 4,
                    "text": text,
                }
            )

    return pages_and_text


read file

In [3]:
# Source PDF, resolved relative to the notebook's working directory.
pdf_path = "human-nutrition-text.pdf"

# One record per page: the formatted text plus its size statistics.
pages = open_read_pdf(pdf_path=pdf_path)

0it [00:00, ?it/s]

break into sentences 

In [4]:
# Blank English pipeline with only the sentence splitter added.
nlp = English()
nlp.add_pipe("sentencizer")

for item in tqdm(pages):
    # Store plain strings rather than spaCy span objects.
    page_sentences = [str(sentence) for sentence in nlp(item["text"]).sents]

    item["sentences"] = page_sentences
    item["page_sentence_count"] = len(page_sentences)

  0%|          | 0/1208 [00:00<?, ?it/s]

chunking or splitting into groups of 10 sentences

In [5]:
# Number of sentences grouped into one chunk.
chunk_size = 10


def split_list(input_list: list, slice_size: int) -> list[list[str]]:
    """Cut ``input_list`` into consecutive pieces of at most ``slice_size`` items.

    Order is preserved and only the final piece may be shorter, e.g. 17
    sentences with ``slice_size=10`` become one piece of 10 and one of 7.
    An empty input yields an empty list.
    """
    pieces = []
    for start in range(0, len(input_list), slice_size):
        stop = start + slice_size
        pieces.append(input_list[start:stop])
    return pieces

# Group each page's sentences into chunks of at most `chunk_size` sentences.
for item in tqdm(pages):
    page_chunks = split_list(item["sentences"], slice_size=chunk_size)

    item["sentence_chunks"] = page_chunks
    item["num_chunks"] = len(page_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

split each chunk into its own item

In [6]:
# Flatten the per-page chunk lists into one record per chunk.
chunks = []

for item in tqdm(pages):
    for sentence_chunk in item["sentence_chunks"]:
        # The sentence strings were produced with str(span), which carries no
        # trailing whitespace, so joining with "" fuses neighbouring sentences
        # ("...end?Next..."). Join with a space, then collapse runs of spaces.
        joined_sentence_chunk = " ".join(sentence_chunk)
        joined_sentence_chunk = re.sub(r" {2,}", " ", joined_sentence_chunk).strip()
        # Kept for text that arrives without a space after a full stop:
        # ".A" -> ". A" for any full-stop/capital letter combo.
        joined_sentence_chunk = re.sub(r"\.([A-Z])", r". \1", joined_sentence_chunk)

        chunks.append(
            {
                "page_number": item["page_number"],
                "sentence_chunk": joined_sentence_chunk,
                "chunk_char_count": len(joined_sentence_chunk),
                "chunk_word_count": len(joined_sentence_chunk.split(" ")),
                # 1 token = ~4 characters
                "chunk_token_count": len(joined_sentence_chunk) / 4,
            }
        )

  0%|          | 0/1208 [00:00<?, ?it/s]

In [7]:
# Tabular view of the chunk records: one row per chunk, one column per key.
df = pd.DataFrame(data=chunks)

In [8]:
# Chunks at or below this estimated token count are dropped.
min_token_len = 30

# Keep only the rows whose estimated token count is strictly above the threshold,
# then convert back to a list of per-chunk dicts.
is_over_min_token_len = df["chunk_token_count"] > min_token_len
pages_and_chunks_over_min_token_len = df.loc[is_over_min_token_len].to_dict(
    orient="records"
)

# embeddings

In [2]:
from sentence_transformers import SentenceTransformer
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Sentence-embedding model, loaded onto the device selected above.
embedding_model = SentenceTransformer(
    model_name_or_path="all-mpnet-base-v2",
    device=device,
)

In [4]:
# Plain list of chunk texts, in the same order as the filtered chunk records.
# Needs the preprocessing cells to have been run in the current kernel session,
# otherwise `pages_and_chunks_over_min_token_len` is undefined.
text_chunks = [
    chunk_record["sentence_chunk"]
    for chunk_record in pages_and_chunks_over_min_token_len
]

NameError: name 'pages_and_chunks_over_min_token_len' is not defined

In [12]:
%%time
# Embed all chunk texts in batches.
# batch_size=32: other batch sizes trade speed against memory; 32 worked well here.
# convert_to_tensor=True: optional, returns the embeddings as a tensor.
text_chunk_embeddings = embedding_model.encode(
    text_chunks, batch_size=32, convert_to_tensor=True
)

CPU times: user 18.1 s, sys: 161 ms, total: 18.3 s
Wall time: 9.76 s


In [26]:
# Save the chunk records together with their embeddings to file.
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)

# `text_chunks` was built from the same records in the same order, so row i of
# the embedding matrix belongs to row i of the frame.
embedding_rows = text_chunk_embeddings.detach().cpu().numpy()
assert len(embedding_rows) == len(text_chunks_and_embeddings_df), (
    "embedding count does not match chunk count"
)

# Without this column the CSV holds no embeddings at all, and the load cell
# (which reads the "embedding" column) fails. Each vector is written as
# "[v0 v1 ... vN]", the form that np.fromstring(x.strip("[]"), sep=" ") parses.
text_chunks_and_embeddings_df["embedding"] = [
    "[" + " ".join(str(value) for value in row.tolist()) + "]"
    for row in embedding_rows
]

embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

# rag

In [5]:


device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the chunk texts and their embeddings back from disk.
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")


def _parse_embedding(serialized):
    """Turn one CSV cell such as "[0.1 0.2 ...]" back into a NumPy vector."""
    return np.fromstring(serialized.strip("[]"), sep=" ")


# The CSV round trip turned every embedding into a string; restore the arrays.
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df[
    "embedding"
].apply(_parse_embedding)

# List-of-dicts view of the same rows, indexed in step with the embedding matrix.
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Stack the vectors into one matrix, then move it to the device as float32
# (the parsed NumPy arrays are float64; torch tensors default to float32).
embedding_matrix = np.array(text_chunks_and_embedding_df["embedding"].tolist())
embeddings = torch.tensor(embedding_matrix, dtype=torch.float32)
embeddings = embeddings.to(device)
embeddings.shape

torch.Size([1680, 768])

In [6]:
def retrieve_relevant_resources(
    query: str,
    embeddings: torch.Tensor,
    model: SentenceTransformer = embedding_model,
    n_resources_to_return: int = 5,
    print_time: bool = True,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Embeds a query with model and returns top k scores and indices from embeddings.

    Args:
        query: Text to search for.
        embeddings: Matrix with one embedding per row; scored against the query
            by dot product.
        model: Model used to embed the query. The default is bound to the
            module-level ``embedding_model`` when this function is defined.
        n_resources_to_return: Maximum number of results. Capped at the number
            of rows in ``embeddings`` so a small corpus does not make
            ``torch.topk`` raise.
        print_time: If True, print how long the scoring step took.

    Returns:
        ``(scores, indices)``: the highest dot-product scores in descending
        order and the row indices into ``embeddings`` they belong to.
    """

    # Embed the query
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Get dot product scores on embeddings (only this step is timed)
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(
            f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds."
        )

    # topk raises if k is larger than the number of candidates, so cap it.
    k = min(n_resources_to_return, dot_scores.shape[0])
    scores, indices = torch.topk(input=dot_scores, k=k)

    return scores, indices


In [7]:
import torch

# Reads the total memory of CUDA device 0; this call requires a CUDA device.
gpu_properties = torch.cuda.get_device_properties(0)
gpu_memory_bytes = gpu_properties.total_memory

# Bytes -> GiB (1024**3 bytes each), rounded to the nearest whole number.
gpu_memory_gb = round(gpu_memory_bytes / (1024**3))
print(f"Available GPU memory: {gpu_memory_gb} GB")

Available GPU memory: 8 GB


In [8]:
# Hugging Face model identifier of the LLM used for answer generation.
model_id: str = "google/gemma-2b-it"

# When True, the quantization config is passed to the model loader;
# when False, the loader receives None instead.
use_quantization_config: bool = True

In [9]:
# Display the (major, minor) compute capability of CUDA device 0.
torch.cuda.get_device_capability(device=0)

(8, 6)

In [10]:
# 4-bit weights with float16 compute. load_in_4bit must be True for this config
# to request quantization explicitly; it is only passed to the model loader
# when `use_quantization_config` is True.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
)

# Flash Attention 2 needs the flash-attn package and a GPU with compute
# capability 8.0 or higher; otherwise fall back to PyTorch's scaled dot-product
# attention so the model still loads.
if is_flash_attn_2_available() and torch.cuda.get_device_capability(0)[0] >= 8:
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa"

# `model_id` was already set in an earlier cell.
print(f"[INFO] Using model_id: {model_id}")

[INFO] Using model_id: google/gemma-2b-it


In [11]:
# Tokenizer matching the chosen LLM.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_id,
)

In [12]:
# Gather the loader options first so each one can carry its own note.
llm_load_kwargs = {
    # Datatype to use, we want float16.
    "torch_dtype": torch.float16,
    # The quantization config is only applied when the flag is switched on.
    "quantization_config": quantization_config if use_quantization_config else None,
    # Use full memory.
    "low_cpu_mem_usage": False,
    "attn_implementation": attn_implementation,
}

llm_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_id,
    **llm_load_kwargs,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
# Display the loaded model's module structure as the cell output.
llm_model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaFlashAttention2(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
   

# augment input

In [14]:
# Questions generated with GPT-4.
gpt4_questions = [
    "What are the macronutrients, and what roles do they play in the human body?",
    "How do vitamins and minerals differ in their roles and importance for health?",
    "Describe the process of digestion and absorption of nutrients in the human body.",
    "What role does fibre play in digestion? Name five fibre containing foods.",
    "Explain the concept of energy balance and its importance in weight management.",
]

# Questions written by hand.
manual_questions = [
    "How often should infants be breastfed?",
    "What are symptoms of pellagra?",
    "How does saliva help with digestion?",
    "What is the RDI for protein per day?",
    "water soluble vitamins",
]

# Combined pool: generated questions first, then the manual ones.
query_list = [*gpt4_questions, *manual_questions]

In [15]:
def prompt_formatter(query: str, context_items: list[dict]) -> str:
    """
    Build the chat-formatted prompt for a query and its retrieved context.

    The "sentence_chunk" text of every context item becomes one "- " bullet,
    the bullets and the query are substituted into a fixed few-shot template,
    and the result is wrapped as a single user turn by the module-level
    tokenizer's chat template (returned as text, with the generation prompt
    appended).
    """
    # One "- " bullet per retrieved chunk, separated by newlines.
    chunk_texts = [item["sentence_chunk"] for item in context_items]
    context = "- " + "\n- ".join(chunk_texts)

    # Few-shot template: three worked examples show the answer style we want.
    # This is very customizable and could also live in a separate txt file.
    prompt_template = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
\nExample 2:
Query: What are the causes of type 2 diabetes?
Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
\nExample 3:
Query: What is the importance of hydration for physical performance?
Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""

    # Fill the two placeholders in the template.
    filled_prompt = prompt_template.format(context=context, query=query)

    # The instruction-tuned model expects a conversation; send everything as
    # one user message and let the tokenizer add the turn markers.
    conversation = [{"role": "user", "content": filled_prompt}]
    return tokenizer.apply_chat_template(
        conversation=conversation, tokenize=False, add_generation_prompt=True
    )

In [16]:
def ask(
    query,
    temperature=0.7,
    max_new_tokens=512,
    format_answer_text=True,
    return_answer_only=True,
):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.

    Args:
        query: The question to answer.
        temperature: Sampling temperature passed to ``llm_model.generate``.
        max_new_tokens: Upper bound on the number of generated tokens.
        format_answer_text: If True, strip the prompt, the ``<bos>``/``<eos>``
            markers and a known boilerplate opening line from the decoded text.
        return_answer_only: If True, return only the answer text; otherwise
            return ``(answer_text, context_items)``.

    Returns:
        The answer string, or a tuple of the answer string and the list of
        context records (each carrying an added ``"score"`` entry).
    """

    # Get just the scores and indices of top related results
    scores, indices = retrieve_relevant_resources(query=query, embeddings=embeddings)

    # Copy every selected record so that attaching a per-query score does not
    # write into the shared `pages_and_chunks` list.
    context_items = [dict(pages_and_chunks[i]) for i in indices.tolist()]

    # Add score to context item
    for i, item in enumerate(context_items):
        item["score"] = scores[i].cpu()  # return score back to CPU

    # Format the prompt with context items
    prompt = prompt_formatter(query=query, context_items=context_items)

    # Tokenize the prompt and place it on whatever device the model lives on,
    # instead of assuming "cuda".
    input_ids = tokenizer(prompt, return_tensors="pt").to(llm_model.device)

    # Generate an output of tokens
    outputs = llm_model.generate(
        **input_ids,
        temperature=temperature,
        do_sample=True,
        max_new_tokens=max_new_tokens
    )

    # Turn the output tokens into text (this still includes the prompt)
    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        # Replace special tokens and unnecessary help message
        output_text = (
            output_text.replace(prompt, "")
            .replace("<bos>", "")
            .replace("<eos>", "")
            .replace("Sure, here is the answer to the user query:\n\n", "")
        )

    # Only return the answer without the context items
    if return_answer_only:
        return output_text

    return output_text, context_items

In [17]:
import textwrap


def print_wrapped(text, wrap_length=80):
    """Print ``text`` re-flowed so that no line is longer than ``wrap_length``."""
    print(textwrap.fill(text, width=wrap_length))

In [18]:
# Pick one of the prepared questions at random.
query = random.choice(query_list)
print(f"Query: {query}")

# Ask for the context items as well as the answer text.
answer, context_items = ask(
    query=query,
    temperature=0.7,
    max_new_tokens=512,
    return_answer_only=False,
)

print("Answer:\n")
print_wrapped(answer)
print("Context items:")
context_items

Query: What role does fibre play in digestion? Name five fibre containing foods.
[INFO] Time taken to get scores on 1680 embeddings: 0.00006 seconds.
Answer:

The relevant passages from the context are:  **Passage 1:**  > Fiber is
categorized as either water-soluble or insoluble. Some examples of soluble
fibers are inulin, pectin, and guar gum and they are found in peas, beans, oats,
barley, and rye. Cellulose is the most abundant fiber in plants, making up the
cell walls and providing structure.  **Passage 2:**  > Fiber intake has been
linked to a decreased risk for colon cancer, but the exact mechanisms are not
fully understood. However, some studies have found that a diet rich in dietary
fiber may be linked to a lower risk of colorectal tumors.  **Passage 3:**  >
Fiber in food can provide many health benefits. It can help regulate blood
pressure, cholesterol levels, and blood glucose levels. Fiber can also help
maintain a healthy digestive tract, reducing the risk of constipation an

[{'page_number': 1086,
  'sentence_chunk': 'Image by Allison Calabrese / CC BY 4.0 fiber intake because of what the breakdown products of the fiber do for the colon. The bacterial breakdown of fiber in the large intestine releases short-chain fatty acids. These molecules have been found to nourish colonic cells, inhibit colonic inflammation, and stimulate the immune system (thereby providing protection of the colon from harmful substances). Additionally, the bacterial indigestible fiber, mostly insoluble, increases stool bulk and softness increasing transit time in the large intestine and facilitating feces elimination. One phenomenon of consuming foods high in fiber is increased gas, since the byproducts of bacterial digestion of fiber are gases. Figure 18.2 Diverticulitis: A Disease of Fiber Deficiency Some studies have found a link between high dietary-fiber intake and a decreased risk for colon cancer. However an analysis of 1086 | Nutrition, Health and Disease',
  'chunk_char_coun