### This notebook builds a RAG system from scratch using Python and a FAISS vector store. I use a sample PDF as the document to vectorise and retrieve from.

- Steps to follow : 
    1. Open a document 
    2. format the text for the embedding model
    3. embed all the chunks which can be stored for later
    4. build a retrieval system that searches the vector store and returns the embeddings most similar to the query
    5. create a prompt that incorporates the returned embeddings
    6. generate an answer to the query based on the passages from the text.

- Steps 1-3 : Document preprocessing and embedding creation
- Steps 4-6 : Search and Answer

## 1. Document pre-processing and embedding creation

Ingredients : 
- Data documents of any choice
- embedding model of choice

In [None]:
import os 
import requests

# Location of the source PDF, relative to the directory the kernel was started in.
pdf_path = './Rag-From-Scratch/simple-local-rag/human-nutrition-text.pdf'

# Warn early when the document is missing. The absolute path is printed because
# a relative path silently depends on the current working directory, and the
# bare message gave no hint about which location had been checked.
if not os.path.exists(pdf_path):
    print(f'File does not exist: {os.path.abspath(pdf_path)}')


In [None]:
import fitz 
from tqdm.auto import tqdm

def text_formatter(strng):
    '''
    Flattens text onto a single line: every newline becomes a space and
    leading/trailing whitespace is stripped.
    '''
    return strng.replace('\n', ' ').strip()

def open_and_read(pdf_path, page_offset=41):
    '''
    Reads a PDF and returns one record per page with its cleaned text and basic size statistics.

    Args:
        pdf_path: path of the PDF file to open with PyMuPDF (fitz).
        page_offset: value subtracted from the zero-based page index to produce
            'page_number'. The default of 41 keeps the numbering this notebook
            has always used (presumably the number of front-matter pages in the
            sample PDF; verify before reusing with another document).

    Returns:
        A list of dicts, one per page, with the keys 'page_number',
        'page_char_count', 'page_word_count', 'page_sentence_count',
        'page_token_count' and 'text'.
    '''
    pages_and_texts = []
    # The context manager closes the document even if reading a page fails.
    with fitz.open(pdf_path) as doc:
        # enumerate() hides the length from tqdm, so pass the page count explicitly.
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):
            text = text_formatter(page.get_text())
            pages_and_texts.append({
                'page_number' : page_number - page_offset,
                'page_char_count' : len(text),
                # ''.split(' ') is [''], so empty pages are special-cased to 0.
                'page_word_count' : len(text.split(' ')) if len(text) > 0 else 0,
                'page_sentence_count' : len(text.split('. ')) if len(text) > 0 else 0,
                # Heuristic estimate used throughout this notebook: characters / 4.
                'page_token_count' : len(text)/4,
                'text' : text
            })
    return pages_and_texts

In [None]:
# Parse the whole PDF into a list of per-page records.
pages_and_text = open_and_read(pdf_path)
len(pages_and_text)
# Only the last expression of a cell is rendered: preview the first two records.
pages_and_text[:2]

In [None]:
import pandas as pd

# Tabular view of the per-page records; show the first rows.
df = pd.DataFrame(pages_and_text)
df.head()

## Why care about the token count ?

Token count is important because of the context window of the embedding model and the context window of the LLM. By context window, I mean the maximum length of the input text that can be provided to the embedding model or the LLM.

In [None]:
from spacy.lang.en import English

# Blank English pipeline (no trained components).
nlp = English()

# Add a sentencizer component so that doc.sents is available.
nlp.add_pipe('sentencizer')

# Create a document instance as an example.
doc = nlp('This a sentence. This is another sentence. This is the third sentence.')

# Sanity check: the example text must be split into exactly three sentences.
assert len(list(doc.sents)) == 3

# Show the sentence split.
list(doc.sents)


In [None]:
# Inspect one page record. Index 600 is a fixed position in the list, so this
# only works when the PDF produced more than 600 page records.
pages_and_text[600]

In [None]:
# Attach the sentence split of every page to its record.
for page_record in tqdm(pages_and_text):
    # doc.sents yields spaCy objects; convert them to plain strings right away.
    page_sentences = [str(span) for span in nlp(page_record['text']).sents]

    page_record['sentences'] = page_sentences
    page_record['sentences_per_page'] = len(page_sentences)

In [None]:
import random

# Spot-check one randomly chosen page record (unseeded, so it differs between runs).
random.sample(pages_and_text, k=1)

In [None]:
# Rebuild the dataframe so it includes the new sentence columns, then show
# summary statistics of the numeric columns rounded to two decimals.
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

## Chunking approach

- The concept of splitting larger pieces of text into smaller text of suitable sizes or chunking is done to provide appropriate sized inputs to the embedding model and LLM. 
- There is no single correct way to chunk; it depends on your use case. Some of the approaches to chunking are fixed-size chunking, token- and word-based chunking, recursive token- and word-based chunking, semantic chunking, etc. 
- We will use fixed sized chunking here, and go with 10 sentences in a chunk.
- Each page will be subdivided into chunks of 10 sentences or smaller.

In [None]:
# Default number of sentences that are grouped into one chunk.
num_sentence_chunk_size = 10

def split_list(input_list, slice_size=None):
    '''
    Splits a list into consecutive sub-lists of at most slice_size items.

    The split is a single pass over the list (it is not recursive); the last
    sub-list holds the remainder and may be shorter than slice_size.

    Args:
        input_list: the list to split.
        slice_size: maximum length of each sub-list. When omitted, the
            module-level num_sentence_chunk_size is used.

    Returns:
        A list of sub-lists; an empty input yields an empty list.

    Raises:
        ValueError: if slice_size is not positive.
    '''
    if slice_size is None:
        slice_size = num_sentence_chunk_size
    # A negative step would silently return [] and a zero step makes range() fail
    # with an unrelated message, so reject both explicitly.
    if slice_size <= 0:
        raise ValueError(f'slice_size must be positive, got {slice_size}')
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

# test_list = list(range(25))
# split_list(test_list)

In [None]:
# Split every page's sentences into chunks and record how many chunks the page produced.

for page_record in tqdm(pages_and_text):
    page_chunks = split_list(page_record['sentences'])
    page_record['sentence_chunks'] = page_chunks
    page_record['num_chunks'] = len(page_chunks)

In [None]:
# Spot-check one random page record, now including its sentence chunks.
random.sample(pages_and_text, k =1)

In [None]:
# Rebuild the dataframe with the chunk columns and summarise the numeric columns.
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

### Splitting each chunk into its own item in the document dictionary. This gives a greater level of granularity.

In [None]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_text):
    for sentence_chunk in item['sentence_chunks']:
        chunk_dict = {}
        chunk_dict['page_number'] = item['page_number']

        # The sentence strings carry no trailing whitespace, so they have to be
        # joined with a space. Joining with '' fused neighbouring sentences
        # ("...fat?Vitamin", "...fats.vitamin"), and the regex below only repairs
        # the "full stop + capital letter" case.
        joined_sentence_chunk = ' '.join(sentence_chunk)
        # Collapse every run of whitespace into one space (a single
        # replace('  ', ' ') leaves longer runs partly intact).
        joined_sentence_chunk = re.sub(r'\s+', ' ', joined_sentence_chunk).strip()
        # Kept for text that is already fused inside one sentence: ".A" -> ". A"
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)

        chunk_dict['sentence_chunk'] = joined_sentence_chunk
        chunk_dict['chunk_char_count'] = len(joined_sentence_chunk)
        chunk_dict['chunk_word_count'] = len(joined_sentence_chunk.split(' '))
        # Heuristic estimate used throughout this notebook: characters / 4.
        chunk_dict['chunk_token_count'] = len(joined_sentence_chunk)/4

        pages_and_chunks.append(chunk_dict)


len(pages_and_chunks)

In [None]:
# Spot-check one randomly chosen chunk record.
random.sample(pages_and_chunks, k=1)

In [None]:
# From here on `df` holds one row per chunk (no longer one row per page).
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

In [None]:
# Drop chunks with a very small estimated token count; they are unlikely to carry useful information.
min_token_length = 30

# for row in df[df['chunk_token_count']<min_token_length].sample(5).iterrows():
#     print(f'Chunk token count : {row[1]['chunk_token_count']} | Text : {row[1]['sentence_chunk']}')

# Keep only rows strictly above the threshold and convert them back to a list of dicts.
is_long_enough = df['chunk_token_count'] > min_token_length
pages_and_chunks_over_min_token_len = df.loc[is_long_enough].to_dict(orient='records')
pages_and_chunks_over_min_token_len[:2]

In [None]:
# Spot-check two random chunks that survived the minimum-length filter.
random.sample(pages_and_chunks_over_min_token_len, k=2)

## Embedding our text chunks

In [None]:
# we are using an embedding model from sentence transformer library.

import torch
from sentence_transformers import SentenceTransformer

# Pick the device for the models. The notebook targets the second GPU ('cuda:1');
# fall back to the first GPU when only one is visible and to the CPU when CUDA
# is unavailable. torch.cuda.is_available() alone does not guarantee that a
# device with index 1 exists.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')
# device = 'cpu'
print(device)
embedding_model = SentenceTransformer(model_name_or_path = 'all-mpnet-base-v2', device=device)

# for item in tqdm(pages_and_chunks_over_min_token_len):
#     # sentences are encoded by calling .encode on the model
#     item['embeddings'] = embedding_model.encode(item['sentence_chunk'])

# Encode all chunks in batches instead of one call per chunk (see the loop above).
text_chunks = [item['sentence_chunk'] for item in pages_and_chunks_over_min_token_len]
# text_chunks[419]

text_chunk_embeddings = embedding_model.encode(
    text_chunks,
    batch_size = 32,
    convert_to_tensor=True
)

text_chunk_embeddings



In [None]:
# Implementing FAISS Vector store

import numpy as np
import faiss


# Bring the embeddings to host memory as a contiguous float32 NumPy array.
# The tensor check keeps this cell re-runnable: after the first run
# text_chunk_embeddings is already a NumPy array, which has no .cpu() method.
if torch.is_tensor(text_chunk_embeddings):
    text_chunk_embeddings = text_chunk_embeddings.detach().cpu().numpy()
text_chunk_embeddings = np.ascontiguousarray(text_chunk_embeddings, dtype=np.float32)

# Take the dimensionality from the data instead of hard-coding 768, so the
# index always matches the vectors that are added to it.
d = text_chunk_embeddings.shape[1]
# setting up the vector store:
index = faiss.IndexFlatL2(d)
index.add(text_chunk_embeddings)

In [None]:
# Smoke test of the index with 10 random (unseeded) query vectors of dimension d.
xq = np.random.random((10, d)).astype('float32') # create random query

# NOTE: k is a notebook-level variable and is reused by the text-query cell further down.
k=4 #nearest 4 neighbours

D,I = index.search(xq, k) #return distances and indices for each query
print(I)
print(D)

## Search and Retrieve

In [None]:
# Implement a re-rank model

from sentence_transformers import CrossEncoder

# Cross-encoder used further down to re-rank the chunks returned by the vector search.
reranking_model = CrossEncoder('mixedbread-ai/mxbai-rerank-large-v1')


In [None]:
# convert query to embeddings using the same embedding model used to embed the data documents.

query = 'macronutrients functions'

# embed the query
query_embed = embedding_model.encode(query, convert_to_tensor=True)
# Hand FAISS an explicit float32 NumPy array of shape (1, dim) instead of a
# torch tensor, so the call does not rely on implicit tensor-to-array conversion.
query_embed = np.asarray(query_embed.cpu(), dtype=np.float32).reshape(1, -1)

D,I = index.search(query_embed, k)


print(f'I : {I}')
print(f'D : {D}')

for dist, idx in zip(D[0], I[0]):
    hit = pages_and_chunks_over_min_token_len[idx]
    print(f'Distance : {dist}')
    # Double-quoted f-strings: reusing the same quote character inside the
    # replacement field is a SyntaxError on Python versions before 3.12.
    print(f"Text : {hit['sentence_chunk']}")
    print(f"Page number : {hit['page_number']}")

#### We could potentially improve the results by using a re-ranking model. Such a model is trained specifically to re-rank search results, ordering them from most to least relevant to the query.

In [None]:
# Chunk texts of the hits from the most recent index.search call (I holds their positions).
retreived_docs = [pages_and_chunks_over_min_token_len[idx]['sentence_chunk'] for idx in I[0]]

# Re-score the candidates against the query with the re-ranking model and keep the top 3.
results = reranking_model.rank(query, retreived_docs, return_documents=True, top_k=3)
results

In [None]:
def retrieve_relevant_resources(query, model, num_res_to_return):
    '''
    Embeds a query with the given model, fetches candidate chunks from the
    vector store and returns the best ones after re-ranking.

    Relies on the notebook-level objects index, reranking_model and
    pages_and_chunks_over_min_token_len.

    Args:
        query: the query text.
        model: embedding model exposing encode(query, convert_to_tensor=True);
            it must be the model that produced the indexed embeddings.
        num_res_to_return: number of re-ranked results to return.

    Returns:
        The output of reranking_model.rank(..., return_documents=True) limited
        to num_res_to_return entries, or an empty list when the vector store
        returned no hit.
    '''
    # Over-fetch a few candidates so the re-ranker has something to reorder.
    num_candidates = num_res_to_return + 5

    # embed the query and hand FAISS an explicit float32 array of shape (1, dim)
    query_embed = model.encode(query, convert_to_tensor=True)
    query_embed = np.asarray(query_embed.cpu(), dtype=np.float32).reshape(1, -1)
    D, I = index.search(query_embed, num_candidates)

    # FAISS pads the labels with -1 when it finds fewer than num_candidates
    # neighbours. Without this filter, -1 would silently select the LAST chunk.
    retreived_docs = [
        pages_and_chunks_over_min_token_len[idx]['sentence_chunk']
        for idx in I[0]
        if idx >= 0
    ]
    if not retreived_docs:
        return []

    return reranking_model.rank(query, retreived_docs, return_documents=True, top_k=num_res_to_return)

#### Load LLM

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available 

# 1. Create quantization config for smaller model loading (optional)
# Requires !pip install bitsandbytes accelerate, see: https://github.com/TimDettmers/bitsandbytes, https://huggingface.co/docs/accelerate/
# For models that require 4-bit quantization (use this if you have low GPU memory available)
# NOTE: the config is created but not passed to from_pretrained below (that argument is commented out).
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)


# Flash Attention 2 requires NVIDIA GPU compute capability of 8.0 or above, see: https://developer.nvidia.com/cuda-gpus
# Requires !pip install flash-attn, see: https://github.com/Dao-AILab/flash-attention 
# Check the capability of the GPU the model will actually run on (`device`),
# not of GPU 0, and skip the CUDA query entirely when `device` is not a GPU.
device_is_cuda = torch.device(device).type == 'cuda'
if device_is_cuda and is_flash_attn_2_available() and (torch.cuda.get_device_capability(device)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

# 2. Pick a model we'd like to use (this will depend on how much GPU memory you have available)
model_id = "google/gemma-7b-it"
print(f"[INFO] Using model_id: {model_id}")

# 3. Instantiate tokenizer (tokenizer turns text into numbers ready for the model) 
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

# 4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id, 
                                                 torch_dtype=torch.float16, # datatype to use, we want float16
                                                #  quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=False, # use full memory 
                                                 attn_implementation=attn_implementation) # which attention version to use

# if not use_quantization_config: # quantization takes care of device setting automatically, so if it's not used, send model to GPU 
# Use the device selected for the embedding model instead of a hard-coded
# "cuda:1", so the model and the tokenized inputs always end up on the same device.
llm_model.to(device)

In [None]:
def get_model_num_params(model):
    '''Returns the total number of elements over all parameters of the model.'''
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

# Total parameter count of the loaded LLM.
get_model_num_params(llm_model)

In [None]:
def get_model_mem_size(model):
    '''
    Computes the memory taken by the model's parameters and buffers.

    Returns a dict with the size in bytes ('model_mem_bytes') and, rounded to
    two decimals, in MiB ('model_mem_mb') and GiB ('model_mem_gb').
    '''
    def _total_bytes(tensors):
        # element count times bytes per element, summed over all tensors
        return sum(t.nelement() * t.element_size() for t in tensors)

    model_mem_bytes = _total_bytes(model.parameters()) + _total_bytes(model.buffers())

    bytes_per_mb = 1024**2
    bytes_per_gb = 1024**3

    return {
        'model_mem_bytes' : model_mem_bytes,
        'model_mem_mb' : round(model_mem_bytes / bytes_per_mb, 2),
        'model_mem_gb' : round(model_mem_bytes / bytes_per_gb, 2)
    }

# Memory footprint of the loaded LLM (parameters + buffers).
get_model_mem_size(llm_model)

In [None]:
# Sample questions used to try out the pipeline; one of them is picked at random in the next cell.
query_list = [
    'How can I tell if I’m getting enough micronutrients from fruits and vegetables?',
    'How does hydration influence overall energy levels and metabolism?',
    'What are some effective strategies for increasing daily fiber intake?',
    'What differences exist between plant-based proteins and animal-based proteins in terms of amino acid profiles?',
    'What is the relationship between gut health, the microbiome, and nutrient absorption?',
    'How often should infants be breastfed?',
    'What are the symptoms of pellagra?',
    'How does saliva help with digestion?',
    'What is the RDI for protein per day?',
    'Water soluble vitamins',
]

In [None]:
import random

# Pick one of the sample questions at random (unseeded, so it differs between runs).
query = random.choice(query_list)
print(f'Query : {query}')
# `prompt` starts out as the raw query; later cells overwrite it with the formatted prompt.
prompt = query
# Retrieve candidate chunks for the query and keep the 5 best after re-ranking.
context_list = retrieve_relevant_resources(prompt, embedding_model, 5)

print(context_list)

In [None]:
# Augment our prompt with the context items:

# Prompting techniques to use.
# The bare string below is not assigned to anything; it only serves as a note.
'''
1.Give clear intructions.
2.Give a few input/output examples:(Manual COT)
3.Ask to work the query, step by step:(Automatic COT), Give step by step reasoning.
'''

def prompt_formatter(query, context_items):
    '''
    Builds the LLM prompt: instructions, three worked examples, the retrieved
    context as a dash-prefixed list (one item per line) and finally the query.

    context_items must be an iterable of mappings that have a 'text' key.
    '''
    passages = [item['text'] for item in context_items]
    context = '-' + '\n-'.join(passages)
    base_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
\nExample 2:
Query: What are the causes of type 2 diabetes?
Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
\nExample 3:
Query: What is the importance of hydration for physical performance?
Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""
    return base_prompt.format(context=context, query=query)

In [None]:
# Build the augmented prompt from the raw `query` instead of from `prompt`.
# `prompt` is overwritten by this cell and by the chat-template cell, so
# formatting `prompt` itself nested an already formatted prompt on every re-run.
prompt = prompt_formatter(query, context_list)
print(prompt)

In [None]:
# Create a chat template
# The message content is rebuilt from `query` and `context_list` rather than read
# from `prompt`: this cell overwrites `prompt` with the templated text, so reading
# it back would wrap an already templated prompt when the cell is run again.
chat = [{
    'role' : 'user',
    'content' : prompt_formatter(query, context_list)
}]

prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
print(prompt)

In [None]:

# tokenize the input text
# Text rendered by apply_chat_template(tokenize=False) already contains the
# special tokens, so they must not be added a second time (add_special_tokens=False).
# The inputs are sent to the device the model lives on, so both always match.
input_ids = tokenizer(prompt, return_tensors='pt', add_special_tokens=False).to(llm_model.device)

# Generate output from the local LLM
outputs = llm_model.generate(**input_ids, 
                             temperature=0.7,
                             do_sample=True,
                             max_new_tokens=256)
outputs



In [None]:
# Decode only the newly generated tokens. generate() returns the prompt tokens
# followed by the completion; cutting the prompt off by token count is more
# reliable than str.replace(prompt, ''), which fails as soon as decoding does
# not reproduce the prompt text character for character.
prompt_token_count = input_ids['input_ids'].shape[1]
outputs_decoded = tokenizer.decode(outputs[0][prompt_token_count:])
outputs_decoded = outputs_decoded.replace('<bos>', '').replace('<eos>', '')
outputs_decoded

In [None]:
def ask(prompt,
        temperature=0.7,
        max_new_tokens=256,
        format_answer_only=True,
        return_answer_only=True,
        num_context_items=5):
    '''
    Takes the query, finds the relevant resources and generates the answer to the query based on the relevant resources from the private documents.

    Relies on the notebook-level objects embedding_model, tokenizer and llm_model.

    Args:
        prompt: the user query.
        temperature: sampling temperature passed to generate().
        max_new_tokens: maximum number of tokens to generate.
        format_answer_only: when True, return only the generated text with
            '<bos>'/'<eos>' removed; when False, return the decoded prompt
            together with the completion.
        return_answer_only: currently has no effect; the function always
            returns both the text and the context. Kept so that existing
            calls keep working.
        num_context_items: number of re-ranked context items to retrieve.

    Returns:
        A tuple (outputs_decoded, context_list).
    '''

    # get the re-ranked context items from RAG
    context_list = retrieve_relevant_resources(prompt, embedding_model, num_context_items)

    # Create a chat template around the augmented prompt
    chat = [{
        'role' : 'user',
        'content' : prompt_formatter(prompt, context_list)
        }]
    templated_prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

    # tokenize the input text
    # The templated text already contains the special tokens, so do not add them
    # again; send the inputs to the device the model lives on.
    input_ids = tokenizer(templated_prompt, return_tensors='pt', add_special_tokens=False).to(llm_model.device)

    # Generate output from the local LLM
    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)

    if format_answer_only:
        # generate() returns prompt + completion. Cut the prompt off by token
        # count; str.replace(prompt, '') breaks whenever decoding does not
        # reproduce the prompt text exactly.
        prompt_token_count = input_ids['input_ids'].shape[1]
        outputs_decoded = tokenizer.decode(outputs[0][prompt_token_count:])
        outputs_decoded = outputs_decoded.replace('<bos>', '').replace('<eos>', '')
    else:
        outputs_decoded = tokenizer.decode(outputs[0])
    return outputs_decoded, context_list
    

In [None]:
# End-to-end check: retrieve context and generate an answer for one question.
ask('What are the fat-soluble vitamins?', temperature=0.5)