<a href="https://colab.research.google.com/github/Mehulgoyal353/NutriChat/blob/main/RAG_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing required libraries

In [None]:
import os

# Install the notebook's third-party dependencies, but only when the COLAB_GPU
# environment variable is present (used here as the signal that the notebook is
# running on Google Colab). The `!` lines are IPython shell escapes, so this
# cell only works inside a notebook kernel, not as a plain Python script.
if "COLAB_GPU" in os.environ:
  print("[INFO] Running in Google Collab, installing requirements.")
  # -U upgrades torch if a version is already installed.
  !pip install -U torch
  # PyMuPDF is the package imported as `fitz` further down.
  !pip install PyMuPDF
  !pip install tqdm
  !pip install sentence-transformers
  !pip install accelerate
  !pip install bitsandbytes
  # NOTE(review): --no-build-isolation presumably lets flash-attn build against
  # the torch installed above -- confirm against the flash-attn install notes.
  !pip install flash-attn --no-build-isolation

[INFO] Running in Google Collab, installing requirements.
Collecting torch
  Downloading torch-2.3.1-cp310-cp310-manylinux1_x86_64.whl (779.1 MB)
[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.1/779.1 MB[0m [31m7.5 MB/s[0m eta [36m0:01:23[0m

# Reading and saving the required pdf

In [None]:
import requests

pdf_path = "human-nutrition-text.pdf"

# Download the textbook PDF once; later runs reuse the local copy.
if not os.path.exists(pdf_path):
  print("File doesn't exist, downloading...")
  url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
  filename = pdf_path
  # Without a timeout requests waits indefinitely on an unresponsive server.
  # The tuple is (connect timeout, read timeout) in seconds.
  response = requests.get(url, timeout = (10, 120))

  if response.status_code == 200:
    # Only write the file on success so a failed request never leaves an
    # error page on disk under the PDF's name.
    with open(filename, "wb") as file:
      file.write(response.content)
    print(f"The file has been downloaded and saved as {filename}")
  else:
    print(f"Failed to download this file. Status code: {response.status_code}")

else:
  print(f"File {pdf_path} exists.")

# Basic data preprocessing

In [None]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
  """Turn every newline in `text` into a space and trim both ends."""
  single_line = " ".join(text.split("\n"))
  return single_line.strip()

def open_and_read_pdf(pdf_path: str, page_offset: int = 41) -> list[dict]:
  """Read a PDF page by page and collect its text with simple statistics.

  Args:
    pdf_path: Path of the PDF file to open.
    page_offset: Value subtracted from the zero-based PDF page index to get
      the reported "page_number". The default of 41 keeps the notebook's
      original numbering for the nutrition textbook.

  Returns:
    One dict per page with the keys "page_number", "page_char_count",
    "page_word_count" (split on single spaces), "page_sentence_count"
    (split on "."), "page_token_count" (characters / 4, a rough estimate)
    and "text" (the page text after `text_formatter`).
  """
  pages_and_texts = []
  # The context manager closes the document even if a page fails to parse.
  with fitz.open(pdf_path) as doc:
    # Passing the page count lets tqdm show a real progress bar; enumerate()
    # alone has no length.
    for page_number, page in tqdm(enumerate(doc), total = len(doc)):
      text = text_formatter(page.get_text())
      pages_and_texts.append({"page_number": page_number - page_offset,
                              "page_char_count": len(text),
                              "page_word_count": len(text.split(" ")),
                              "page_sentence_count": len(text.split(".")),
                              "page_token_count": len(text)/4,
                              "text": text})
  return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path = pdf_path)
pages_and_texts[0]

In [None]:
import random

random.sample(pages_and_texts, k = 3)

#Converting raw data into dataframe

In [None]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

In [None]:
df.describe().round(2)

# Separating sentences using spaCy's Sentencizer

In [None]:
from spacy.lang.en import English

nlp = English()
nlp.add_pipe("sentencizer")

doc = nlp("This is a sentence. This is another sentence.")
assert len(list(doc.sents)) == 2

list(doc.sents)

In [None]:
# Run each page's text through the sentencizer pipeline, keep the sentences as
# plain strings, and record how many were found on the page.
for item in tqdm(pages_and_texts):
  sentence_strings = [str(span) for span in nlp(item["text"]).sents]
  item["sentences"] = sentence_strings
  item["page_sentence_count_spacy"] = len(sentence_strings)

In [None]:
random.sample(pages_and_texts, k = 1)

In [None]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

#Forming chunks of data

In [None]:
num_sentence_chunk_size = 10;
def split_list(input_list: list,
               slice_size: int) -> list[list[str]]:
  """Cut `input_list` into consecutive slices of at most `slice_size` items.

  The final slice holds whatever is left over and may be shorter.
  """
  slices = []
  for start in range(0, len(input_list), slice_size):
    slices.append(input_list[start: start + slice_size])
  return slices

for item in tqdm(pages_and_texts):
  item["sentence_chunks"] = split_list(input_list = item["sentences"],
                                       slice_size = num_sentence_chunk_size)
  item["num_chunks"] = len(item["sentence_chunks"])

In [None]:
random.sample(pages_and_texts, k = 1)

In [None]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

#Formatting to get chunk data

In [None]:
import re

# Flatten every page's sentence chunks into one record per chunk, carrying the
# page number along and adding simple size statistics.
pages_and_chunks = []
for item in tqdm(pages_and_texts):
  for sentence_chunk in item["sentence_chunks"]:
    # The sentence strings come from str(span), which drops trailing
    # whitespace, so join with a space. Joining with "" glued neighbouring
    # sentences together whenever the boundary was not ".<Capital>".
    joined_sentence_chunk = " ".join(sentence_chunk).replace("  ", " ").strip()
    # Still repair ".A" -> ". A" for full stops that sit inside a sentence
    # string without a following space.
    joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)

    chunk_dict = {
        "page_number": item["page_number"],
        "sentence_chunk": joined_sentence_chunk,
        "chunk_char_count": len(joined_sentence_chunk),
        "chunk_word_count": len(joined_sentence_chunk.split(" ")),
        # Rough estimate: about 4 characters per token.
        "chunk_token_count": len(joined_sentence_chunk) / 4,
    }
    pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

In [None]:
random.sample(pages_and_chunks, k = 1)

In [None]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

# Removing chunks that are too short to carry useful information

In [None]:
min_token_length = 30
# Preview a few of the chunks at or below the token threshold before they are
# filtered out.
short_chunks = df[df["chunk_token_count"] <= min_token_length]
# DataFrame.sample raises ValueError when asked for more rows than exist, so
# cap the sample size at the number of short chunks available.
for _, row in short_chunks.sample(min(5, len(short_chunks))).iterrows():
  print(f'Chunk token count: {row["chunk_token_count"]} | Text: {row["sentence_chunk"]}')

In [None]:
pages_and_chunks_over_min_token_length = df[df["chunk_token_count"] > min_token_length].to_dict(orient = "records")
pages_and_chunks_over_min_token_length[:2]

#Embedding the processed data

In [None]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path = "sentence-transformers/all-mpnet-base-v2",
                                      device = "cuda")
sentences = [
    "My name is Mehul Goyal.",
    "A few of my hobbies are music and watching anime.",
    "This is an example list to check how the mpnet embedding model works.",
    "I have nothing else to write, you know?"
]
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

for sentence, embedding in embeddings_dict.items():
  print("Sentence: ", sentence)
  print("Embedding: ", embedding)
  print("Embedding size: ", embedding.shape)
  print("")

In [None]:
%%time

embedding_model.to("cuda")
for item in tqdm(pages_and_chunks_over_min_token_length):
  item["embedding"] = embedding_model.encode(item["sentence_chunk"])

In [None]:
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_length]

In [None]:
%%time

text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size = 32,
                                               convert_to_tensor = True)
text_chunk_embeddings

In [None]:
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_length)
embeddings_df_save_path = "text_chunks_and_embeddings.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index = False)

In [None]:
text_chunks_and_embeddings_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embeddings_df_load.head()

# Loading the saved embeddings back into pages_and_chunks

In [None]:
import random

import numpy as np
import pandas as pd
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Reload the chunks and embeddings that were written to CSV earlier.
text_chunks_and_embeddings_df = pd.read_csv("text_chunks_and_embeddings.csv")
# The CSV holds each embedding as text, so strip the surrounding brackets and
# parse the whitespace-separated numbers back into a NumPy array.
# NOTE(review): assumes the column was saved as the printed form of a NumPy
# array ("[v1 v2 ...]") -- confirm if the save format ever changes.
text_chunks_and_embeddings_df["embedding"] = text_chunks_and_embeddings_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep = " "))

# One dict per chunk, used later to look up the text and page number by index.
pages_and_chunks = text_chunks_and_embeddings_df.to_dict(orient = "records")

# Stack the per-chunk arrays into a single float32 tensor on the chosen device.
embeddings = torch.tensor(np.array(text_chunks_and_embeddings_df["embedding"].to_list()), dtype = torch.float32).to(device)
embeddings.shape

In [None]:
text_chunks_and_embeddings_df.head()

In [None]:
embeddings[0]

#Embedding the input query

In [None]:
from sentence_transformers import SentenceTransformer, util

embedding_model = SentenceTransformer(model_name_or_path = "all-mpnet-base-v2",
                                      device = device)

In [None]:
query = "macronutrients functions"
print(f"Query: {query}")

query_embedding = embedding_model.encode(query, convert_to_tensor = True)

from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a = query_embedding, b = embeddings)[0]
end_time = timer()

print(f"Time taken to get score on {len(embeddings)} embeddings: {end_time - start_time:.5f} seconds.")

top_results_dot_product = torch.topk(dot_scores, k = 5)
top_results_dot_product

In [None]:
larger_embeddings = torch.randn(100*embeddings.shape[0], 768).to(device)
print(f"Embeddings shape: {larger_embeddings.shape}")

start_time = timer()
dot_scores = util.dot_score(a = query_embedding, b = larger_embeddings)[0]
end_time = timer()

print(f"Time taken to get scores on {len(larger_embeddings)} embeddings: {end_time - start_time:.5f} seconds.")

# Formatting the query results

In [None]:
import textwrap

def print_wrapped(text, wrap_length = 80):
  """Print `text` wrapped so that no line is longer than `wrap_length` characters."""
  print("\n".join(textwrap.wrap(text, wrap_length)))

In [None]:
print(f"Query: '{query}'\n")
print("Results:")

for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
  print(f"Score: {score:.4f}")
  print("Text:")
  print_wrapped(pages_and_chunks[idx]["sentence_chunk"])

  print(f"Page number: {pages_and_chunks[idx]['page_number']}")
  print("\n")

In [None]:
import fitz

pdf_path = "human-nutrition-text.pdf"
doc = fitz.open(pdf_path)
page = doc.load_page(5 + 41)

img = page.get_pixmap(dpi = 300)
doc.close()

img_array = np.frombuffer(img.samples_mv,
                          dtype = np.uint8).reshape((img.h, img.w, img.n))

import matplotlib.pyplot as plt
plt.figure(figsize = (13, 10))
plt.imshow(img_array)
plt.title(f"Query: '{query}' | Most relevant page:")
plt.axis('off')
plt.show()

#Retrieving chunks of data related to the query

In [None]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.Tensor,
                                model: SentenceTransformer = embedding_model,
                                n_resources_to_return: int = 5,
                                print_time: bool = True):
  """Embed `query` and return the best-matching rows of `embeddings`.

  Args:
    query: Text to search for.
    embeddings: Tensor with one embedding per row.
      NOTE(review): presumably must be on the same device as the model's
      output -- confirm against callers.
    model: Embedding model used to encode the query.
    n_resources_to_return: Maximum number of matches to return.
    print_time: When True, print how long the dot-product scoring took.

  Returns:
    A `(scores, indices)` pair of tensors from `torch.topk`: the highest dot
    products and the row indices in `embeddings` they belong to.
  """
  query_embedding = model.encode(query,
                                 convert_to_tensor = True)

  start_time = timer()
  dot_scores = util.dot_score(query_embedding, embeddings)[0]
  end_time = timer()

  if print_time:
    print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings : {end_time - start_time:.5f} seconds.")

  # torch.topk raises if k is larger than the number of candidates, so never
  # ask for more results than there are embeddings.
  k = min(n_resources_to_return, len(embeddings))
  scores, indices = torch.topk(input = dot_scores,
                               k = k)

  return scores, indices

In [None]:
def print_top_results_and_scores(query: str,
                                 embeddings: torch.Tensor,
                                 pages_and_chunks: list[dict] = pages_and_chunks,
                                 n_resources_to_return: int = 5):
  """Retrieve the best matches for `query` and print them.

  Args:
    query: Text to search for.
    embeddings: Tensor with one embedding per row, in the same order as
      `pages_and_chunks`.
    pages_and_chunks: Chunk records; each printed record supplies its
      "sentence_chunk" and "page_number" values.
    n_resources_to_return: Maximum number of matches to print.
  """
  scores, indices = retrieve_relevant_resources(query = query,
                                                embeddings = embeddings,
                                                n_resources_to_return = n_resources_to_return)
  print(f"Query: {query}\n")
  print("Results:")

  # .tolist() gives plain Python floats and ints, so the list lookup below
  # uses an ordinary integer index rather than a tensor scalar.
  for score, index in zip(scores.tolist(), indices.tolist()):
    chunk = pages_and_chunks[index]
    print(f"Score: {score:.4f}")
    print_wrapped(chunk["sentence_chunk"])
    print(f"Page number: {chunk['page_number']}")
    print("\n")

In [None]:
query = "symptoms of Pellagra"

scores, indices = retrieve_relevant_resources(query = query,
                                              embeddings = embeddings)
scores, indices

In [None]:
print_top_results_and_scores(query = query,
                             embeddings = embeddings)

# Choosing the most suitable LLM for the available GPU memory

In [None]:
import torch
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")

In [None]:
# Pick a Gemma variant and precision from the GPU memory measured above.
# Start from the smallest configuration (2B, 4-bit) so both settings are always
# defined, even when the GPU is below the first threshold. Previously that
# case, and a reading of exactly 19 GB, left them unset and the prints below
# raised NameError.
use_quantization_config = True
model_id = "google/gemma-2b-it"

if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False
    model_id = "google/gemma-2b-it"
else:
    # Covers 19 GB and above; the value is rounded, so exactly 19 is reachable.
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

#Getting the HuggingFace user token to access the model's gated repo

In [None]:
from google.colab import auth
auth.authenticate_user()

import os
from getpass import getpass

# SECURITY: never hard-code an access token in the notebook. Anything stored
# in a cell is published with the file and stays in its history, so a token
# that was ever committed must be revoked in the Hugging Face account settings.
# Reuse a token already present in the environment, otherwise prompt for one
# without echoing it.
if not os.environ.get('HUGGINGFACE_TOKEN'):
  os.environ['HUGGINGFACE_TOKEN'] = getpass("Enter your Hugging Face access token: ")

#Getting the model ready

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available
from transformers import BitsAndBytesConfig
from huggingface_hub import login

# Authenticate with the Hugging Face Hub so the gated model repo can be read.
login(token=os.getenv('HUGGINGFACE_TOKEN'))

# 4-bit quantization settings, only passed to the model when
# use_quantization_config is True. The name must match the one read in
# from_pretrained below; the earlier misspelling raised NameError on the
# quantized path.
quantization_config = BitsAndBytesConfig(load_in_4bit = True,
                                         bnb_4bit_compute_dtype = torch.float16)

# Use Flash Attention 2 only when the package is available and the GPU reports
# compute capability 8.0 or higher; otherwise use "sdpa".
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0]>=8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"

print(f"[INFO] Using attention implementation: {attn_implementation}")

print(f"[INFO] Using model id: {model_id}")

# NOTE(review): newer transformers releases appear to prefer `token=` over
# `use_auth_token=` -- confirm against the installed version before changing.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = model_id, use_auth_token=True)

llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path = model_id,
                                                 use_auth_token=True,
                                                 torch_dtype = torch.float16,
                                                 quantization_config = quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage = False,
                                                 attn_implementation = attn_implementation)
# Only the unquantized model is moved to the GPU explicitly.
if not use_quantization_config:
  llm_model.to("cuda")

In [None]:
llm_model

In [None]:
def get_model_num_params(model: torch.nn.Module):
    """Return the total number of elements across all of the model's parameters."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

get_model_num_params(llm_model)

In [None]:
def get_model_mem_size(model: torch.nn.Module):
    """Report the memory taken by the model's parameters and buffers.

    Returns a dict with the exact size in bytes ("model_mem_bytes") and the
    same size in megabytes and gigabytes rounded to two decimals
    ("model_mem_mb", "model_mem_gb").
    """
    def _total_bytes(tensors):
        # Element count times bytes per element, summed over every tensor.
        return sum(t.nelement() * t.element_size() for t in tensors)

    model_mem_bytes = _total_bytes(model.parameters()) + _total_bytes(model.buffers())

    bytes_per_mb = 1024**2
    bytes_per_gb = 1024**3

    return {"model_mem_bytes": model_mem_bytes,
            "model_mem_mb": round(model_mem_bytes / bytes_per_mb, 2),
            "model_mem_gb": round(model_mem_bytes / bytes_per_gb, 2)}

get_model_mem_size(llm_model)

#Making a chat template for the query

In [None]:
input_text = "What is micronutrients?"
print(f"Input text:\n{input_text}")

dialogue_template = [
    {"role": "user",
     "content": input_text}
]

prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False,
                                       add_generation_prompt=True)
print(f"\nPrompt (formatted):\n{prompt}")

In [None]:
%%time

input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
print(f"Model input (tokenized):\n{input_ids}\n")

outputs = llm_model.generate(**input_ids,
                             max_new_tokens=256)
print(f"Model output (tokens):\n{outputs[0]}\n")

In [None]:
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded):\n{outputs_decoded}\n")

In [None]:
print(f"Input text: {input_text}\n")
print(f"Output text:\n{outputs_decoded.replace(prompt, '').replace('<bos>', '').replace('<eos>', '')}")

In [None]:
# gpt4_questions = [
#     "What are the macronutrients, and what roles do they play in the human body?",
#     "How do vitamins and minerals differ in their roles and importance for health?",
#     "Describe the process of digestion and absorption of nutrients in the human body.",
#     "What role does fibre play in digestion? Name five fibre containing foods.",
#     "Explain the concept of energy balance and its importance in weight management."
# ]

# manual_questions = [
#     "How often should infants be breastfed?",
#     "What are symptoms of pellagra?",
#     "How does saliva help with digestion?",
#     "What is the RDI for protein per day?",
#     "water soluble vitamins"
# ]

# query_list = gpt4_questions + manual_questions

In [None]:
# import random
# query = random.choice(query_list)

# print(f"Query: {query}")

# scores, indices = retrieve_relevant_resources(query=query,
#                                               embeddings=embeddings)
# scores, indices

In [None]:
# def prompt_formatter(query: str,
#                      context_items: list[dict]) -> str:

#     context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

#     base_prompt = """Based on the following context items, please answer the query.
# Give yourself room to think by extracting relevant passages from the context before answering the query.
# Don't return the thinking, only return the answer.
# Make sure your answers are as explanatory as possible.
# Use the following examples as reference for the ideal answer style.
# \nExample 1:
# Query: What are the fat-soluble vitamins?
# Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
# \nExample 2:
# Query: What are the causes of type 2 diabetes?
# Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
# \nExample 3:
# Query: What is the importance of hydration for physical performance?
# Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
# \nNow use the following context items to answer the user query:
# {context}
# \nRelevant passages: <extract relevant passages from the context here>
# User query: {query}
# Answer:"""

#     base_prompt = base_prompt.format(context=context, query=query)

#     dialogue_template = [
#         {"role": "user",
#         "content": base_prompt}
#     ]

#     prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
#                                           tokenize=False,
#                                           add_generation_prompt=True)
#     return prompt

In [None]:
# query = random.choice(query_list)
# print(f"Query: {query}")

# scores, indices = retrieve_relevant_resources(query=query,
#                                               embeddings=embeddings)

# context_items = [pages_and_chunks[i] for i in indices]

# prompt = prompt_formatter(query=query,
#                           context_items=context_items)
# print(prompt)

In [None]:
# def ask(query,
#         temperature=0.7,
#         max_new_tokens=512,
#         format_answer_text=True,
#         return_answer_only=True):

#     scores, indices = retrieve_relevant_resources(query=query,
#                                                   embeddings=embeddings)

#     context_items = [pages_and_chunks[i] for i in indices]

#     for i, item in enumerate(context_items):
#         item["score"] = scores[i].cpu()

#     prompt = prompt_formatter(query=query,
#                               context_items=context_items)

#     input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

#     outputs = llm_model.generate(**input_ids,
#                                  temperature=temperature,
#                                  do_sample=True,
#                                  max_new_tokens=max_new_tokens)

#     output_text = tokenizer.decode(outputs[0])

#     if format_answer_text:
#         output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

#     if return_answer_only:
#         return output_text

#     return output_text, context_items

In [None]:
# query = random.choice(query_list)
# print(f"Query: {query}")

# answer, context_items = ask(query=query,
#                             temperature=0.7,
#                             max_new_tokens=512,
#                             return_answer_only=False)

# print(f"Answer:\n")
# print_wrapped(answer)
# print(f"Context items:")
# context_items