# Data to embeddings

Steps :
1. Get the pdf file path
2. Extract the text
3. Convert the text pages into sentences
4. Chunk the sentences
5. Convert into embeddings with the help of embeddings models

## Download the pdf

In [1]:
# to download the pdf

import os
import requests

def download_pdf(url : str,
                 filepath : str = "pdf/",
                 filename : str = "data.pdf",
                 verbose : bool = False):
    """download_pdf() is used to download the pdf file using the url"""
    file = filepath + filename

    # if there is no folder so we will create a folder
    if not os.path.exists(filepath):
        os.makedirs(filepath)
        if verbose :
            print(f"[INFO] {filepath} is created successfully")

    if verbose :
        print(f"[INFO] {file} is downloading ...")

    response = requests.get(url)

    if response.status_code == 200:
        with open(file,"wb") as file:
            file.write(response.content)

        if verbose :
            print(f"[INFO] {file} is saved ...")
    else:
        print(f"[WARNING] Failed to get the pdf from the url : {url}")

In [2]:
filename = "human-nutrition-text.pdf"
filepath = "tests/"
url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
download_pdf(url,filepath,filename,True)

[INFO] tests/ is created successfully
[INFO] tests/human-nutrition-text.pdf is downloading ...
[INFO] <_io.BufferedWriter name='tests/human-nutrition-text.pdf'> is saved ...


## Convert the pdf into dict
Parameters:
1. Page number
2. Sentence_chunk
3. Character count
4. Word count
5. Token count
6. Embeddings

In [22]:
from spacy.lang.en import English
import fitz
from tqdm import tqdm
import re

def text_formatter(text : str) -> str:
    """ Convert the text that contains the /n with the space"""
    formatted_text = text.replace('\n',' ').strip()
    
    return formatted_text

def count_and_split_sentence(text : str) -> (int,list[str]):
    """To count and split the sentences from the given text """
    nlp = English()
    nlp.add_pipe("sentencizer")

    list_of_sentences = list(nlp(text).sents)
    list_of_sentences = [str(sentence) for sentence in list_of_sentences]

    return len(list_of_sentences),list_of_sentences

def open_pdf(filename : str,
             starting_page_number : int = 0) -> list[dict]:
    """convert the pdf into dict dtype"""

    doc = fitz.open(filename)
    data = []

    print("[INFO] Converting the pdf into dict dtype")
    for page_number,page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text = text)

        sentence_count,sentences = count_and_split_sentence(text)

        data.append(
            {
                "page_number" : page_number - starting_page_number,
                "char_count" : len(text),
                "word_count" : len(text.split(" ")),
                "sentence_count" : sentence_count,
                "token_count" : len(text) / 4,
                "sentence" : sentences,
                "text" : text
            }
        )

    return data

In [23]:
def split_the_array(array_list : list,
                    chunk_length : int) -> list[list[str]]:
    """Split the array of sentences into groups of chunks"""
    return [array_list[i:i+chunk_length] for i in range(0,len(array_list),chunk_length)]

def convert_to_chunk(filepath : str = "pdf/",
                     filename : str = "data.pdf",
                     starting_page_number : int = 0,
                     chunk_size : int = 10) -> list[dict]:
    """ Convert the sentences into chunks """
    pages_and_texts = open_pdf(filepath+filename,starting_page_number)
    pages_and_chunks = []

    # splitting the chunks 
    print("[INFO] Splitting the sentences ")
    for item in tqdm(pages_and_texts):
        item["sentence_chunks"] = split_the_array(item["sentence"],chunk_size)
        item["chunk_count"] = len(item["sentence_chunks"])

    # splitting the chunks
    print("[INFO] Splitting into chunks ")
    for item in tqdm(pages_and_texts):
        for chunks in item["sentence_chunks"]:
            d = {}
            d["page_number"] = item["page_number"]
            
            joined_sentence = "".join(chunks).replace("  "," ").strip()
            joined_sentence = re.sub(r'\.([A-Z])', r'. \1',joined_sentence) # .A -> . A it is used to provide a space after a sentence ends

            d["sentence_chunk"] = joined_sentence
            # stats
            d["char_count"] = len(joined_sentence)
            d["word_count"] = len(list(joined_sentence.split(" ")))
            d["token_count"] = len(joined_sentence) / 4 # 4 tokens ~ 1 word

            pages_and_chunks.append(d)

    return pages_and_chunks

## Convert the chunks into embeddings

In [35]:
from torch.cuda import is_available
from torch import device
from sentence_transformers import SentenceTransformer

embedding_model = "all-mpnet-base-v2"
device = device('cuda' if is_available() else 'cpu')

def convert_to_embedds(model_name : str,
                       device : str,
                       filepath : str = "pdf/",
                       filename : str = "data.pdf",
                       starting_page_number : int = 0,
                       chunk_size = 10) -> list[dict] :
    
    data = convert_to_chunk(filepath,
                        filename,
                        starting_page_number,
                        chunk_size)
    
    embedding_model = SentenceTransformer(model_name_or_path = model_name,device = device)
    print("[INFO] Converting into embeddings ")
    for item in tqdm(data):
        item["embeddings"] = embedding_model.encode(item["sentence_chunk"], convert_to_tensor = True)

    return data

In [36]:
data = convert_to_embedds(embedding_model,device,"tests/","human-nutrition-text.pdf",0,10)

[INFO] Converting the pdf into dict dtype


1208it [04:17,  4.69it/s]


[INFO] Splitting the sentences 


100%|██████████| 1208/1208 [00:00<00:00, 528609.21it/s]


[INFO] Splitting into chunks 


100%|██████████| 1208/1208 [00:00<00:00, 24636.03it/s]


[INFO] Converting into embeddings 


100%|██████████| 1843/1843 [14:29<00:00,  2.12it/s]


## Save the dict in a file

In [38]:
import pandas as pd
def save_the_embeddings(filename : str = "embeddings.csv",
                        filepath : str = "pdf/",
                       data : list[dict] = None,
                       verbose = False,
                       embedding_model : str = "all-mpnet-base-v2",
                       device :str = 'cpu'):
    embedd_file = filepath + filename
    if data is None:
        data = convert_to_embedds(embedding_model,device,"tests/","human-nutrition-text.pdf",0,10)
    dataframe = pd.DataFrame(data)
    dataframe.to_csv(embedd_file,index = False)

    if verbose :
        print(f"[INFO] {embedd_file} is successfully saved")

In [40]:
save_the_embeddings(filename = "embeddings.csv",filepath="tests/",data=data,verbose = True,embedding_model = "all-mpnet-base-v2",device = device)

[INFO] tests/embeddings.csv is successfully saved


# Retrieval to generation
Steps:
1. Get the embeddings in tensor
2. Do similarity search
3. Initialize the LLM
4. Prompt
5. Generation

In [4]:
import textwrap
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer,util


user_query = "processed foods"
model_name = "all-mpnet-base-v2"
device = "cuda" if torch.cuda.is_available() else "cpu"
data_filename = "tests/embeddings.csv"
data_pd = pd.read_csv(data_filename)
data_dict = pd.read_csv("tests/embeddings.csv").to_dict(orient='records')


embedding_model = SentenceTransformer(model_name_or_path = "all-mpnet-base-v2", device = device)


def get_embeddings(data,device) -> list:
    """Returns the embeddings from the csv file"""
    data_embeddings = []

    for tensor_str in data_pd["embeddings"]:
        values_str = tensor_str.split("[")[1].split("]")[0]
        values_list = [float(val) for val in values_str.split(",")]
        tensor_result = torch.tensor(values_list)
        data_embeddings.append(tensor_result)

    data_embeddings = torch.stack(data_embeddings).to(device)
    return data_embeddings


def wrap_the_text(text : str,wrap_length : int = 100):
    """wrap_the_text prettify the text"""
    wrapped_text = textwrap.fill(text,wrap_length)
    print(wrapped_text)
    
def print_retrieved_text(query : str,
                         score,
                         index : int,
                         pages_and_chunks = data_dict):
    """Prints the retrieved texts"""
    print(f"Query : {query}")
    for score, index in zip(score,index):
        print(f"Score : {score}")
        print("The Text :")
        print(f"Index : {index}")
        wrap_the_text(pages_and_chunks[index]["sentence_chunk"])

def display_the_page(index : int , 
                     starting_page_count : int,
                     filename : str = "human-nutrition-text.pdf",
                     filepath : str = "tests/"):
    """Displays the content of the pdf in Image"""
    doc = fitz.open(filepath + filename)
    page = doc.load_page(index + starting_page_count) 

    # convert page text into np.array
    img = page.get_pixmap(dpi=300)
    doc.close()
    img_array = np.frombuffer(img.samples_mv,dtype=np.uint8).reshape((img.h, img.w, img.n))

    # display the image by matplotlib
    plt.imshow(img_array)
    plt.axis("off")
    plt.show()

def retrieve_relevant_resource( user_query : str ,
                               embeddings, 
                               embedding_model, 
                               device,
                               k = 5,
                               to_print : bool = False,
                               to_display : bool = False):
    """Function to retrieve relevant resource"""
    
    query_embedding = embedding_model.encode(user_query, convert_to_tensor = True).to(device)

    dot_score = util.dot_score( a = query_embedding, b = embeddings)[0]
    score , idx = torch.topk(dot_score,k=k)

    if to_print:
        print_retrieved_text(user_query,score,idx)

    if to_display:
        for scores,index in zip(score,idx):
            print(f"Score : {scores}")
            print(f"Page number : {data_dict[index]['page_number']}")
            try :
                display_the_page(data_dict[index]['page_number'],41)
            except ValueError:
                pass
            
    return score,idx



## LLM

In [5]:
data_embeddings = get_embeddings(data_pd,device)

retrieve_relevant_resource(
    user_query,
    data_embeddings, 
    embedding_model, 
    device,
    k = 5,
    to_print = True,
    to_display = False
)

Query : processed foods
Score : 0.6357370615005493
The Text :
Index : 1603
creating products that have a much longer shelf life than raw foods. Also, food processing protects
the health of the consumer and allows for easier shipment and the marketing of foods by
corporations. However, there are certain drawbacks. Food processing can reduce the nutritional
content of raw ingredients. For example, canning involves the use of heat, which destroys the
vitamin C in canned fruit. Also, certain food additives that are included during processing, such as
high fructose corn syrup, can affect the health of a consumer. However, the level of added sugar can
make a major difference. Small amounts of added sugar and other sweeteners, about 6 to 9 teaspoons a
day or less, are not considered harmful.1 Food Additives If you examine the label for a processed
food product, it is not unusual to see a long list of added materials. These natural or synthetic
substances are food additives and there are more 

(tensor([0.6357, 0.5949, 0.5776, 0.5539, 0.5416]),
 tensor([1603, 1602, 1592,  427, 1654]))

In [6]:
from transformers import AutoTokenizer , AutoModelForCausalLM

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
device = "cuda" if torch.cuda.is_available() else "cpu"
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# LLM
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype=torch.float16).to(device)

In [4]:
def get_model_num_params(model: torch.nn.Module):
    return sum([param.numel() for param in model.parameters()])

print(f" Parameters Count : {get_model_num_params(llm_model)}")

 Parameters Count : 1100048384


In [5]:
def get_model_mem_size(model: torch.nn.Module):
    """
    Get how much memory a PyTorch model takes up.
    """
    # Get model parameters and buffer sizes
    mem_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
    mem_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    # Calculate various model sizes
    model_mem_bytes = mem_params + mem_buffers # in bytes
    model_mem_mb = model_mem_bytes / (1024**2) # in megabytes
    model_mem_gb = model_mem_bytes / (1024**3) # in gigabytes

    return {"model_mem_bytes": model_mem_bytes,
            "model_mem_mb": round(model_mem_mb, 2),
            "model_mem_gb": round(model_mem_gb, 2)}

get_model_mem_size(llm_model)

{'model_mem_bytes': 2211633920, 'model_mem_mb': 2109.18, 'model_mem_gb': 2.06}

## Prompt Engg

In [7]:
def prompt_formatter(query: str, 
                     context_items: list[dict]) -> str:
    """
    Augments query with text-based context from context_items.
    """
    # Join context items into one dotted paragraph
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

    base_prompt = """Based on the following context items, please answer the query.
Use the following example as reference for the ideal answer style.
\nExample :
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""

    # Update base prompt with context items and query   
    base_prompt = base_prompt.format(context=context, query=query)

    # Create prompt template for instruction-tuned model
    dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]

    # Apply the chat template
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)
    return prompt

## Generate

In [8]:
user_text = "What is malnutrition?"
device = "cuda" if torch.cuda.is_available() else "cpu"
    # Get relevant resources
scores, indices = retrieve_relevant_resource(user_text,data_embeddings, embedding_model, device)

In [9]:
# Create a list of context items
context_items = [data_dict[i] for i in indices]

In [10]:
# Format prompt with context items
prompt = prompt_formatter(query=user_text,
                              context_items=context_items)

In [11]:
input_ids = tokenizer(prompt, return_tensors="pt").to(device)

In [None]:
%%time
# Generate an output of tokens
outputs = llm_model.generate(**input_ids,max_new_tokens=256) 

In [None]:
# Turn the output tokens into text
output_text = tokenizer.decode(outputs[0])
output_text = output_text.split("<|assistant|>")

output_text

In [None]:
def query(user_text : str):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    scores, indices = retrieve_relevant_resource(user_text,data_embeddings, embedding_model, device)
    context_items = [data_dict[i] for i in indices]
    prompt = prompt_formatter(query=user_text,context_items=context_items)
    input_ids = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = llm_model.generate(**input_ids,max_new_tokens=256) 
    output_text = tokenizer.decode(outputs[0])
    output_text = output_text.split("<|assistant|>")
    
    return output_text

# Gradio app