### Import all the dependencies

In [22]:
import fitz  # PyMuPDF
import os
import torch
from dotenv import load_dotenv, find_dotenv

In [23]:
# Load environment variables from the .env file located by find_dotenv();
# the Hugging Face LLM cell later reads HUGGING_FACE_TOKEN via os.getenv.
load_dotenv(find_dotenv())

True

In [24]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceInferenceAPI
from llama_index.llms.ollama import Ollama
from llama_index.core.llms import ChatMessage
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import VectorStoreIndex
from llama_index.core.evaluation import DatasetGenerator, RelevancyEvaluator
from llama_index.core import PromptTemplate

In [25]:
# NOTE(review): presumably required so async LlamaIndex calls (e.g. use_async=True
# when building the index) can run inside the notebook's event loop — confirm.
import nest_asyncio
nest_asyncio.apply()

### Data Preprocessing
#### Create a sample PDF file for the first 2 chapters

In [None]:
# Create a sample pdf for ch1,2
def save_page_ranges(source_pdf_path, output_pdf_path, page_ranges):
    """
    Saves specified ranges of pages from a source PDF to a new PDF file.

    Args:
    source_pdf_path (str): Path to the source PDF file.
    output_pdf_path (str): Path to the output PDF file.
    page_ranges (list of tuples): List of (start, end) tuples, where each tuple represents
        a page range to save (inclusive, 0-indexed). Ranges are copied in the order given.

    Both PDF documents are closed even when copying or saving raises.
    """
    # Open the source PDF file
    doc = fitz.open(source_pdf_path)
    try:
        # Create a new, empty PDF that collects the selected pages
        new_doc = fitz.open()
        try:
            # Append each requested range to the new document
            for start, end in page_ranges:
                new_doc.insert_pdf(doc, from_page=start, to_page=end)

            # Save the new document
            new_doc.save(output_pdf_path)
        finally:
            new_doc.close()
    finally:
        doc.close()
    # Only reached when the save succeeded
    print(f"Specified page ranges have been saved to {output_pdf_path}")

# path to input pdf file
source_pdf_path = '../data/ConceptsofBiology-WEB.pdf'
# path to output pdf file
# NOTE(review): this path is relative to the working directory, while the data-loading
# cell reads PDFs from "../data/sample/" — confirm the sample file ends up there.
output_pdf_path = 'sample_ch1_ch2_ConceptsofBiology.pdf'

# pass range of pages to extract (extraction is disabled; uncomment to regenerate the sample)
# page_ranges = [(18, 38), (40, 66)]
# save_page_ranges(source_pdf_path, output_pdf_path, page_ranges)


### Init Embedding model

In [26]:
# "BAAI/bge-large-en-v1.5" --> Embedding Dimensions = 1024 | Max Tokens = 512.
# The model runs on the GPU when CUDA is available, otherwise on the CPU.

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5", 
                                   device=('cuda' if torch.cuda.is_available() else 'cpu'))

In [27]:
# Check embedding model: embed a short string and print the vector length
# (1024 is the dimension noted for this model in the cell above).
embeddings = embed_model.get_text_embedding("Hello World!")
print(len(embeddings))

1024


### INIT LLM - From HuggingFace

In [28]:

# Hosted LLM accessed through the Hugging Face Inference API.
# The token is read from the HUGGING_FACE_TOKEN environment variable;
# os.getenv returns None when the variable is not set.
llm_hf = HuggingFaceInferenceAPI(model_name="microsoft/Phi-3-mini-4k-instruct", 
                                 temperature=0.0,
                                 token=os.getenv("HUGGING_FACE_TOKEN"))

In [29]:
# Smoke test: send a single completion request to the hosted model and print the reply.
print(llm_hf.complete("Who are you?"))


 <|assistant|> I'm an AI digital assistant designed to help you with a variety of tasks and answer your questions.

You are interacting with Microsoft's language model, developed by the Microsoft team.

How may I assist you today? <|end|> <|assistant|> There's no need to specify "today" as I'm here to assist you whenever you need. Whether it's answering questions, providing information, or helping with tasks, feel free to ask. How can I assist you right now? <|end|> <|assistant|> I'm an AI developed by Microsoft, designed to understand and respond to your queries, helping you with a wide range of tasks and information.

How can I help you today? Whether it's answering questions, providing guidance, or assisting with tasks, I'm here to help. Just let me know what you need! <|end|> <|assistant|> I'm an AI developed by Microsoft, here to help you with any questions or tasks you have. Whether it's providing information, offering guidance, or assisting with various tasks, I'm here to assis

### INIT LLM - Local Ollama instance

- Visit https://ollama.com/ to download and install Ollama.
- Run `ollama serve` to start a server. 
- Run `ollama pull <name>` to download a model to run.

In [None]:


# LLM served by a local Ollama instance (see the setup steps above).
# NOTE(review): request_timeout is presumably in seconds — confirm.
llm_local = Ollama(
    model="llama2:13b-chat",
    request_timeout=50.0,
    temperature=0.0
)

In [None]:
# Smoke test: ask the local model to complete a short prompt and print the reply.
print(llm_local.complete("world is "))

In [None]:


# Chat-style call against the local model: a system message sets the persona,
# the user message carries the request.
messages = [
    ChatMessage(role="system", content="You are CEO of MetaAI"),
    ChatMessage(role="user", content="Introduce Llama2 to the world."),
]
response = llm_local.chat(messages)

print(response)

## Config LLM

In [30]:
# Select the LLM used by the query engine, dataset generator and evaluator cells below
# (here: the Hugging Face hosted model).
llm = llm_hf

### Data Loading

In [32]:
# Converts pdf file into Documents objects for llama-index.
# Reads from ../data/sample/ (searched recursively) and only accepts files
# with a .pdf extension.
loader = SimpleDirectoryReader(
    input_dir="../data/sample/",
    recursive=True,
    required_exts=[".pdf"],
)

documents = loader.load_data()

In [34]:
# Sample Document: display the fourth loaded Document (index 3) to inspect
# its metadata and extracted text.
documents[3]

Document(id_='48698270-fa69-4620-85d7-f4e8200eee1d', embedding=None, metadata={'page_label': '4', 'file_name': 'sample_ch1_ch2_ConceptsofBiology.pdf', 'file_path': '/home/c3po/Documents/project/learning/amar-works/askbio/src/../data/sample/sample_ch1_ch2_ConceptsofBiology.pdf', 'file_type': 'application/pdf', 'file_size': 8980495, 'creation_date': '2024-04-26', 'last_modified_date': '2024-04-25'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='FIGURE 1.4Although no tw o look alik e, these kit tens ha ve inherit ed g enes fr om both par ents and shar e man y of the same char acteristics.\n(credit: Piet er & R enée L anser)\nRegulation/Homeos tasis\nEven the smal lest organisms ar e comple x and r equir e mul tiple r egulatory mechanisms t o coordinat e i

In [35]:
# Split the loaded documents into nodes with SentenceSplitter.
# NOTE(review): chunk_size=512 presumably mirrors the 512 max tokens noted for the
# embedding model — confirm. `nodes` is reused by the index and the dataset generator.
splitter = SentenceSplitter(chunk_size=512,chunk_overlap=64)
nodes = splitter.get_nodes_from_documents(documents)

### Data Indexing (IN-MEMORY)

In [37]:
# Build the vector index over the split nodes, embedding them with embed_model.
# show_progress=True prints the embedding progress bar.
index1 = VectorStoreIndex(nodes=nodes,
                          use_async=True,
                          embed_model=embed_model,
                          show_progress=True)

Generating embeddings: 100%|██████████| 13/13 [00:06<00:00,  1.93it/s]


### Querying

In [38]:
# Query engine over index1 that answers with the selected LLM; reused by the
# evaluation cells below. The print is a one-off sanity query.
query_engine1 = index1.as_query_engine(llm=llm)
print(query_engine1.query("The type of logical thinking that uses related observations to arrive at a general conclusion is called?"))




### Response
The type of logical thinking that uses related observations to arrive at a general conclusion is called inductive reasoning.


In [None]:
def get_response(query: str):
    """Answer ``query`` with a query engine built over the in-memory index.

    Args:
        query (str): The user question.

    Returns:
        The response object returned by the query engine's ``query`` call.
    """
    # The notebook's index variable is `index1`; there is no `index` in this
    # notebook, so referencing it raises NameError.
    query_engine = index1.as_query_engine(llm=llm) # TODO Need to move as class attribute
    return query_engine.query(query)

### Dataset Generation and Evaluation

In [39]:
# Question generator over the split nodes, using the selected LLM and
# requesting 2 questions per chunk.
# NOTE(review): the captured cell output shows a warning pointing at this call —
# presumably a deprecation notice; confirm and migrate if so.
data_gen = DatasetGenerator(nodes=nodes, 
                            llm=llm, 
                            num_questions_per_chunk=2, 
                            question_gen_query="Generate 2 questions per chunk.Restrict the questions to the context information provided.")

  data_gen = DatasetGenerator(nodes=nodes,


In [40]:
# Generate the evaluation questions from the nodes; the next cell iterates over
# the result and filters the question strings.
eval_questions = data_gen.generate_questions_from_nodes()

  return QueryResponseDataset(queries=queries, responses=responses_dict)


In [41]:
# Keep questions that contain "How" or "What" and do not mention the PDF file.
# The parentheses around the first condition matter: `and` binds tighter than `or`,
# so without them every "How" question is kept even when it mentions the PDF.
eval_questions_updated = [
    q for q in eval_questions
    if ("How" in q or "What" in q) and not ("pdf" in q or "PDF" in q)
]
len(eval_questions_updated)

733

In [42]:
# Preview only the first few questions instead of dumping the whole list into the output.
eval_questions_updated[:10]

['- [response]: 1. What are the primary themes and concepts covered in the first chapter of Introduction to Biology?',
 'How does the first chapter of Introduction to Biology describe the process of science and its relevance to understanding life?',
 'What is the significance of the timeline provided (2.5 million years and 300,000 years) in relation to human evolution and the study of biology?',
 'What are the different levels of organization among living things as discussed in the first chapter of Introduction to Biology?',
 'How does the context information define biology and what challenges does it present in defining life?',
 'What role do viruses play in the study of biology according to the context information, and why are they considered complex in terms of living entities?',
 'What insights does the NASA',
 'What is the file path where the "sample_ch1_ch2_ConceptsofBi',
 '### Response: **Chunk 1: What are the shared characteristics or functions that define life?**',
 'What are 

In [44]:

import json

# Evaluator that scores a response against its query, using the selected LLM.
rel_eval = RelevancyEvaluator(llm=llm)

# Evaluate the first 5 filtered questions: answer each with query_engine1, then
# round-trip the evaluation result through JSON to get a plain dict.
relevancy_results = []
for q in eval_questions_updated[:5]:
    ques_response = query_engine1.query(q)
    eval_result = json.loads(rel_eval.evaluate_response(query=q, response=ques_response).json())
    relevancy_results.append(eval_result)
    print(f" q --> {q} score --> {eval_result['score']}")

# Debug print kept for reference (note: `ques` is not defined here; the loop variable is `q`).
# print(f"Q --> {ques} \nsource --> {ques_response.source_nodes[0].node.get_content()} \neval_result --> {eval_result}\n")

 q --> - [response]: 1. What are the primary themes and concepts covered in the first chapter of Introduction to Biology? score --> 1.0
 q --> How does the first chapter of Introduction to Biology describe the process of science and its relevance to understanding life? score --> 0.0
 q --> What is the significance of the timeline provided (2.5 million years and 300,000 years) in relation to human evolution and the study of biology? score --> 1.0
 q --> What are the different levels of organization among living things as discussed in the first chapter of Introduction to Biology? score --> 1.0
 q --> How does the context information define biology and what challenges does it present in defining life? score --> 1.0


### PromptTemplate Using Retriever

In [45]:


# Prompt for the AskBio persona. Adjacent string literals are concatenated as-is,
# so every sentence must carry its own trailing separator.
# {context_str} and {query_str} are filled in via PromptTemplate.format.
template1 = ("Your name is AskBio. You are AI chatbot who can answer question by using provided context information from book named Concepts of Biology.\n"
            "Be more specific and do not fabricate the answers.\n"
            "If you are unsure about answer, please ask for clarifications.\n"
            "Use the provided context information below to answer the user questions. \n"
            "-------------------------------------------\n"
            "{context_str}"
            "\n -------------------------------------------\n"
            "Given this information, please answer user questions: {query_str} \n")
qa_template1 = PromptTemplate(template1)


In [46]:
# Example question; not used by the evaluation loop below.
user_query = "How long ago humans inhabited Earth?"
retriever = index1.as_retriever()

# Evaluate the first 5 filtered questions with the AskBio prompt filled from
# explicitly retrieved context.
for qry in eval_questions_updated[:5]:
    # Use a dedicated name here: the global `nodes` holds the split document chunks
    # from the splitting cell and must not be overwritten by retrieval results.
    retrieved_nodes = retriever.retrieve(qry)
    context = " ".join(node.get_text() for node in retrieved_nodes)
    prompt1 = qa_template1.format(context_str=context, query_str=qry)
    response1 = query_engine1.query(prompt1)
    eval_result = json.loads(rel_eval.evaluate_response(query=qry, response=response1).json())
    print(f" q --> {qry} score --> {eval_result['score']}")
    

 q --> - [response]: 1. What are the primary themes and concepts covered in the first chapter of Introduction to Biology? score --> 1.0
 q --> How does the first chapter of Introduction to Biology describe the process of science and its relevance to understanding life? score --> 1.0
 q --> What is the significance of the timeline provided (2.5 million years and 300,000 years) in relation to human evolution and the study of biology? score --> 1.0
 q --> What are the different levels of organization among living things as discussed in the first chapter of Introduction to Biology? score --> 0.0
 q --> How does the context information define biology and what challenges does it present in defining life? score --> 1.0


In [None]:
def get_response_with_retriever(query: str):
    """Answer ``query`` using the AskBio prompt filled with retrieved context.

    Retrieves nodes for ``query`` from the in-memory index, joins their text into
    one context string, formats the AskBio prompt with that context and the query,
    and sends the formatted prompt to a query engine.

    Args:
        query (str): The user question.

    Returns:
        The response object returned by the query engine's ``query`` call.
    """
    # TODO Need to move as class attribute
    # Adjacent string literals are concatenated as-is, so each sentence carries
    # its own trailing separator.
    custom_template = ("Your name is AskBio. You are AI chatbot who can answer question by using provided context information from book named Concepts of Biology.\n"
            "Be more specific and do not fabricate the answers.\n"
            "If you are unsure about answer, please ask for clarifications.\n"
            "Use the provided context information below to answer the user questions. \n"
            "-------------------------------------------\n"
            "{context_str}"
            "\n -------------------------------------------\n"
            "Given this information, please answer user questions: {query_str} \n")
    askbio_template = PromptTemplate(custom_template)
    # The notebook's index variable is `index1`; build both the retriever and the
    # query engine from it instead of relying on undefined globals.
    retriever = index1.as_retriever() # TODO Need to move as class attribute
    query_engine = index1.as_query_engine(llm=llm) # TODO Need to move as class attribute
    # Use the function's own `query` argument, not a loop variable leaked from another cell.
    retrieved_nodes = retriever.retrieve(query)
    retrieved_context = " ".join(node.get_text() for node in retrieved_nodes)
    formatted_prompt = askbio_template.format(context_str=retrieved_context, query_str=query)
    # NOTE(review): the query engine presumably performs its own retrieval on the
    # formatted prompt as well — confirm whether this double retrieval is intended.
    return query_engine.query(formatted_prompt)