### Install All the dependecies

In [1]:
import fitz  # PyMuPDF
import os
import torch
from dotenv import load_dotenv, find_dotenv

In [2]:
load_dotenv(find_dotenv())

True

In [3]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceInferenceAPI
from llama_index.llms.ollama import Ollama
from llama_index.core.llms import ChatMessage
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import VectorStoreIndex
from llama_index.core.evaluation import DatasetGenerator, RelevancyEvaluator
from llama_index.core import PromptTemplate

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import nest_asyncio
nest_asyncio.apply()

### Data Preprocessing
#### Create a sample PDF file for first 2 chapters

In [None]:
# Create a sample pdf for ch1,2
def save_page_ranges(source_pdf_path, output_pdf_path, page_ranges):
    """
    Saves specified ranges of pages from a source PDF to a new PDF file.

    Args:
    source_pdf_path (str): Path to the source PDF file.
    output_pdf_path (str): Path to the output PDF file.
    page_ranges (list of tuples): List of tuples, where each tuple represents a page range to save (inclusive, 0-indexed).
    """
    # Open the source PDF file
    doc = fitz.open(source_pdf_path)
    # Create a new PDF to save selected pages
    new_doc = fitz.open()

    # Iterate through each range and add the pages to the new document
    for start, end in page_ranges:
        new_doc.insert_pdf(doc, from_page=start, to_page=end)

    # Save the new document
    new_doc.save(output_pdf_path)
    new_doc.close()
    doc.close()
    print(f"Specified page ranges have been saved to {output_pdf_path}")

# path to input pdf file
source_pdf_path = '../data/ConceptsofBiology-WEB.pdf'
# path to output pdf file
output_pdf_path = 'sample_ch1_ch2_ConceptsofBiology.pdf'

# pass range of pages to extract
# page_ranges = [(18, 38), (40, 66)]
# save_page_ranges(source_pdf_path, output_pdf_path, page_ranges)


### Init Embedding model

In [5]:
# "BAAI/bge-large-en-v1.5" --> Embedding Dimensions = 1024 | Max Tokens = 512.

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5", 
                                   device=('cuda' if torch.cuda.is_available() else 'cpu'))

In [6]:
# Check embedding model
embeddings = embed_model.get_text_embedding("Hello World!")
print(len(embeddings))

1024


### INIT LLM - Local Ollama instance

- Visit https://ollama.com/ to download and install Ollama.
- Run `ollama serve` to start a server. 
- Run `ollama pull <name>` to download a model to run.

In [7]:

llm_local = Ollama(
    model="llama2:13b-chat",
    request_timeout=50.0,
    temperature=0.0
)

In [9]:
messages = [
    ChatMessage(role="system", content="You are CEO of MetaAI"),
    ChatMessage(role="user", content="Introduce Llama2 to the world."),
]
response = llm_local.chat(messages)

print(response)

assistant: 
Hello, fellow humans! I am Mark Zuckerberg, CEO of Meta AI, and I am thrilled to introduce you to our latest creation: Llama2! 🐪❤️

Llama2 is a revolutionary new AI model that represents the next generation of language understanding. This incredible technology has been designed to understand and respond to human input in a more natural, human-like way than ever before.

With Llama2, you can have conversations with our AI just like you would with a real person! 💬👩‍💻 Our AI is so advanced that it can understand the nuances of human language and respond in a way that is both appropriate and engaging.

But that's not all - Llama2 also has a range of exciting features that make it stand out from other AI models. For example, it can:

🔍 Understand context and intent behind human input, allowing for more accurate and relevant responses.

💬 Generate human-like text based on given prompts or topics, making it perfect for applications like chatbots and virtual assistants.

🎨 Create a

## Config LLM

In [10]:
llm = llm_local

### Data Loading

In [11]:
# Converts pdf file into Documents objects for llama-index
loader = SimpleDirectoryReader(
    input_dir="../data/sample/",
    recursive=True,
    required_exts=[".pdf"],
)

documents = loader.load_data()

In [12]:
# Sample Document
documents[3]

Document(id_='6ab3ce16-b15f-46d7-b9c7-faf17d24e9f5', embedding=None, metadata={'page_label': '4', 'file_name': 'sample_ch1_ch2_ConceptsofBiology.pdf', 'file_path': '/home/c3po/Documents/project/learning/amar-works/askbio/sample_nbs/../data/sample/sample_ch1_ch2_ConceptsofBiology.pdf', 'file_type': 'application/pdf', 'file_size': 8980495, 'creation_date': '2024-04-26', 'last_modified_date': '2024-04-25'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='FIGURE 1.4Although no tw o look alik e, these kit tens ha ve inherit ed g enes fr om both par ents and shar e man y of the same char acteristics.\n(credit: Piet er & R enée L anser)\nRegulation/Homeos tasis\nEven the smal lest organisms ar e comple x and r equir e mul tiple r egulatory mechanisms t o coordi

In [13]:
# Split the loaded documents
splitter = SentenceSplitter(chunk_size=512,chunk_overlap=64)
nodes = splitter.get_nodes_from_documents(documents)

### Data Indexing (IN-MEMORY)

In [14]:
index1 = VectorStoreIndex(nodes=nodes,
                          use_async=True,
                          embed_model=embed_model,
                          show_progress=True,)

Generating embeddings:   0%|          | 0/13 [00:00<?, ?it/s]

Generating embeddings: 100%|██████████| 13/13 [00:06<00:00,  1.93it/s]


In [18]:
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

# configure retriever
retriever = VectorIndexRetriever(
    index=index1,
    similarity_top_k=3,
)

# configure response synthesizer
response_synthesizer = get_response_synthesizer(llm=llm)

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.6)],
)

### Querying

In [24]:
# query
response = query_engine.query("The type of logical thinking that uses related observations to arrive at a general conclusion is called?")
print(response)

The type of logical thinking that uses related observations to arrive at a general conclusion is called inductive reasoning.


### Dataset Generation and Evaluation

In [20]:
data_gen = DatasetGenerator(nodes=nodes, 
                            llm=llm, 
                            num_questions_per_chunk=2, 
                            question_gen_query="Generate 2 questions per chunk.Restrict the questions to the context information provided.")

  data_gen = DatasetGenerator(nodes=nodes,


In [21]:
eval_questions = data_gen.generate_questions_from_nodes()

  return QueryResponseDataset(queries=queries, responses=responses_dict)


In [22]:
eval_questions_updated = [q for q in eval_questions if ("How" in q or "What" in q and not ("pdf" in q or "PDF" in q))]
len(eval_questions_updated)

514

In [25]:
eval_questions_updated

['What are the four questions that biologists have struggled with since the early beginnings of biology?',
 'What is the ultimate goal of biology, according to the text?',
 'What are the eight characteristics that define life, according to the text?',
 'How do cells in multicellular organisms specialize to perform specific functions?',
 'What is the advantage of multicellular organisms over single-celled organisms, according to the text?',
 'How does the organization of cells in multicellular organisms contribute to their ability to adapt and evolve?',
 'What is the main topic of the text?',
 'What is the purpose of the reference to Figure 1.2A in the text?',
 'What is the difference between a positive response and a negative response in organisms?',
 'What is chemotaxis, and how do bacteria use it to move towards or away from chemicals?',
 'How does the movement of organisms towards or away from stimuli help them survive in their environment?',
 'What is the process called when single

### Response Evaluation and Retrieval Evaluation
- https://docs.llamaindex.ai/en/stable/understanding/evaluating/evaluating/

### PromptTemplate Using Retriever

In [26]:
template1 = ("Your name is AskBio. You are AI chatbot who can answer question by using provided context information from book named Concepts of Biology"
            "Be more specific and do not facricate the answers.\n" 
            "If you are unsure about answer, please ask for clarfications.\n"
            "Use the provided context information below to answer the user questions. \n"
            "-------------------------------------------\n"
            "{context_str}" 
            "\n -------------------------------------------\n"
            "Given this information, please answer user question: {query_str} \n")
qa_template1 = PromptTemplate(template1)


In [27]:
import json
rel_eval = RelevancyEvaluator(llm=llm)

for qry in eval_questions_updated[:5]:
    nodes = retriever.retrieve(qry)
    context = " ".join(node.get_text() for node in nodes)
    prompt1 = qa_template1.format(context_str=context, query_str=qry)
    response1 = query_engine.query(prompt1)
    eval_result = json.loads(rel_eval.evaluate_response(query=qry, response=response1).json())
    print(f" q --> {qry} score --> {eval_result['score']}")
    

 q --> What are the four questions that biologists have struggled with since the early beginnings of biology? score --> 1.0
 q --> What is the ultimate goal of biology, according to the text? score --> 1.0
 q --> What are the eight characteristics that define life, according to the text? score --> 1.0
 q --> How do cells in multicellular organisms specialize to perform specific functions? score --> 1.0
 q --> What is the advantage of multicellular organisms over single-celled organisms, according to the text? score --> 1.0
