### Data Preprocessing

In [1]:
import fitz  # PyMuPDF
import os
import torch
from dotenv import load_dotenv, find_dotenv

In [None]:
# Create a sample PDF containing chapters 1 and 2
def save_page_ranges(source_pdf_path, output_pdf_path, page_ranges):
    """
    Save selected page ranges from a source PDF into a new PDF file.

    The ranges are copied in the order given and appended one after another
    to the output document. Both documents are closed even if copying or
    saving fails.

    Args:
    source_pdf_path (str): Path to the source PDF file.
    output_pdf_path (str): Path to the output PDF file.
    page_ranges (list of tuples): (start, end) pairs, each naming a page range to copy (inclusive, 0-indexed).
    """
    doc = fitz.open(source_pdf_path)
    try:
        # Calling open() without arguments creates a new, empty PDF that
        # receives the selected pages.
        new_doc = fitz.open()
        try:
            for start, end in page_ranges:
                new_doc.insert_pdf(doc, from_page=start, to_page=end)
            new_doc.save(output_pdf_path)
        finally:
            # Release the output document even when insert/save raised.
            new_doc.close()
    finally:
        doc.close()
    print(f"Specified page ranges have been saved to {output_pdf_path}")

# Source PDF that pages are read from
source_pdf_path = '../data/ConceptsofBiology-WEB.pdf'
# Destination PDF that the selected pages are written to
output_pdf_path = 'sample_ch1_ch2_ConceptsofBiology.pdf'

# (start, end) page ranges to extract
page_ranges = [(18, 38), (40, 66)]
save_page_ranges(
    source_pdf_path=source_pdf_path,
    output_pdf_path=output_pdf_path,
    page_ranges=page_ranges,
)


### TODO 
- Convert the unstructured PDF data into a structured format such as JSON

### Embedding model

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-large-en-v1.5",
    # Use the GPU when CUDA is available; otherwise fall back to the CPU.
    device=('cuda' if torch.cuda.is_available() else 'cpu'),
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Sanity check: embed a short sample string and print the length of the result.
embeddings = embed_model.get_text_embedding("Hello World!")
print(len(embeddings))

1024


### LLM

In [None]:
from llama_index.llms.huggingface import HuggingFaceInferenceAPI

# Load variables from a .env file (if one is found) into the environment,
# then read the Hugging Face token; the value is None when the variable is unset.
# NOTE(review): assumes the token is stored under the name HF_TOKEN — confirm.
load_dotenv(find_dotenv())
HF_TOKEN = os.getenv("HF_TOKEN")

llm_hf = HuggingFaceInferenceAPI(model_name="HuggingFaceH4/zephyr-7b-alpha", token=HF_TOKEN)

# Request a sample completion through the client created above.
print(llm_hf.complete("To infinity, and"))

In [2]:
from llama_index.llms.ollama import Ollama

# LLM client for the "phi3:instruct" model, accessed through Ollama.
phi3 = Ollama(
    model="phi3:instruct",
    request_timeout=40.0,  # presumably seconds — confirm against the Ollama client docs
    temperature=0.0
)

In [3]:
# Smoke test: request a completion for a sample prompt and print the response.
print(phi3.complete("what is computer"))

In [None]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

# Reader over ../data/sample/ (searched recursively), restricted to .pdf files.
# NOTE(review): the sample PDF produced earlier in this notebook is saved to the
# working directory, not ../data/sample/ — confirm it is placed there before loading.
loader = SimpleDirectoryReader(
    input_dir="../data/sample/",
    recursive=True,
    required_exts=[".pdf"],
)

# Load the matching files into documents.
documents = loader.load_data()

In [None]:
from llama_index.core import VectorStoreIndex

# Build the vector index over the loaded documents, embedding them with the
# HuggingFace model bound to `embed_model` in the "Embedding model" section.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embed_model,
)