## Extraction

In [1]:
import fitz  # PyMuPDF


def extract_text_from_pdf(file_path):
    """Read a PDF and return the stripped text of every page.

    Args:
        file_path: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        A list with one dict per page, in document order:
        ``{"page_number": <1-based page index>, "text": <stripped page text>}``.
    """
    # The context manager closes the document even when loading or extracting
    # a page raises; a trailing doc.close() would be skipped in that case.
    with fitz.open(file_path) as doc:
        return [
            {
                "page_number": page_index + 1,
                "text": doc.load_page(page_index).get_text().strip(),
            }
            for page_index in range(len(doc))
        ]


# Extract the text of every page of the source PDF (nothing is printed here)
pdf_path = "./metformin1.pdf"
pages = extract_text_from_pdf(file_path=pdf_path)

## Chunking

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


def chunk_pages(pages, chunk_size=1000, chunk_overlap=200):
    """Split the text of every page into chunks and flatten the result.

    Args:
        pages: Iterable of dicts that each hold the page text under ``"text"``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        A single flat list with the chunks of all pages, in page order.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    # Each page is split on its own, so no chunk spans a page boundary.
    return [
        piece
        for page_record in pages
        for piece in text_splitter.split_text(page_record["text"])
    ]

In [3]:
# Flatten all pages into one list of text chunks (default size and overlap)
chunks = chunk_pages(pages=pages)

## Chunk Embeddings

In [4]:
from sentence_transformers import SentenceTransformer
# Embedding model used for both the chunks and the query
embedding_model = SentenceTransformer(
    "all-MiniLM-L6-v2",
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import faiss
import numpy as np
import pickle

In [6]:

# Step 3: Embed the chunks
if not chunks:
    # With nothing to embed there is no embedding dimension to read below, so
    # stop here with an actionable message instead of a shape/index error.
    raise ValueError(
        "No chunks to embed; check that text extraction and chunking produced output."
    )
embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
# FAISS expects a C-contiguous float32 matrix; this is a no-op when the model
# already returns that layout.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# Step 4: Create a FAISS index
d = embeddings.shape[1]  # dimension of embeddings
index = faiss.IndexFlatL2(d)  # L2 distance index
index.add(embeddings)  # add embeddings to index

print(f"Stored {index.ntotal} embeddings in the FAISS index.")

# Step 5: Save FAISS index + mapping to disk
faiss.write_index(index, "chunks.index")

# Also save chunks mapping (so you know which embedding corresponds to which text).
# Row i of the index corresponds to chunks[i], so the two files must stay in sync.
with open("chunks.pkl", "wb") as f:
    pickle.dump(chunks, f)

Stored 100 embeddings in the FAISS index.


In [7]:
# Load chunks mapping
# NOTE(review): pickle.load can execute arbitrary code; only load a chunks.pkl
# that this notebook wrote itself.
with open("chunks.pkl", "rb") as f:
    chunks = pickle.load(f)

# Encode query
query = "What is metformin used to treat?"
query_vec = embedding_model.encode([query], convert_to_numpy=True)

# Search (uses the in-memory `index`; it must match the chunks loaded above)
k = 3  # top-k results
D, I = index.search(query_vec, k)

# The index is an L2 index, so the printed "Score" is a distance: lower is closer.
for idx, score in zip(I[0], D[0]):
    if idx < 0:
        # FAISS pads with -1 when fewer than k vectors are found; chunks[-1]
        # would silently print the last chunk instead.
        continue
    print(f"Score: {score:.4f} | Chunk: {chunks[idx]}")

Score: 0.7914 | Chunk: metformin and may increase the risk for lactic acidosis. Consider the benefits and risks of 
concomitant use. Such interaction between metformin and oral cimetidine has been observed in 
normal healthy volunteers in both single- and multiple-dose, metformin-cimetidine drug 
interaction studies, with a 60% increase in peak metformin plasma and whole blood 
concentrations and a 40% increase in plasma and whole blood metformin AUC. There was no 
change in elimination half-life in the single-dose study. Metformin had no effect on cimetidine 
pharmacokinetics. 
In healthy volunteers, the pharmacokinetics of metformin and propranolol, and metformin and 
ibuprofen were not affected when coadministered in single-dose interaction studies. 
Metformin is negligibly bound to plasma proteins and is, therefore, less likely to interact with 
highly protein-bound drugs such as salicylates, sulfonamides, chloramphenicol, and probenecid,
Score: 0.8272 | Chunk: remain intact during

In [16]:
import requests


def ask_ollama(
    prompt,
    model="tinyllama:latest",
    url="http://ollama:11434/api/generate",
    timeout=300,
):
    """Send a prompt to an Ollama ``/api/generate`` endpoint and return the reply text.

    Args:
        prompt: Full prompt text to send.
        model: Model tag placed in the request payload.
        url: Endpoint that receives the POST request.
        timeout: Seconds passed to ``requests.post``; ``None`` waits without limit.

    Returns:
        The value stored under ``"response"`` in the JSON reply.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (via ``raise_for_status``).
        requests.Timeout: If the server does not answer within ``timeout``.
        KeyError: If the JSON reply has no ``"response"`` field.
    """
    headers = {"Content-Type": "application/json"}
    # stream=False requests a single JSON reply rather than a stream.
    payload = {"model": model, "prompt": prompt, "stream": False}

    # Without a timeout, requests waits forever on an unresponsive server and
    # the notebook cell hangs.
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()

    return response.json()["response"]

In [18]:
def build_prompt(query, retrieved_chunks):
    """Assemble the RAG prompt from the question and the retrieved context.

    Args:
        query: The question placed after ``"Question: "``.
        retrieved_chunks: Iterable of strings; joined with a blank line
            between them to form the context section.

    Returns:
        The prompt text, which begins and ends with a newline.
    """
    context_block = "\n\n".join(retrieved_chunks)
    # The empty first and last entries produce the leading and trailing newline.
    prompt_lines = [
        "",
        "You are a helpful assistant. Use the following context to answer the question.",
        "If the answer is not in the context, say that you don't know.",
        "",
        "Context:",
        f"{context_block}",
        "",
        f"Question: {query}",
        "",
        "Answer:",
        "",
    ]
    return "\n".join(prompt_lines)

In [19]:
# Build RAG-style prompt
# FAISS pads the result with -1 when it finds fewer than k neighbours; drop
# those so chunks[-1] (the last chunk) is not silently used as context.
retrieved_chunks = [chunks[idx] for idx in I[0] if idx >= 0]
prompt = build_prompt(query, retrieved_chunks)

# Ask Ollama
answer = ask_ollama(prompt)
print(answer)

In context, metformin is a medication used to treat type 2 diabetes mellitus (T2DM), which is characterized by high blood glucose levels. The pharmaceutical company Eli Lilly manufactures and markets the drug in both single-dose and multiple-dose forms. Metformin has been shown to decrease hepatic glucose production, increase insuliN sensitivity, and improve glucose tolerance in patients with T2DM. Its pharmacological mechanism of action is different from other classes of oral antidiabetic agents, which are more effective in improving glycemic control without affecting hepatotoxicity. Metformin's efficacy has been proven in various controlled clinical trials over multiple doses and has achieved steady state plasma concentrations with acceptable toxicity.
