In [1]:
import os
from pathlib import Path
import pickle
import pprint
import numpy as np
import faiss
from typing import List,Dict

#Embedding libs(BGE first)
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Project folders, resolved against the directory the notebook is started from.
base = Path.cwd()
# Folder the source PDFs are read from.
data_dir = base / "data"
# Folder the FAISS index and the metadata pickle are written to.
index_dir = base / "vector_store"
# Create both folders up front; exist_ok=True makes re-running this cell harmless.
index_dir.mkdir(exist_ok=True)
data_dir.mkdir(exist_ok=True)

# Embedding model names: the primary model and the one tried when the primary fails to load.
EMBEDDING_MODEL_NAME = "BAAI/bge-small-en"
# NOTE: "MDEL" is a misspelling of "MODEL"; the name is left as is because
# load_embedding_model refers to it by this spelling.
FALLBACK_MDEL_NAME = "all-MiniLM-L6-v2"

In [3]:
#Ollama
import requests
import json
# Endpoint and model of the local Ollama server used by the rest of the notebook.
OLLAMA_API_URL = "http://127.0.0.1:11434/api/generate"
OLLAMA_MODEL = "qwen2.5:1.5b"

# Smoke test: send a trivial prompt to confirm the server answers.
payload = {
    "model": OLLAMA_MODEL,
    "prompt": "Hello, Ollama!",
}

# The timeout keeps this cell from hanging forever when the server is down, and
# raise_for_status() surfaces HTTP errors instead of parsing an error body.
response = requests.post(OLLAMA_API_URL, json=payload, timeout=150)
response.raise_for_status()

# The body is handled as newline-delimited JSON: one object per line, each
# carrying a fragment of the reply in its "response" field. Blank lines are
# skipped because json.loads("") raises.
final_text = "".join(
    json.loads(line).get("response", "")
    for line in response.text.splitlines()
    if line.strip()
)

print(final_text)



Hello there! How can I assist you today? Is there anything in particular you'd like to discuss or learn about?


In [4]:
# Chunking and retrieval settings.
# chunk_size and chunk_overlap are the defaults of chunk_text, which slices a
# string, so both are counted in characters.
chunk_size = 800
chunk_overlap = 150
k = 5  #number of nearest neighbors to retrieve

In [5]:
#load embedding model
# Cache for the loaded model; filled by the first successful load_embedding_model() call.
_EMBEDDING_MODEL = None


def load_embedding_model():
    """Return the sentence-embedding model, loading it at most once.

    The model named by ``EMBEDDING_MODEL_NAME`` is tried first; if constructing
    it raises, the model named by ``FALLBACK_MDEL_NAME`` is loaded instead. The
    resulting instance is kept in a module-level variable so that later calls
    return it immediately. Without the cache every caller would rebuild the
    model, and an intermittent failure of the primary model could make
    indexing and querying use different models.

    Returns:
        The loaded ``SentenceTransformer`` instance.

    Raises:
        Whatever ``SentenceTransformer`` raises if the fallback model cannot
        be loaded either.
    """
    global _EMBEDDING_MODEL
    if _EMBEDDING_MODEL is not None:
        return _EMBEDDING_MODEL

    #try to load BGE model, if fails, fallback to MiniLM
    try:
        print("Loading BGE model:", EMBEDDING_MODEL_NAME)
        model = SentenceTransformer(EMBEDDING_MODEL_NAME)
        print("Successfully loaded BGE model.")
    except Exception as e:
        print("Failed to load BGE model. Error:", str(e))
        print("Falling back to MiniLM model:", FALLBACK_MDEL_NAME)
        model = SentenceTransformer(FALLBACK_MDEL_NAME)
        print("Successfully loaded MiniLM model.")

    _EMBEDDING_MODEL = model
    return model

# Despite its name, EMBED_NAME is the loader function itself: call EMBED_NAME()
# to obtain the model.
EMBED_NAME = load_embedding_model

In [6]:
#pdf reading
from pypdf import PdfReader
def read_pdf(file_path: str) -> str:
    """Return the text of a PDF, with the pages joined by newlines.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        One string containing each page's extracted text in page order. A page
        whose ``extract_text()`` result is falsy (``None`` or empty)
        contributes an empty string.
    """
    pdf = PdfReader(file_path)
    page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_texts)

# List the PDFs found in data_dir; the character classes make the ".pdf"
# extension match in any letter case.
print("Current PDFs in data/:", list(data_dir.glob("*.[Pp][Dd][Ff]")))


Current PDFs in data/: [WindowsPath('c:/Users/Administrator/Desktop/RAG/notebook/data/Highlights.pdf')]


In [7]:
from pathlib import Path
# NOTE(review): this rebinds data_dir, replacing the earlier `base / "data"`
# value with a path relative to the current working directory. Functions defined
# after this cell that use data_dir as a default pick up this relative path.
data_dir = Path("data")  # folder named "data" in your current directory




In [8]:
# Show the value data_dir is currently bound to.
print(data_dir)

data


In [9]:
# Confirm that the data folder exists.
print(data_dir.exists())


True


In [10]:
# List every entry in the data folder, not only PDFs.
print(list(data_dir.iterdir()))


[WindowsPath('data/Highlights.pdf')]


In [11]:
#cleaning
def clean_text(text: str):
    """Normalize whitespace in ``text``.

    Every run of whitespace (spaces, tabs, newlines) becomes a single space and
    leading/trailing whitespace is removed. An empty or all-whitespace input
    yields an empty string.
    """
    words = text.split()
    return " ".join(words)


In [12]:
def chunk_text(text: str, chunk_size=chunk_size, overlap=chunk_overlap):
    """Split ``text`` into overlapping character windows.

    The text is whitespace-normalized with ``clean_text`` first. Windows are
    ``chunk_size`` characters long (the last one may be shorter) and each new
    window starts ``overlap`` characters before the end of the previous one.
    Every piece is stripped, and pieces that are empty after stripping are
    dropped.

    Args:
        text: Raw text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        The list of chunk strings, in document order. Empty input gives ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``. With such values the window would
            never advance and the loop would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    text = clean_text(text)
    chunks = []       # list to store all chunks
    start = 0
    length = len(text)
    while start < length:
        end = min(start + chunk_size, length)
        piece = text[start:end].strip()   # chunk text piece
        if piece:
            chunks.append(piece)
        if end == length:
            # The window reached the end of the text; stepping back by
            # `overlap` again would only re-emit a suffix of this chunk.
            break
        start = end - overlap             # sliding window
    return chunks




In [13]:
def embed_texts(texts: List[str]):
    """Encode ``texts`` with the embedding model and return float32 vectors.

    The model is obtained by calling ``EMBED_NAME()``. Encoding runs with a
    progress bar, and the resulting numpy array is cast to ``float32`` before
    it is returned.
    """
    model = EMBED_NAME()
    vectors = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
    return vectors.astype("float32")

In [14]:
import pickle

In [15]:
# Pick one PDF to inspect. sorted() makes "the first PDF" deterministic (glob
# yields entries in no guaranteed order), and the explicit check replaces a
# bare IndexError with a message that says what is missing.
pdf_candidates = sorted(data_dir.glob("*.[Pp][Dd][Ff]"))
if not pdf_candidates:
    raise FileNotFoundError(f"No PDF files found in {data_dir}")
pdf_path = pdf_candidates[0]

text = read_pdf(str(pdf_path))
cleaned = clean_text(text)

# chunk_text normalizes whitespace again internally; on already-cleaned text
# that second pass changes nothing.
chunks = chunk_text(cleaned)

# Here doc_id is the chunk's position within this one document.
docs = [
    {"doc_id": i, "chunk": piece}
    for i, piece in enumerate(chunks)
]

print("total chunks created:", len(docs))


total chunks created: 65


In [16]:
def build_faiss_index(docs: List[Dict]):
    """Embed the given chunks and persist them in the FAISS index on disk.

    Args:
        docs: One dict per chunk, each with a "doc_id" and a "chunk" key.

    Files written inside ``index_dir``:
        1. faiss.index  - the FAISS index holding the chunk embeddings.
        2. metadata.pkl - pickled list of {"doc_id", "chunk"} dicts, where
           entry i describes vector i of the index.

    If faiss.index already exists, the new vectors and metadata are appended to
    the stored ones; otherwise a new ``IndexFlatL2`` is created. Calling this
    twice with the same documents therefore stores them twice.

    Raises:
        ValueError: If the existing index was built with a different embedding
            dimension than the one produced now (for example because a
            different embedding model was loaded).
        FileNotFoundError: If faiss.index exists but metadata.pkl does not.
    """
    if not docs:
        # Nothing to embed; leave the files on disk untouched.
        print("No documents to index.")
        return

    index_path = index_dir / "faiss.index"
    metadata_path = index_dir / "metadata.pkl"

    texts = [d["chunk"] for d in docs]
    metas = [{"doc_id": d["doc_id"], "chunk": d["chunk"]} for d in docs]

    embeddings = embed_texts(texts)
    dimension = embeddings.shape[1]

    if index_path.exists():
        index = faiss.read_index(str(index_path))
        if index.d != dimension:
            raise ValueError(
                f"Existing index has dimension {index.d} but the new embeddings "
                f"have dimension {dimension}; rebuild the index with a single embedding model."
            )
        # NOTE: pickle.load can execute arbitrary code from the file; only load
        # metadata files written by this notebook.
        with open(metadata_path, "rb") as f:
            old_metadata = pickle.load(f)
        index.add(embeddings)
        # Existing entries first, so list positions keep matching vector ids.
        metas = old_metadata + metas
    else:
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings)

    faiss.write_index(index, str(index_path))
    with open(metadata_path, "wb") as f:
        pickle.dump(metas, f)

    print(f"Saved FAISS index with {index.ntotal} vectors.")

def ingest_folder(folder: Path= data_dir):
    """Chunk every PDF in ``folder`` and add the chunks to the FAISS index.

    Each chunk is recorded with the PDF's file name as its "doc_id". When the
    folder yields no chunks at all, a message is printed and nothing is
    indexed.
    """
    pdf_paths = [*folder.glob("*.[Pp][Dd][Ff]")]
    collected = []
    for pdf_path in pdf_paths:
        name = pdf_path.name
        print(f"Processing {name}...")
        pieces = chunk_text(read_pdf(str(pdf_path)))
        collected.extend({"doc_id": name, "chunk": piece} for piece in pieces)

    if collected:
        build_faiss_index(collected)
    else:
        print("No documents found to ingest.")
    


In [17]:
#load index and retrieve
def load_faiss_index():
    """Load the persisted FAISS index and its metadata list from ``index_dir``.

    Returns:
        A ``(index, metadata)`` tuple: the index read from faiss.index and the
        object unpickled from metadata.pkl.

    Raises:
        FileNotFoundError: If either of the two files is missing.
    """
    faiss_file = index_dir / "faiss.index"
    meta_file = index_dir / "metadata.pkl"

    if not (faiss_file.exists() and meta_file.exists()):
        raise FileNotFoundError("FAISS index or metadata not found. Please ingest data first.")

    index = faiss.read_index(str(faiss_file))
    with open(meta_file, "rb") as handle:
        metadata = pickle.load(handle)
    return index, metadata

def embed_query(query: str):
    """Encode one query string and return the result as a float32 array.

    The query is wrapped in a one-element list before encoding, so the result
    holds a single row.
    """
    model = EMBED_NAME()
    vector = model.encode([query], convert_to_numpy=True)
    return vector.astype("float32")

def retrieve_similar_chunks(query: str, k=k):
    """Return the stored chunks closest to ``query``.

    Args:
        query: Text to search for.
        k: Number of neighbours requested from the index.

    Returns:
        A list of dicts with keys "score", "doc_id" and "chunk". "score" is the
        value the index search reported for that hit, converted to ``float``.
        Hits whose position falls outside the metadata list are skipped, so
        fewer than ``k`` results may come back.
    """
    index, metadata = load_faiss_index()
    distances, indices = index.search(embed_query(query), k)

    hits = []
    for position, distance in zip(indices[0], distances[0]):
        if 0 <= position < len(metadata):
            entry = metadata[position]
            hits.append(
                {
                    "score": float(distance),
                    "doc_id": entry.get("doc_id"),
                    "chunk": entry.get("chunk"),
                }
            )
    return hits

In [26]:
import requests
import json
def build_prompt(question: str, contexts: List[Dict]):
    """Assemble the LLM prompt from a question and the retrieved chunks.

    Args:
        question: The user's question.
        contexts: Dicts with a "doc_id" and a "chunk" key.

    Returns:
        The instruction header, then one "Source: <doc_id>" section per
        context separated by "---" lines, then the question and a trailing
        "Answer:" cue.
    """
    instructions = "You are an AI assistant helping users find information from provided document excerpts.If the answer is not in the context , say I don't know based on the provided documents,  be concise Use the following excerpts to answer the question as accurately as possible.\n\n"
    sections = []
    for ctx in contexts:
        sections.append(f"Source: {ctx['doc_id']}\n\n{ctx['chunk']}")
    joined_sections = "\n\n---\n\n".join(sections)
    return "{}{}\n\nQuestion: {}\nAnswer:".format(instructions, joined_sections, question)

def call_ollama(prompt: str, model : str = OLLAMA_MODEL, max_tokens: int = 512, temperature: float = 0.0):
    """Send ``prompt`` to the Ollama generate endpoint and return the reply text.

    Args:
        prompt: Full prompt text.
        model: Name of the Ollama model to run.
        max_tokens: Upper bound on generated tokens, sent as the ``num_predict``
            option.
        temperature: Sampling temperature, sent as the ``temperature`` option.

    Returns:
        The stripped "response" field of the JSON reply, or an empty string if
        the request fails for any reason (the error is printed, not raised).
    """
    payload = {
        "model": model,
        "prompt": prompt,
        # Ask for a single JSON object instead of a stream of fragments.
        "stream": False,
        # Generation parameters must be nested under "options"; as top-level
        # keys named "max_tokens"/"temperature" they are not applied.
        "options": {
            "num_predict": max_tokens,
            "temperature": temperature,
        },
    }
    try:
        response = requests.post(OLLAMA_API_URL, json=payload, timeout=150)
        response.raise_for_status()
        data = response.json()

        return data.get("response", "").strip()
    except Exception as e:
        # Best effort by design: callers treat "" as "no answer".
        print("Error calling Ollama API:", str(e))
        return ""
    

In [27]:
def generate_answer(question: str, top_: int = k):
    """Answer ``question`` from the indexed documents.

    Retrieves the ``top_`` closest chunks, builds a prompt from them and sends
    it to Ollama.

    Returns:
        A dict with "answer" (the model's reply, or a fixed error message when
        the reply is empty) and "contexts" (the retrieved chunks).
    """
    contexts = retrieve_similar_chunks(question, k=top_)
    reply = call_ollama(build_prompt(question, contexts))
    answer = reply if reply else "Error: Ollama did not return output"
    return {"answer": answer.strip(), "contexts": contexts}

# Demo: ask one question and show the answer together with the chunks it was built from.
# NOTE: retrieval loads the saved index and raises FileNotFoundError when it is
# missing, so ingest_folder() has to have been run before this cell.
question = "What is the main topic discussed in the document?"
result = generate_answer(question)
print("Answer:", result["answer"])
print("\nContexts:")
pprint.pprint(result["contexts"])

    

Loading BGE model: BAAI/bge-small-en
Successfully loaded BGE model.
Answer: The main topic discussed in the document is a report on Pakistan's economic performance and current situation, including topics such as inflation, investment, agriculture, foreign exchange rates, economic resilience efforts, public health expenditure, educational reforms, and more. The document aims to provide an overview of the nation’s economic trends and challenges based on data from various sources and sectors within the government.

Contexts:
[{'chunk': 'strategic initiatives, based on data from the preceding fiscal '
           'year and up to the third quarter of the current fiscal year. '
           'Through rigorous analysis, the Survey aims to inform evidence- '
           'based policymaking and enrich public discourse on Pakistan’s '
           'economic trajectory. The timely preparation of this extensive '
           'document would not have been possible without the collaboration '
           'an

In [21]:
from pathlib import Path
# Check whether the persisted index files exist. These paths are relative to the
# current working directory rather than built from index_dir.
print("Index exists:",Path("vector_store/faiss.index").exists())
print("Metadata exists:",Path("vector_store/metadata.pkl").exists())

Index exists: False
Metadata exists: False


In [22]:
# Build (or extend) the FAISS index from every PDF in the default data folder.
ingest_folder()

Processing Highlights.pdf...
Loading BGE model: BAAI/bge-small-en
Successfully loaded BGE model.


Batches: 100%|██████████| 3/3 [00:16<00:00,  5.63s/it]

Saved FAISS index with 65 vectors.



