### Dependencies 

In [None]:
# Use your environment manager; example with pip
!pip install python-docx PyMuPDF mail-parser langchain faiss-cpu sentence-transformers rank_bm25
!pip install transformers accelerate
!pip install sentence-transformers[torch]  # if needed
!pip install cross-encoder  # or use sentence-transformers CrossEncoder


[31mERROR: Could not find a version that satisfies the requirement mailparser (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for mailparser[0m[31m
Collecting accelerate
  Downloading accelerate-1.10.0-py3-none-any.whl (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.7/374.7 KB[0m [31m733.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: accelerate
Successfully installed accelerate-1.10.0
zsh:1: no matches found: sentence-transformers[torch]
[31mERROR: Could not find a version that satisfies the requirement cross-encoder (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for cross-encoder[0m[31m
[0m

In [None]:
# The pip distribution is named `mail-parser`; the module it provides is imported as `mailparser`.
!pip install mail-parser  # if needed

Collecting mail-parser
  Downloading mail_parser-4.1.4-py3-none-any.whl (27 kB)
Installing collected packages: mail-parser
Successfully installed mail-parser-4.1.4


In [8]:
!pip install cross-encoder  # or use sentence-transformers CrossEncoder

[31mERROR: Could not find a version that satisfies the requirement cross-encoder (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for cross-encoder[0m[31m
[0m

### Document Ingestion

In [9]:
import fitz  # PyMuPDF
import docx
import mailparser
import re
from typing import List, Dict

def extract_pdf_with_structure(pdf_path: str) -> List[Dict]:
    """Extract the non-empty text blocks of a PDF together with their position.

    Args:
        pdf_path: Path of the PDF file, opened with ``fitz.open``.

    Returns:
        One dict per non-empty block, in page order then block order, with keys
        ``text`` (the block's span texts joined with single spaces), ``page``
        (1-based page number), ``block`` (index of the block within its page;
        skipped empty blocks still consume an index) and ``heading`` (the block
        text when the heading heuristic fires, otherwise ``None``).
    """
    results = []
    # Open the document as a context manager so it is closed even when
    # extraction raises; otherwise the file handle stays open.
    with fitz.open(pdf_path) as doc:
        for pageno in range(doc.page_count):
            page = doc.load_page(pageno)
            blocks = page.get_text("dict")["blocks"]
            for b_idx, block in enumerate(blocks):
                # A block without a "lines" entry contributes no spans and is
                # dropped by the empty-text check below.
                spans = [
                    span.get("text", "")
                    for line in block.get("lines", [])
                    for span in line.get("spans", [])
                ]
                text = " ".join(spans).strip()
                if not text:
                    continue
                # Simple heading heuristic: short text that is all caps or ends with ':'.
                looks_like_heading = len(text) < 120 and (text.isupper() or text.endswith(":"))
                results.append({
                    "text": text,
                    "page": pageno + 1,
                    "block": b_idx,
                    "heading": text if looks_like_heading else None,
                })
    return results

def extract_docx_with_structure(docx_path: str):
    """Collect the non-blank paragraphs of a .docx file.

    Returns a list of dicts with keys ``text`` (stripped paragraph text),
    ``para_idx`` (index of the paragraph in ``doc.paragraphs``, blank ones
    included in the count) and ``heading`` (the text when the paragraph's
    style name contains "heading", case-insensitively, otherwise ``None``).
    """
    document = docx.Document(docx_path)
    records = []
    for position, paragraph in enumerate(document.paragraphs):
        content = paragraph.text.strip()
        if content:
            # Headings are recognised purely from the paragraph style name.
            style_name = paragraph.style.name.lower() if paragraph.style else ""
            is_heading = "heading" in style_name
            records.append({
                "text": content,
                "para_idx": position,
                "heading": content if is_heading else None,
            })
    return records

def extract_email(msg_path: str):
    """Parse an email file with mailparser and return it as a one-element list.

    The single dict carries ``text`` (the message body, or "" when the body is
    falsy) plus the parsed message's ``from_``, ``to`` and ``subject`` values
    under the keys ``from``, ``to`` and ``subject``.
    """
    parsed = mailparser.parse_from_file(msg_path)
    # Attachments are not handled here; parse them separately if required.
    record = {
        "text": parsed.body or "",
        "from": parsed.from_,
        "to": parsed.to,
        "subject": parsed.subject,
    }
    return [record]


### Clause-Aware Semantic Chunking

In [10]:
import re
from transformers import AutoTokenizer

# Shared tokenizer: semantic_chunker uses it to count tokens and build the overlap,
# and the BM25 cells use its tokenize() to split chunks and queries into terms.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Matches a numbered clause marker at the start of a block: "1 ", "1. ", "2.1 ", "3.4.1. ".
# The optional dot after the last number is what lets the common "1. Title" form match;
# whitespace must follow, so "1.5million" is not treated as a clause start.
CLAUSE_RE = re.compile(r'^\s*\d+(\.\d+)*\.?\s+')

def semantic_chunker(structured_parts, max_tokens=350, overlap_tokens=50):
    """
    Group structured text parts into token-bounded chunks that break on headings
    and numbered clauses.

    structured_parts: list of dicts from extract_pdf_with_structure. "text" is
        required; "heading", "page" (or "para_idx") and "block" are optional.
    max_tokens: a chunk is emitted as soon as the buffered text exceeds this many
        tokens (special tokens are not counted), so a chunk can overshoot by the
        part that crossed the limit.
    overlap_tokens: number of trailing tokens of an emitted chunk that are carried
        over as the beginning of the next chunk; 0 disables the overlap. The
        carried text is whatever tokenizer.decode returns, so it may differ in
        casing/spacing from the source text.

    returns list of chunks: dicts {text, pages, headings, sources}. The metadata
        describes only the parts added after the previous flush, not the overlap.
    """
    chunks = []
    buffer = ""
    buffer_meta = {"pages": set(), "headings": [], "source_blocks": []}
    # True once the buffer holds text that has not been emitted yet. After a flush the
    # buffer may still contain carried-over overlap; that alone must never become a
    # chunk (it would be a duplicate with empty metadata).
    has_new_text = False

    def encode(text):
        # Without special tokens: they are not content, would inflate the count and
        # would otherwise be decoded back into the overlap text as literal markers.
        return tokenizer.encode(text, add_special_tokens=False)

    def flush_buffer():
        nonlocal buffer, buffer_meta, has_new_text
        if not has_new_text or not buffer.strip():
            return
        chunks.append({
            "text": buffer.strip(),
            "pages": sorted(buffer_meta["pages"]),
            "headings": buffer_meta["headings"],
            "sources": buffer_meta["source_blocks"]
        })
        # Keep the last `overlap_tokens` tokens as the new buffer. The explicit > 0
        # check matters because toks[-0:] would be the whole token list.
        tail = encode(buffer)[-overlap_tokens:] if overlap_tokens > 0 else []
        buffer = tokenizer.decode(tail) if tail else ""
        buffer_meta = {"pages": set(), "headings": [], "source_blocks": []}
        has_new_text = False

    for part in structured_parts:
        text = part["text"]
        if part.get("heading") or CLAUSE_RE.match(text):
            # A heading or clause start closes the current chunk (no-op when the
            # buffer only holds overlap) and opens a new one.
            flush_buffer()
            # Put carried-over overlap on its own line instead of gluing the
            # heading directly onto its last word.
            if buffer and not buffer.endswith("\n"):
                buffer += "\n"
            buffer += (text + "\n")
            buffer_meta["headings"].append(part.get("heading") or "")
        else:
            buffer += (" " + text)
        has_new_text = True
        buffer_meta["pages"].add(part.get("page", part.get("para_idx", 0)))
        buffer_meta["source_blocks"].append((part.get("page", 0), part.get("block", 0)))
        if len(encode(buffer)) > max_tokens:
            flush_buffer()

    # Emit whatever is left; does nothing when only overlap remains.
    flush_buffer()
    return chunks


  from .autonotebook import tqdm as notebook_tqdm


### D — Embeddings: high-quality & normalization

In [11]:
from sentence_transformers import SentenceTransformer
import numpy as np
# Embedding model used by embed_texts for both the chunk texts and the search queries.
model = SentenceTransformer("BAAI/bge-large-en-v1.5")  # or your choice

def embed_texts(texts: List[str]):
    """Embed texts with the module-level `model` and L2-normalize each row.

    Args:
        texts: Strings to embed.

    Returns:
        float32 array of shape (len(texts), dim) whose non-zero rows have unit
        L2 norm, ready to be added to / searched in the FAISS index. An empty
        input yields an empty (0, dim) array instead of failing in the norm step.
    """
    if len(texts) == 0:
        # Nothing to encode: return a correctly shaped empty matrix.
        dim = model.get_sentence_embedding_dimension()
        return np.zeros((0, dim), dtype=np.float32)
    embs = np.asarray(
        model.encode(texts, show_progress_bar=True, convert_to_numpy=True),
        dtype=np.float32,
    )
    # L2-normalize for cosine in FAISS
    norms = np.linalg.norm(embs, axis=1, keepdims=True)
    # Leave all-zero rows as zeros rather than dividing by zero.
    norms[norms == 0] = 1.0
    return embs / norms


KeyboardInterrupt: 

### E — Vector DB (FAISS) with metadata mapping

In [None]:
import faiss
import json

def build_faiss_index(embs: np.ndarray, ids: List[int], ef_construction=200, M=64):
    """Build an HNSW index over `embs`, addressable by the caller-supplied ids.

    Args:
        embs: 2-D array with one embedding per row; converted to a contiguous
            float32 array before being added.
        ids: One integer id per row of `embs`, stored as int64.
        ef_construction: Value assigned to the index's `hnsw.efConstruction`.
        M: Second argument passed to `faiss.IndexHNSWFlat`.

    Returns:
        A `faiss.IndexIDMap` wrapping the HNSW index, so searches return the
        supplied ids. No metric is passed to `IndexHNSWFlat`, so the library
        default applies.
        NOTE(review): with the unit-norm rows produced by embed_texts, an L2
        ranking is the same as a cosine ranking; confirm the default is L2.

    Raises:
        ValueError: if `embs` is not 2-D or `ids` does not have one entry per row.
    """
    vectors = np.ascontiguousarray(embs, dtype="float32")
    if vectors.ndim != 2:
        raise ValueError(f"embs must be 2-D (n, dim), got shape {vectors.shape}")
    id_array = np.asarray(ids, dtype="int64")
    if id_array.shape != (vectors.shape[0],):
        raise ValueError(
            f"expected {vectors.shape[0]} ids (one per embedding), got shape {id_array.shape}"
        )
    d = vectors.shape[1]
    # HNSW index
    index = faiss.IndexHNSWFlat(d, M)
    index.hnsw.efConstruction = ef_construction
    # wrap in ID map so we can assign stable ids
    id_index = faiss.IndexIDMap(index)
    id_index.add_with_ids(vectors, id_array)
    return id_index


In [None]:
# metadata store: chunk id -> chunk dict ({text, pages, headings, sources}).
# Ids start at 1, matching the ids handed to build_faiss_index in the pipeline cell.
# NOTE: `chunks` must already exist when this cell runs; in this notebook it is
# produced by semantic_chunker in the Final Pipeline cell.
metadata_store = {chunk_id: chunk for chunk_id, chunk in enumerate(chunks, start=1)}
# Save metadata_store to disk. JSON object keys are strings, so the integer ids come
# back as str when the file is reloaded. utf-8 is set explicitly because
# ensure_ascii=False writes non-ASCII characters verbatim, which the platform's
# default encoding may not be able to represent.
with open("meta.json", "w", encoding="utf-8") as f:
    json.dump(metadata_store, f, ensure_ascii=False, indent=2)


### F — Hybrid retrieval: BM25 + Embeddings + union + rerank

#### 1) BM25 (exact-match for clauses & numbers)

In [None]:
from rank_bm25 import BM25Okapi
# NOTE: `chunks` must already exist when this cell runs; in this notebook it is only
# assigned in the Final Pipeline cell, which also rebuilds `tokenized_corpus` and `bm25`.
tokenized_corpus = [tokenizer.tokenize(c['text']) for c in chunks]
bm25 = BM25Okapi(tokenized_corpus)

def bm25_search(query, top_k=10):
    """Score every corpus entry against `query` with the module-level `bm25`.

    Returns a pair: a list with the (0-based) corpus positions of the `top_k`
    highest-scoring entries, best first, and the array of their scores.
    """
    query_terms = tokenizer.tokenize(query)
    all_scores = bm25.get_scores(query_terms)
    # argsort is ascending, so flip it before taking the first top_k positions.
    best_first = np.flip(np.argsort(all_scores))
    selected = best_first[:top_k]
    return list(selected), all_scores[selected]


#### 2) Embedding search

In [None]:
def embed_search(query, faiss_index, top_k=10):
    """Search `faiss_index` with the embedding of `query`.

    Args:
        query: Query string, embedded (and normalized) via embed_texts.
        faiss_index: Index exposing `search(vectors, k)`.
        top_k: Maximum number of hits requested from the index.

    Returns:
        (ids, distances) as two parallel Python lists, in the order returned by
        the index. Result slots the index could not fill are reported with the
        id -1; those are dropped, so fewer than `top_k` hits may be returned.
    """
    q_emb = embed_texts([query])  # normalized
    D, I = faiss_index.search(q_emb.astype('float32'), top_k)
    hits = [(int(i), float(d)) for i, d in zip(I[0], D[0]) if int(i) != -1]
    return [i for i, _ in hits], [d for _, d in hits]


#### 3) Combine results and rerank using CrossEncoder

In [None]:
from sentence_transformers import CrossEncoder
# Cross-encoder that hybrid_retrieve uses to score (query, chunk text) pairs.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # good speed/quality

def hybrid_retrieve(query, top_k=8):
    """Retrieve candidates with BM25 and the FAISS index, then rerank them.

    Both retrievers are asked for `top_k*2` hits. Their results are merged
    (BM25 first, duplicates removed, order kept), scored against the query with
    the module-level `reranker`, and the `top_k` best are returned.

    Returns:
        list of (id, text, score) sorted by score, highest first, where `id` is
        the key of the chunk in `metadata_store`. Empty when nothing was found.
    """
    bm25_positions, _ = bm25_search(query, top_k=top_k*2)
    emb_ids, _ = embed_search(query, faiss_index, top_k=top_k*2)

    # The two retrievers use different id spaces: bm25_search returns 0-based corpus
    # positions, while the FAISS index is built with the 1-based ids that key
    # metadata_store. Convert the BM25 positions only; shifting the FAISS ids as
    # well would point every embedding hit at the following chunk.
    bm25_ids = [int(pos) + 1 for pos in bm25_positions]
    # FAISS reports unfilled result slots as -1; they are not real ids.
    faiss_ids = [int(cid) for cid in emb_ids if int(cid) != -1]
    candidate_ids = list(dict.fromkeys(bm25_ids + faiss_ids))  # keep order, unique
    if not candidate_ids:
        return []
    candidate_texts = [metadata_store[cid]['text'] for cid in candidate_ids]

    # rerank
    pairs = [(query, t) for t in candidate_texts]
    scores = reranker.predict(pairs)
    ranked = sorted(zip(candidate_ids, candidate_texts, scores), key=lambda x: x[2], reverse=True)
    return ranked[:top_k]  # list of (id, text, score)


### Answer Generation

In [None]:
def generate_answer(query, top_chunks, llm_call_fn):
    """
    Build a source-grounded prompt for `query` and pass it to the LLM callback.

    top_chunks: iterable of (id, text, rerank_score) tuples; each becomes one
        numbered "[Source ...]" entry, with the score rounded to 3 decimals.
    llm_call_fn: callable that receives the prompt string; whatever it returns
        is returned unchanged (no parsing is done here).
    """
    # One labelled entry per retrieved chunk, numbered from 1.
    source_entries = []
    for position, (chunk_id, chunk_text, rerank_score) in enumerate(top_chunks, start=1):
        source_entries.append(
            f"[Source {position} | id={chunk_id} | score={round(rerank_score,3)}]\n{chunk_text}"
        )
    sources_text = "\n\n".join(source_entries)

    prompt = f"""
You are a policy/contract assistant. Use ONLY the information in the provided sources to answer the query.
If answer is not in sources, say "Not found in documents".
Provide:
1) Short answer (1-3 sentences)
2) Supporting quotes with source ids and pages
3) Explanation of why these sources match (2-3 lines)
4) Confidence (0-1)

SOURCES:
{sources_text}

QUERY: {query}

Answer in JSON only with keys: answer, evidence, explanation, confidence
"""
    # Implement llm_call_fn with the OpenAI/LLM client of choice.
    return llm_call_fn(prompt)


# Final Pipeline

In [None]:
# End-to-end run: ingest -> chunk -> index -> retrieve -> answer.
query = "What are the objectives of the mission?"

# 1. Ingest -> structured_parts
structured = extract_pdf_with_structure("policy.pdf")    # or docx/email functions

# 2. Chunk
chunks = semantic_chunker(structured)

# 3. Embed and build FAISS
texts = [c["text"] for c in chunks]
embs = embed_texts(texts).astype('float32')
ids = list(range(1, len(texts)+1))
faiss_index = build_faiss_index(embs, ids)
# id -> chunk, keyed by the same 1-based ids as the FAISS index. hybrid_retrieve
# reads this mapping, so it has to be rebuilt whenever `chunks` changes.
metadata_store = dict(zip(ids, chunks))

# 4. Build BM25
tokenized_corpus = [tokenizer.tokenize(t) for t in texts]
bm25 = BM25Okapi(tokenized_corpus)

# 5. Query -> hybrid_retrieve -> top_chunks
top_chunks = hybrid_retrieve(query, top_k=5)

# 6. Generate answer
# NOTE: `llm_call_fn` is not defined in the cells shown here; define a callable that
# takes the prompt string and returns the model's reply before running this step.
resp_json = generate_answer(query, top_chunks, llm_call_fn)

print(resp_json)