In [6]:
# @title Block 1: Unified Pipeline (Text + Tables)
# ======================================================
# 1. INSTALL & SETUP
# ======================================================
!pip install -q -U langchain-qdrant langchain-community qdrant-client sentence-transformers transformers accelerate bitsandbytes pandas

import os
import torch
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, models
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline

# ======================================================
# 2. LOAD AI MODELS
# ======================================================
print("\nLoading Embedding Model (BGE-M3)...")
# BGE-M3 is excellent for mixed content (Text + Structured)
hf_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-m3",
    encode_kwargs={'normalize_embeddings': True},
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
)

print("Loading LLM (Qwen2.5-1.5B)...")
# Lightweight model for describing table rows
model_id = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    temperature=0.1,
    do_sample=True,
    return_full_text=False
)
llm = HuggingFacePipeline(pipeline=text_pipeline)

# ======================================================
# 3. UNIFIED PARSER (HANDLES BOTH TEXT & TABLES)
# ======================================================
def parse_markdown_unified(md_text):
    """
    Scans the markdown file.
    - If it sees a Header (#), it updates the 'Current Section'.
    - If it sees a Table (|...|), it captures it as tabular data.
    - If it sees text, it captures it as a text block.
    """
    lines = md_text.split('\n')
    parsed_items = []

    current_section = "General Document"
    current_headers = []
    inside_table = False

    for line in lines:
        stripped = line.strip()
        if not stripped: continue

        # --- A. Header Detection ---
        if stripped.startswith("#"):
            current_section = stripped.lstrip("#").strip()
            inside_table = False
            continue

        # --- B. Table Row Detection ---
        if "|" in stripped and "---" not in stripped:
            cells = [c.strip() for c in stripped.split('|') if c.strip()]

            if not inside_table:
                # First row found -> These are headers
                inside_table = True
                current_headers = cells
            elif len(cells) == len(current_headers):
                # Data row found -> Map to headers
                row_dict = dict(zip(current_headers, cells))
                parsed_items.append({
                    "type": "table_row",
                    "section": current_section,
                    "data": row_dict
                })

        # --- C. Regular Paragraph Detection ---
        elif not inside_table:
            # It's not a header, not a table... it's content.
            parsed_items.append({
                "type": "text_block",
                "section": current_section,
                "text_content": stripped
            })

    return parsed_items

def generate_table_row_description(section, row_dict):
    """
    Uses LLM to convert a data row into a descriptive sentence.
    """
    data_str = ", ".join([f"{k}: {v}" for k, v in row_dict.items()])
    prompt = f"""<|im_start|>system
Convert this tabular data into a single, concise English sentence.
Context: {section}
Data: {data_str}<|im_end|>
<|im_start|>user
Write the sentence.<|im_end|>
<|im_start|>assistant
"""
    try: return llm.invoke(prompt).strip()
    except: return f"In {section}, the details are: {data_str}"

# ======================================================
# 4. EXECUTION & INDEXING
# ======================================================

# --- [STEP 1] LOAD YOUR MARKDOWN FILE ---
# REPLACE THIS PATH with your uploaded file path
file_path = "/content/budget_2025-2026.md"

# (Or use this dummy string for a quick test if no file is ready)
if not os.path.exists(file_path):
    print("File not found. Using DUMMY data for demonstration...")
    md_content = """
# Customs Duty Changes
## Chemical Sector
| Commodity | Old Rate | New Rate |
|---|---|---|
| Sorbitol | 30% | 20% |
| Pyrimidine Compounds | 10% | 7.5% |

# Rural Development
The government announces a new scheme for rural roads.
This will connect 500 new villages by 2026.
    """
else:
    with open(file_path, "r", encoding="utf-8") as f:
        md_content = f.read()

# --- [STEP 2] PROCESS DATA ---
print(f"\n--- Parsing Document ---")
items = parse_markdown_unified(md_content)
documents = []
metadatas = []

print(f"Generating Embeddings (LLM for Tables, Context-Injection for Text)...")

for entry in items:
    # STRATEGY A: TABULAR DATA -> LLM
    if entry['type'] == 'table_row':
        dense_text = generate_table_row_description(entry['section'], entry['data'])

        meta = entry['data'].copy() # Store raw values for filtering
        meta['section'] = entry['section']
        meta['type'] = 'table'

        documents.append(dense_text)
        metadatas.append(meta)
        print(f"   [Table]: {dense_text}")

    # STRATEGY B: TEXT DATA -> CONTEXT INJECTION
    elif entry['type'] == 'text_block':
        # "Orphan" text gets the section header attached
        dense_text = f"Context: {entry['section']}. Content: {entry['text_content']}"

        meta = {
            "section": entry['section'],
            "type": "text",
            "original_text": entry['text_content']
        }

        documents.append(dense_text)
        metadatas.append(meta)
        print(f"   [Text ]: {dense_text[:60]}...")

# --- [STEP 3] INDEX INTO QDRANT ---
print("\n--- Indexing into Qdrant ---")
client = QdrantClient(":memory:")
collection_name = "budget_unified_db"

if client.collection_exists(collection_name):
    client.delete_collection(collection_name)

client.create_collection(
    collection_name,
    vectors_config=models.VectorParams(size=1024, distance=models.Distance.COSINE)
)

vector_store = QdrantVectorStore(client=client, collection_name=collection_name, embedding=hf_embeddings)
vector_store.add_texts(documents, metadatas=metadatas)
print("Done! Database Ready.")


Loading Embedding Model (BGE-M3)...
Loading LLM (Qwen2.5-1.5B)...


Device set to use cuda:0



--- Parsing Document ---
Generating Embeddings (LLM for Tables, Context-Injection for Text)...
   [Text ]: Context: General Document. Content: <!-- image -->...
   [Text ]: Context: SPEECH OF NIRMALA SITHARAMAN MINISTER OF FINANCE. C...
   [Text ]: Context: February 1, 2025. Content: Hon'ble Speaker, I prese...
   [Text ]: Context: Introduction. Content: 1. This Budget continues our...
   [Text ]: Context: Introduction. Content: - a) accelerate growth,...
   [Text ]: Context: Introduction. Content: - b) secure inclusive develo...
   [Text ]: Context: Introduction. Content: - c) invigorate private sect...
   [Text ]: Context: Introduction. Content: - d) uplift household sentim...
   [Text ]: Context: Introduction. Content: - e) enhance spending power ...
   [Text ]: Context: Introduction. Content: 2. Together, we embark on a ...
   [Text ]: Context: Introduction. Content: 3. As  we  complete  the  fi...
   [Text ]: Context: Budget Theme. Content: 4. Our economy is the fastes...
   [Tex

In [7]:
# @title Block 2: Search Interface
# ======================================================
# SEARCH THE UNIFIED DATABASE
# ======================================================

print("üîé INTERFACE")
print("Type 'exit' to quit.\n")

while True:
    query = input("\nEnter Query: ")
    if query.lower() in ['exit', 'quit']: break

    # Search
    results = vector_store.similarity_search(query, k=3)

    print("\n--- Top Results ---")
    for i, res in enumerate(results):
        print(f"\nResult {i+1} ({res.metadata.get('type', 'unknown').upper()}):")
        print(f"   üìù Content: \"{res.page_content}\"")

        # If it's a table, show the raw values nicely
        if res.metadata.get('type') == 'table':
             print(f"   üìä Raw Data: {res.metadata}")

üîé INTERFACE
Type 'exit' to quit.


Enter Query: transfer of capital assets

--- Top Results ---

Result 1 (TEXT):
   üìù Content: "Context: 7. Amendment of definition of 'capital asset':. Content: - In order to bring clarity on the chargeability of income arising out of transfer of capital asset being securities held by an investment fund as referred to in section 115UB of the Act, the definition of capital asset is proposed to be amended."

Result 2 (TEXT):
   üìù Content: "Context: 10. Amendment of definition of 'capital asset':. Content: - In order to bring clarity on the chargeability of income arising out of transfer of capital asset being securities held by an investment fund as referred to in section 115UB of the Act, the definition of capital asset is proposed to be amended."

Result 3 (TEXT):
   üìù Content: "Context: 2. Parity in r ates of long term capital gain on transfer of securities by nonresident:. Content: - It is proposed to bring parity between the taxation of 

KeyboardInterrupt: Interrupted by user