In [None]:
# ======================
# 1. Install Dependencies
# ======================
!pip install transformers

import faiss
import numpy as np
import time
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Sentence-embedding model used to vectorise every document and query
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Summarization pipeline used later to compress the grocery transactions
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)

# ======================
# 2. Example Financial Data
# ======================
# One standalone financial fact per entry; list position is the document id.
raw_docs = [
    "User monthly income: ₹80000",
    "User spends ₹20000 on rent",
    "User spends ₹10000 on food",
    "User spends ₹3000 on transport",
    "Investment goal: ₹5 lakh in 2 years",
    "Mutual fund portfolio invested in SBI and HDFC funds",
    "FD maturity in December worth ₹1 lakh",
    "Paid ₹500 for groceries on Aug 1",
    "Paid ₹600 for groceries on Aug 8",
    "Paid ₹550 for groceries on Aug 15",
    "Savings at end of August: ₹30000",
]

# Each question is paired with the raw document that answers it, so the
# expected text can never drift away from the corpus.
_qa_pairs = [
    ("What is my investment goal?", raw_docs[4]),
    ("How much rent do I pay?", raw_docs[1]),
    ("What is my savings balance?", raw_docs[10]),
]

queries = [question for question, _answer in _qa_pairs]
expected_answers = [answer for _question, answer in _qa_pairs]

# ======================
# 3. Utility Functions
# ======================
def build_faiss_index(docs, model):
    """Embed ``docs`` with ``model`` and load the vectors into a flat L2 FAISS index.

    Args:
        docs: Strings to index; position ``i`` in ``docs`` becomes id ``i``
            in the returned index.
        model: Object whose ``encode(list_of_str)`` returns a 2-D array with
            one embedding row per document.

    Returns:
        A ``faiss.IndexFlatL2`` containing one vector per document.

    Raises:
        ValueError: If ``docs`` is empty, because the embedding dimension
            cannot be read from zero rows.
    """
    docs = list(docs)
    if not docs:
        raise ValueError("docs must contain at least one document")
    # FAISS only accepts C-contiguous float32 matrices.
    embeddings = np.ascontiguousarray(model.encode(docs), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index

def evaluate(index, docset, queries, expected_answers, model):
    """Run top-1 retrieval for every query and score it against the expected text.

    A query counts as correct when its expected answer is a substring of the
    single document retrieved for it. Each query/result pair is printed.

    Args:
        index: Object with ``search(vectors, k=...)`` returning
            ``(distances, ids)`` and an ``ntotal`` attribute.
        docset: Documents in the same order they were added to ``index``.
        queries: Query strings.
        expected_answers: Expected text for each query, paired by position.
        model: Object whose ``encode(list_of_str)`` returns query embeddings.

    Returns:
        Dict with ``accuracy`` (fraction of queries answered correctly, 0.0
        when there are no queries), ``size`` (``index.ntotal``) and
        ``query_time`` (seconds spent in the loop, printing included).
    """
    started = time.perf_counter()
    hits = 0
    for q, expected in zip(queries, expected_answers):
        q_vec = np.asarray(model.encode([q]), dtype=np.float32)
        _, ids = index.search(q_vec, k=1)
        top_id = int(ids[0][0])
        # FAISS pads missing neighbours with -1; treat that as "nothing
        # retrieved" instead of letting it select the last document.
        retrieved = docset[top_id] if 0 <= top_id < len(docset) else ""
        print(f"Query: {q}\nRetrieved: {retrieved}\nExpected: {expected}\n")
        if retrieved and expected in retrieved:
            hits += 1
    total = len(queries)
    return {
        "accuracy": hits / total if total else 0.0,
        "size": index.ntotal,
        "query_time": time.perf_counter() - started,
    }

# ======================
# 4. Write Context
# ======================
# Baseline: every raw document is stored and searched as-is.
print("=== WRITE CONTEXT ===")
write_index = build_faiss_index(docs=raw_docs, model=embed_model)
write_metrics = evaluate(
    index=write_index,
    docset=raw_docs,
    queries=queries,
    expected_answers=expected_answers,
    model=embed_model,
)

# ======================
# 5. Select Context
# (same as Write, but conceptually it's about retrieval)
# ======================
print("=== SELECT CONTEXT ===")
select_index = build_faiss_index(docs=raw_docs, model=embed_model)
select_metrics = evaluate(
    index=select_index,
    docset=raw_docs,
    queries=queries,
    expected_answers=expected_answers,
    model=embed_model,
)

# ======================
# 6. Compress Context (LLM-based summarization + rule-based aggregation)
# ======================
print("=== COMPRESS CONTEXT ===")

# Summarize long text data
long_text = " ".join(d for d in raw_docs if "Paid" in d)  # groceries transactions

# Keep max_length below the tokenised input length: the summarization pipeline
# warns when the requested summary may be longer than the text it summarises.
input_tokens = len(summarizer.tokenizer.encode(long_text))
summary_max = max(2, min(40, input_tokens - 1))
summary_min = min(15, summary_max - 1)
summary = summarizer(
    long_text, max_length=summary_max, min_length=summary_min, do_sample=False
)[0]['summary_text']

compressed_docs = [
    "Income: ₹80000/month, Rent: ₹20000, Food: ₹10000, Transport: ₹3000",
    "Investment goal: ₹5 lakh in 2 years",
    f"Groceries summary: {summary}",
    "Savings at end of August: ₹30000",
    "Investments: SBI & HDFC mutual funds, FD ₹1 lakh Dec maturity"
]

# The compressed documents reword the raw facts, so they are scored against
# snippets that actually occur in the compressed text. Scoring against the raw
# sentences would mark a correct retrieval (e.g. the rent line) as a miss.
compress_expected = [
    "Investment goal: ₹5 lakh in 2 years",
    "Rent: ₹20000",
    "Savings at end of August: ₹30000",
]

compress_index = build_faiss_index(compressed_docs, embed_model)
compress_metrics = evaluate(compress_index, compressed_docs, queries, compress_expected, embed_model)

# ======================
# 7. Isolate Context (Partition by domain)
# ======================
print("=== ISOLATE CONTEXT ===")

# Keyword lists that decide which partition a document (or query) belongs to
budget_terms = ("income", "spend", "rent", "grocer", "saving", "transport")
investment_terms = ("investment", "mutual fund", "fd", "portfolio")

budget_docs = [doc for doc in raw_docs if any(term in doc.lower() for term in budget_terms)]
investment_docs = [doc for doc in raw_docs if any(term in doc.lower() for term in investment_terms)]

budget_index = build_faiss_index(budget_docs, embed_model)
investment_index = build_faiss_index(investment_docs, embed_model)

# Score by hand: every query is first routed to one partition, then answered
# with a top-1 search inside that partition only.
start = time.time()
correct = 0
for q, expected in zip(queries, expected_answers):
    is_budget_query = any(term in q.lower() for term in budget_terms)
    idx = budget_index if is_budget_query else investment_index
    docset = budget_docs if is_budget_query else investment_docs
    D, I = idx.search(np.array(embed_model.encode([q])), k=1)
    retrieved = docset[I[0][0]]
    print(f"Query: {q}\nRetrieved: {retrieved}\nExpected: {expected}\n")
    correct += expected in retrieved

isolate_metrics = dict(
    accuracy=correct / len(queries),
    size=budget_index.ntotal + investment_index.ntotal,
    query_time=time.time() - start,
)

# ======================
# 8. Final Comparison
# ======================
print("=== RESULTS COMPARISON ===")
# One line per strategy, in the order the strategies were evaluated
for label, metrics in (
    ("Write Context:", write_metrics),
    ("Select Context:", select_metrics),
    ("Compress Context:", compress_metrics),
    ("Isolate Context:", isolate_metrics),
):
    print(label, metrics)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


=== WRITE CONTEXT ===


Your max_length is set to 40, but your input_length is only 32. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)


Query: What is my investment goal?
Retrieved: Investment goal: ₹5 lakh in 2 years
Expected: Investment goal: ₹5 lakh in 2 years

Query: How much rent do I pay?
Retrieved: User spends ₹20000 on rent
Expected: User spends ₹20000 on rent

Query: What is my savings balance?
Retrieved: Savings at end of August: ₹30000
Expected: Savings at end of August: ₹30000

=== SELECT CONTEXT ===
Query: What is my investment goal?
Retrieved: Investment goal: ₹5 lakh in 2 years
Expected: Investment goal: ₹5 lakh in 2 years

Query: How much rent do I pay?
Retrieved: User spends ₹20000 on rent
Expected: User spends ₹20000 on rent

Query: What is my savings balance?
Retrieved: Savings at end of August: ₹30000
Expected: Savings at end of August: ₹30000

=== COMPRESS CONTEXT ===
Query: What is my investment goal?
Retrieved: Investment goal: ₹5 lakh in 2 years
Expected: Investment goal: ₹5 lakh in 2 years

Query: How much rent do I pay?
Retrieved: Income: ₹80000/month, Rent: ₹20000, Food: ₹10000, Transport: ₹3

In [None]:
# ======================
# 1. Install dependencies
# ======================
import faiss, numpy as np, time, random
from sentence_transformers import SentenceTransformer

# Sentence-embedding model shared by every index build and query below
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# ======================
# 2. Generate Large Dummy Dataset (~75k records)
# ======================
# Vocabulary the synthetic records are drawn from
_SPEND_CATEGORIES = ('groceries', 'rent', 'food', 'transport', 'utilities')
_ASSET_TYPES = ('mutual fund', 'stocks', 'FD', 'bonds')

# 60k random spending transactions, one per synthetic day
transactions = [
    f"Paid ₹{random.randint(100, 5000)} for {random.choice(_SPEND_CATEGORIES)} on day {day}"
    for day in range(60000)
]

# 10k random investment log entries
investments = [
    f"Invested ₹{random.randint(1000, 50000)} in {random.choice(_ASSET_TYPES)} on day {day}"
    for day in range(10000)
]

# 5k random savings-balance updates
savings = [
    f"Savings balance updated to ₹{random.randint(20000, 1000000)} on day {day}"
    for day in range(5000)
]

# Full corpus: transactions first, then investments, then savings
all_docs = [*transactions, *investments, *savings]
print("Dataset size:", len(all_docs))

# Each query is paired with a keyword that a relevant hit must contain
# (synthetic ground truth)
_query_keyword_pairs = [
    ("How much did I spend on groceries?", "groceries"),
    ("What is my savings balance?", "savings balance"),
    ("What was my last investment?", "invested"),
]
queries = [question for question, _keyword in _query_keyword_pairs]
expected_keywords = [keyword for _question, keyword in _query_keyword_pairs]

# ======================
# 3. Helper Functions
# ======================
def build_faiss_index(docs):
    """Embed ``docs`` with the global ``model`` and return a flat L2 FAISS index.

    Args:
        docs: Strings to index; position ``i`` in ``docs`` becomes id ``i``
            in the returned index.

    Returns:
        A ``faiss.IndexFlatL2`` containing one vector per document.

    Raises:
        ValueError: If ``docs`` is empty, because the embedding dimension
            cannot be read from zero rows.
    """
    docs = list(docs)
    if not docs:
        raise ValueError("docs must contain at least one document")
    embeddings = model.encode(docs, batch_size=128, show_progress_bar=True)
    # FAISS only accepts C-contiguous float32 matrices.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index

def evaluate(index, docset, queries, expected_keywords):
    """Score top-3 retrieval: a query is correct if any hit contains its keyword.

    Queries are embedded with the global ``model``.

    Args:
        index: Object with ``search(vectors, k=...)`` returning
            ``(distances, ids)`` and an ``ntotal`` attribute.
        docset: Documents in the same order they were added to ``index``.
        queries: Query strings.
        expected_keywords: Lower-case keyword for each query, paired by
            position; it is matched against the lower-cased hits.

    Returns:
        Dict with ``accuracy`` (fraction of queries answered correctly, 0.0
        when there are no queries), ``size`` (``index.ntotal``) and
        ``query_time`` (seconds spent embedding and searching).
    """
    start = time.perf_counter()
    correct = 0
    for q, keyword in zip(queries, expected_keywords):
        q_vec = np.asarray(model.encode([q]), dtype=np.float32)
        _, ids = index.search(q_vec, k=3)  # top-3 for robustness
        # FAISS pads with -1 when the index holds fewer than k vectors; skip
        # those slots so they cannot select the last document by accident.
        retrieved = [docset[j] for j in ids[0] if j >= 0]
        if any(keyword in r.lower() for r in retrieved):
            correct += 1
    total = len(queries)
    return {
        "accuracy": correct / total if total else 0.0,
        "size": index.ntotal,
        "query_time": time.perf_counter() - start,
    }

# ======================
# 4. Write Context (store everything raw)
# ======================
print("=== WRITE CONTEXT ===")
write_index = build_faiss_index(all_docs)
write_metrics = evaluate(write_index, all_docs, queries, expected_keywords)

# ======================
# 5. Select Context (same index but retrieval logic)
# ======================
print("=== SELECT CONTEXT ===")
# Same documents and same model give the same vectors, so reuse the index
# instead of re-encoding the whole corpus a second time.
select_index = write_index
select_metrics = evaluate(select_index, all_docs, queries, expected_keywords)

# ======================
# 6. Compress Context (aggregate instead of raw transactions)
# ======================
print("=== COMPRESS CONTEXT ===")

# Rule-based compression: derive the aggregates from the generated records so
# the compressed context describes the same data as the raw corpus.
# Transaction records read "Paid ₹<amount> for <category> on day <n>".
spend_totals = dict.fromkeys(["groceries", "rent", "food", "transport", "utilities"], 0)
for record in transactions:
    parts = record.split()
    category = parts[3]
    spend_totals[category] = spend_totals.get(category, 0) + int(parts[1].lstrip("₹"))

compressed_transactions = [
    f"Total spent on {cat}: ₹{total}"
    for cat, total in spend_totals.items()
]

# Savings records read "Savings balance updated to ₹<amount> on day <n>".
savings_amounts = [int(entry.split()[4].lstrip("₹")) for entry in savings]
average_savings = sum(savings_amounts) // len(savings_amounts) if savings_amounts else 0

compressed_docs = compressed_transactions + [
    f"Average savings balance: ₹{average_savings}",
]
if investments:
    # The last list entry carries the highest day number, i.e. the latest one.
    compressed_docs.append(f"Latest investment: {investments[-1]}")

compress_index = build_faiss_index(compressed_docs)
compress_metrics = evaluate(compress_index, compressed_docs, queries, expected_keywords)

# ======================
# 7. Isolate Context (separate indexes)
# ======================
print("=== ISOLATE CONTEXT ===")

# Two domains: everyday money (spending + savings) and investments
budget_docs = [*transactions, *savings]
investment_docs = investments

budget_index = build_faiss_index(budget_docs)
investment_index = build_faiss_index(investment_docs)

# Score by hand: route each query to one partition, then take its top-3 hits
start = time.time()
correct = 0
for q, keyword in zip(queries, expected_keywords):
    lowered_q = q.lower()
    routes_to_budget = any(term in lowered_q for term in ("savings", "spend", "grocer"))
    if routes_to_budget:
        idx, docset = budget_index, budget_docs
    else:
        idx, docset = investment_index, investment_docs
    D, I = idx.search(np.array(model.encode([q])), k=3)
    retrieved = [docset[j] for j in I[0]]
    correct += any(keyword in r.lower() for r in retrieved)

isolate_metrics = dict(
    accuracy=correct / len(queries),
    size=budget_index.ntotal + investment_index.ntotal,
    query_time=time.time() - start,
)

# ======================
# 8. Final Comparison
# ======================
print("=== RESULTS COMPARISON ===")
# One line per strategy, in the order the strategies were evaluated
for label, metrics in (
    ("Write Context:", write_metrics),
    ("Select Context:", select_metrics),
    ("Compress Context:", compress_metrics),
    ("Isolate Context:", isolate_metrics),
):
    print(label, metrics)


Dataset size: 75000
=== WRITE CONTEXT ===


Batches:   0%|          | 0/586 [00:00<?, ?it/s]

=== SELECT CONTEXT ===


Batches:   0%|          | 0/586 [00:00<?, ?it/s]

=== COMPRESS CONTEXT ===


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

=== ISOLATE CONTEXT ===


Batches:   0%|          | 0/508 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

=== RESULTS COMPARISON ===
Write Context: {'accuracy': 1.0, 'size': 75000, 'query_time': 0.07089376449584961}
Select Context: {'accuracy': 1.0, 'size': 75000, 'query_time': 0.05225419998168945}
Compress Context: {'accuracy': 0.6666666666666666, 'size': 7, 'query_time': 0.026262521743774414}
Isolate Context: {'accuracy': 1.0, 'size': 75000, 'query_time': 0.04143953323364258}


In [None]:
import time

def ask_question(query, index, docset, top_k=3):
    """Retrieve and print the ``top_k`` documents closest to ``query``.

    The query is embedded with the global ``model``.

    Args:
        query: Natural-language question.
        index: FAISS index to search.
        docset: Documents in the same order they were added to ``index``.
        top_k: Maximum number of documents to return.

    Returns:
        The retrieved documents, best match first. Fewer than ``top_k`` items
        are returned when the index holds fewer vectors than requested.
    """
    start = time.perf_counter()
    q_vec = np.asarray(model.encode([query]), dtype=np.float32)
    _, ids = index.search(q_vec, k=top_k)
    # FAISS pads unused result slots with -1; skip them so they cannot be
    # mistaken for "the last document".
    retrieved = [docset[j] for j in ids[0] if j >= 0]
    elapsed = time.perf_counter() - start

    print(f"\n❓ Query: {query}")
    print(f"⚡ Retrieval Time: {elapsed:.4f} sec")
    print("📄 Retrieved Docs:")
    for rank, doc in enumerate(retrieved, 1):
        print(f"{rank}. {doc}")
    return retrieved

# Example queries, all run against the raw ("write") index over all_docs.
# Each call prints its own report. The final call is a bare expression, so a
# notebook additionally echoes the list it returns.
ask_question("How much did I spend on groceries?", write_index, all_docs)
ask_question("What is my savings balance?", write_index, all_docs)
ask_question("What was my last investment?", write_index, all_docs)



❓ Query: How much did I spend on groceries?
⚡ Retrieval Time: 0.0268 sec
📄 Retrieved Docs:
1. Paid ₹4315 for groceries on day 37508
2. Paid ₹4904 for groceries on day 17806
3. Paid ₹4075 for groceries on day 9975

❓ Query: What is my savings balance?
⚡ Retrieval Time: 0.0176 sec
📄 Retrieved Docs:
1. Savings balance updated to ₹641769 on day 2026
2. Savings balance updated to ₹395869 on day 1506
3. Savings balance updated to ₹461269 on day 360

❓ Query: What was my last investment?
⚡ Retrieval Time: 0.0166 sec
📄 Retrieved Docs:
1. Invested ₹49419 in stocks on day 1508
2. Invested ₹49877 in mutual fund on day 8573
3. Invested ₹49478 in stocks on day 8523


['Invested ₹49419 in stocks on day 1508',
 'Invested ₹49877 in mutual fund on day 8573',
 'Invested ₹49478 in stocks on day 8523']

In [None]:
import time
import numpy as np

def _query_keywords(query):
    """Return the content words of ``query``: lower-cased, punctuation and stop words removed."""
    stop_words = {
        "how", "much", "many", "what", "when", "where", "which", "who", "why",
        "was", "were", "did", "does", "the", "and", "for", "are", "you",
        "your", "with", "from", "that", "this", "have", "has", "had",
    }
    cleaned = "".join(ch if ch.isalnum() else " " for ch in query.lower())
    # Words of one or two letters ("i", "do", "is", "my") occur inside almost
    # every document and would make the keyword filter useless.
    return [w for w in cleaned.split() if len(w) > 2 and w not in stop_words]


def _docs_for_ids(docs, ids):
    """Map FAISS result ids to documents, skipping the -1 padding FAISS uses for empty slots."""
    return [docs[j] for j in ids if j >= 0]


def compare_all(query,
                write_index, all_docs,
                compress_index, compressed_docs,
                isolate_indexes, isolate_docs,
                top_k=3):
    """Answer ``query`` with all four context strategies and print a comparison.

    The query is embedded with the global ``model``.

    Args:
        query: Natural-language question.
        write_index, all_docs: Index over the raw corpus and its documents.
        compress_index, compressed_docs: Index over the compressed corpus and
            its documents.
        isolate_indexes: Mapping of domain name to that domain's index.
        isolate_docs: Mapping of domain name to that domain's documents, in
            index order.
        top_k: Maximum number of documents to retrieve per strategy.

    Returns:
        Dict keyed by ``"WRITE"``, ``"SELECT"``, ``"COMPRESS"`` and
        ``"ISOLATE"``. Each value holds ``time`` (seconds) and ``retrieved``:
        a list of documents, or for ``"ISOLATE"`` a dict of domain name to
        list of documents.
    """
    results = {}

    # --- WRITE CONTEXT ---
    start = time.perf_counter()
    q_vec = np.asarray(model.encode([query]), dtype=np.float32)
    _, ids = write_index.search(q_vec, k=top_k)
    elapsed = time.perf_counter() - start
    results["WRITE"] = {
        "time": elapsed,
        "retrieved": _docs_for_ids(all_docs, ids[0])
    }

    # --- SELECT CONTEXT ---
    # Pre-filter the corpus by the query's content words, then rank only the
    # surviving documents.
    start = time.perf_counter()
    keywords = _query_keywords(query)
    filtered_docs = []
    for doc in all_docs:
        lowered = doc.lower()
        if any(word in lowered for word in keywords):
            filtered_docs.append(doc)
    if filtered_docs:
        fd_vecs = np.ascontiguousarray(model.encode(filtered_docs), dtype=np.float32)
        fd_index = faiss.IndexFlatL2(fd_vecs.shape[1])
        fd_index.add(fd_vecs)
        q_vec = np.asarray(model.encode([query]), dtype=np.float32)
        _, ids = fd_index.search(q_vec, k=min(top_k, len(filtered_docs)))
        retrieved = _docs_for_ids(filtered_docs, ids[0])
    else:
        retrieved = []
    elapsed = time.perf_counter() - start
    results["SELECT"] = {"time": elapsed, "retrieved": retrieved}

    # --- COMPRESS CONTEXT ---
    start = time.perf_counter()
    q_vec = np.asarray(model.encode([query]), dtype=np.float32)
    _, ids = compress_index.search(q_vec, k=top_k)
    elapsed = time.perf_counter() - start
    results["COMPRESS"] = {
        "time": elapsed,
        "retrieved": _docs_for_ids(compressed_docs, ids[0])
    }

    # --- ISOLATE CONTEXT ---
    isolate_results = {}
    start = time.perf_counter()
    # The query vector is the same for every domain, so embed it once.
    q_vec = np.asarray(model.encode([query]), dtype=np.float32)
    for domain, idx in isolate_indexes.items():
        _, ids = idx.search(q_vec, k=top_k)
        isolate_results[domain] = _docs_for_ids(isolate_docs[domain], ids[0])
    elapsed = time.perf_counter() - start
    results["ISOLATE"] = {"time": elapsed, "retrieved": isolate_results}

    # ---- PRINT RESULTS ----
    print(f"\n❓ Query: {query}")
    for mode, data in results.items():
        print(f"\n=== {mode} CONTEXT ===")
        print(f"⚡ Time: {data['time']:.4f} sec")
        if mode != "ISOLATE":
            for rank, doc in enumerate(data["retrieved"], 1):
                print(f"{rank}. {doc}")
        else:
            for domain, docs in data["retrieved"].items():
                print(f"📂 {domain}:")
                for rank, doc in enumerate(docs, 1):
                    print(f"   {rank}. {doc}")

    return results


# ✅ Example calls (make sure you already built all 4 indexes first):
# compare_all takes the isolated indexes and their documents as dicts keyed by
# domain name; build both here from the partitions created in the ISOLATE step
# so the calls below do not depend on names defined nowhere else.
isolate_indexes = {"budget": budget_index, "investment": investment_index}
isolate_docs = {"budget": budget_docs, "investment": investment_docs}

compare_all("How much rent do I pay?", write_index, all_docs, compress_index, compressed_docs, isolate_indexes, isolate_docs, top_k=2)
compare_all("What is my investment goal?", write_index, all_docs, compress_index, compressed_docs, isolate_indexes, isolate_docs, top_k=2)
compare_all("What is my savings balance?", write_index, all_docs, compress_index, compressed_docs, isolate_indexes, isolate_docs, top_k=2)



❓ Query: How much rent do I pay?

=== WRITE CONTEXT ===
⚡ Time: 0.0276 sec
1. Paid ₹2500 for rent on day 44060
2. Paid ₹395 for rent on day 10020

=== SELECT CONTEXT ===
⚡ Time: 21.9548 sec
1. Paid ₹2500 for rent on day 44060
2. Paid ₹395 for rent on day 10020

=== COMPRESS CONTEXT ===
⚡ Time: 0.0085 sec
1. Total spent on rent: ₹32786 in August
2. Total spent on utilities: ₹35680 in August

=== ISOLATE CONTEXT ===
⚡ Time: 0.0364 sec
📂 budget:
   1. Paid ₹4185 for rent on day 37500
   2. Paid ₹4575 for rent on day 20232
📂 investment:
   1. Invested ₹43401 in bonds on day 2750
   2. Invested ₹40435 in bonds on day 1650

❓ Query: What is my investment goal?

=== WRITE CONTEXT ===
⚡ Time: 0.0249 sec
1. Invested ₹20359 in mutual fund on day 7803
2. Invested ₹37643 in mutual fund on day 8500

=== SELECT CONTEXT ===
⚡ Time: 0.4382 sec

=== COMPRESS CONTEXT ===
⚡ Time: 0.0093 sec
1. Latest investment: Mutual fund ₹20000 in SBI
2. Average savings balance: ₹500000

=== ISOLATE CONTEXT ===
⚡ Tim

{'WRITE': {'time': 0.020083189010620117,
  'retrieved': ['Savings balance updated to ₹436996 on day 2030',
   'Savings balance updated to ₹467776 on day 2037']},
 'SELECT': {'time': 1.9594650268554688,
  'retrieved': ['Savings balance updated to ₹436996 on day 2030',
   'Savings balance updated to ₹467776 on day 2037']},
 'COMPRESS': {'time': 0.007478237152099609,
  'retrieved': ['Average savings balance: ₹500000',
   'Total spent on utilities: ₹35680 in August']},
 'ISOLATE': {'time': 0.023494958877563477,
  'retrieved': {'budget': ['Savings balance updated to ₹641769 on day 2026',
    'Savings balance updated to ₹395869 on day 1506'],
   'investment': ['Invested ₹37974 in bonds on day 4018',
    'Invested ₹5896 in bonds on day 4014']}}}