In [3]:
pip install chromadb faiss-cpu sentence-transformers langchain



In [5]:
from sentence_transformers import SentenceTransformer
import faiss
import chromadb
import numpy as np
import time


In [6]:
queries = [
    "What is the stock price of Apple?",
    "Show me my portfolio performance",
    "When is my next SIP due?",
    "What is the inflation rate in India?",
    "How can I reduce my credit card debt?",
    "List my top 5 performing mutual funds",
    "Predict Tesla stock for next week",
    "What are my tax saving options?",
    "Give me my EPF balance",
    "Compare Nifty and Sensex performance"
]

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(queries, convert_to_numpy=True, normalize_embeddings=True)
dim = embeddings.shape[1]


In [7]:
# Create FAISS index
faiss_index = faiss.IndexFlatL2(dim)
faiss_index.add(embeddings)

def faiss_search(query, k=3):
    start = time.time()
    q_emb = model.encode([query], normalize_embeddings=True)
    distances, indices = faiss_index.search(q_emb, k)
    elapsed = time.time() - start
    return [queries[i] for i in indices[0]], elapsed

results, time_taken = faiss_search("tax saving options")
print("FAISS Results:", results)
print("FAISS Search Time:", time_taken, "seconds")


FAISS Results: ['What are my tax saving options?', 'List my top 5 performing mutual funds', 'How can I reduce my credit card debt?']
FAISS Search Time: 0.016526460647583008 seconds


In [8]:
client = chromadb.Client()
collection = client.create_collection(name="financial_queries")

# Insert documents
for i, q in enumerate(queries):
    collection.add(documents=[q], embeddings=[embeddings[i]], ids=[str(i)])

def chroma_search(query, k=3):
    start = time.time()
    q_emb = model.encode([query], normalize_embeddings=True)
    results = collection.query(query_embeddings=q_emb.tolist(), n_results=k)
    elapsed = time.time() - start
    return results["documents"][0], elapsed

results, time_taken = chroma_search("tax saving options")
print("ChromaDB Results:", results)
print("ChromaDB Search Time:", time_taken, "seconds")


ChromaDB Results: ['What are my tax saving options?', 'List my top 5 performing mutual funds', 'How can I reduce my credit card debt?']
ChromaDB Search Time: 0.08368277549743652 seconds


In [9]:
test_query = "best mutual funds for investment"
faiss_res, faiss_time = faiss_search(test_query)
chroma_res, chroma_time = chroma_search(test_query)

print("\n🔍 Comparison for Query:", test_query)
print("FAISS:", faiss_res, "| Time:", round(faiss_time*1000, 3), "ms")
print("ChromaDB:", chroma_res, "| Time:", round(chroma_time*1000, 3), "ms")



🔍 Comparison for Query: best mutual funds for investment
FAISS: ['List my top 5 performing mutual funds', 'Show me my portfolio performance', 'What are my tax saving options?'] | Time: 20.694 ms
ChromaDB: ['List my top 5 performing mutual funds', 'Show me my portfolio performance', 'What are my tax saving options?'] | Time: 16.456 ms


In [10]:
import time
import numpy as np
import pandas as pd
import psutil
from sentence_transformers import SentenceTransformer
import faiss
import chromadb
from chromadb.utils import embedding_functions
from tabulate import tabulate

In [16]:
import random
import json
import re

# -------------------------
# 1. Define finance topics and phrases
# -------------------------
user_questions = [
    "What is the current NAV of {}?",
    "Should I continue my SIP in {}?",
    "How is my portfolio performing?",
    "Give me investment advice for {}.",
    "What is the risk associated with {}?",
    "How much tax will I pay on my {} gains?",
    "Can you suggest some good mutual funds?",
    "What's the trend for {} this month?",
    "Is it a good time to buy {}?",
    "How has {} performed historically?"
]

assistant_templates = [
    "The NAV for {} as of today is ₹{:.2f}.",
    "Based on your long-term goals, continuing the SIP in {} seems reasonable.",
    "{} is currently showing a {} trend.",
    "The risk for {} is considered {}.",
    "You may pay around ₹{:.2f} tax on your {} gains this year.",
    "Based on historical performance, {} could be a suitable choice.",
    "Your portfolio has gained/lost {:.2f}% over the past month.",
    "Considering market conditions, investing in {} now seems {}.",
    "I recommend diversifying your investment across equities, bonds, and funds.",
    "The historical returns of {} over the last 5 years average {:.2f}% per annum."
]

funds = ["HDFC Balanced Advantage Fund", "ICICI Prudential Bluechip Fund",
         "SBI Small Cap Fund", "Axis Long Term Equity Fund", "Nifty 50 ETF"]

risks = ["low", "moderate", "high"]
trends = ["upward", "downward", "stable"]
time_periods = ["month", "quarter", "year"]

# -------------------------
# 2. Generate synthetic dataset
# -------------------------
n_conversations = 2000  # each conversation will have multiple turns
synthetic_data = []

for conv_id in range(n_conversations):
    n_turns = random.randint(2, 6)  # 2–6 turns per conversation
    for turn in range(n_turns):
        # Alternate roles: user → assistant
        if turn % 2 == 0:
            q_template = random.choice(user_questions)
            fund_choice = random.choice(funds)
            question = q_template.format(fund_choice)
            synthetic_data.append({
                "role": "user",
                "text": question
            })
        else:
            a_template = random.choice(assistant_templates)
            fund_choice = random.choice(funds)
            trend = random.choice(trends)
            risk = random.choice(risks)
            tax = random.uniform(1000, 5000)
            nav = random.uniform(50, 500)
            gain = random.uniform(-10, 15)

            # Corrected formatting logic
            args = []
            placeholders = re.findall(r'\{\:?\.?\d*f?\}', a_template)

            for placeholder in placeholders:
                if placeholder == "{}":
                    args.append(random.choice([fund_choice, trend, risk]))
                elif "{:.2f}" in placeholder:
                    args.append(random.choice([nav, gain, tax]))
                else:
                    # Handle other potential placeholders if needed
                    pass

            a_text = a_template.format(*args)

            synthetic_data.append({
                "role": "assistant",
                "text": a_text
            })

# -------------------------
# 3. Save dataset to JSONL
# -------------------------
with open("synthetic_finance_chatbot_dataset.jsonl", "w") as f:
    for entry in synthetic_data:
        f.write(json.dumps(entry) + "\n")

print(f"Synthetic dataset generated with {len(synthetic_data)} turns.")

Synthetic dataset generated with 8071 turns.


In [18]:
import json
import random

# Load synthetic dataset
dataset_file = "synthetic_finance_chatbot_dataset.jsonl"
docs = []
with open(dataset_file, "r") as f:
    for i, line in enumerate(f):
        entry = json.loads(line)
        docs.append({
            "id": str(i),
            "role": entry["role"],
            "text": entry["text"]
        })

print(f"Total turns/documents: {len(docs)}")

# Prepare lists for embeddings & FAISS/Chroma
doc_texts = [d["text"] for d in docs]
doc_ids = [d["id"] for d in docs]


Total turns/documents: 8071


In [19]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(doc_texts, convert_to_numpy=True, show_progress_bar=True)
dim = embeddings.shape[1]


Batches:   0%|          | 0/253 [00:00<?, ?it/s]

In [20]:
user_queries = [
    "What is the current NAV of HDFC Balanced Advantage Fund?",
    "Should I continue my SIP in ICICI Prudential Bluechip Fund?",
    "How is my portfolio performing this month?",
    "Give me investment advice for SBI Small Cap Fund.",
    "What tax will I pay on my gains?"
]

query_embeds = model.encode(user_queries, convert_to_numpy=True).astype("float32")


In [21]:
# Compute brute-force top-5 indices for each query
baseline = [np.argsort(-np.dot(embeddings, q))[:5].tolist() for q in query_embeds]

def recall_at_k(hits, baseline, k=5):
    correct = sum(len(set(h[:k]) & set(b[:k])) for h, b in zip(hits, baseline))
    return round(correct / (len(hits) * k), 4)


In [22]:
import faiss
import time
import psutil

def get_mem_mb(): return psutil.Process().memory_info().rss / 1024 / 1024

start_mem = get_mem_mb()
t0 = time.time()

faiss_index = faiss.IndexFlatL2(dim)
faiss_index.add(embeddings.astype("float32"))

faiss_insert = time.time() - t0
faiss_mem = get_mem_mb() - start_mem

# Search
t1 = time.time()
_, faiss_res = faiss_index.search(query_embeds, k=5)
faiss_search = time.time() - t1
faiss_recall = recall_at_k(faiss_res.tolist(), baseline)


In [25]:
import chromadb
from chromadb.utils import embedding_functions
import math

client = chromadb.Client()

# Delete the collection if it already exists
try:
    client.delete_collection(name="finance_chatbot")
    print("Deleted existing collection 'finance_chatbot'")
except:
    print("Collection 'finance_chatbot' did not exist, creating new one.")

collection = client.create_collection(name="finance_chatbot")

start_mem = get_mem_mb()
t0 = time.time()

# Determine the maximum batch size
max_batch_size = 5461 # The error message indicates this limit

# Calculate the number of batches
num_batches = math.ceil(len(doc_ids) / max_batch_size)

# Add documents in batches
for i in range(num_batches):
    start_index = i * max_batch_size
    end_index = min((i + 1) * max_batch_size, len(doc_ids))

    batch_embeddings = embeddings[start_index:end_index].tolist()
    batch_documents = doc_texts[start_index:end_index]
    batch_ids = doc_ids[start_index:end_index]

    collection.add(
        embeddings=batch_embeddings,
        documents=batch_documents,
        ids=batch_ids
    )


chroma_insert = time.time() - t0
chroma_mem = get_mem_mb() - start_mem

# Search
t1 = time.time()
results = collection.query(query_embeddings=query_embeds.tolist(), n_results=5)
chroma_search = time.time() - t1

# Parse retrieved IDs
chroma_res = [[int(i) for i in r] for r in results["ids"]]
chroma_recall = recall_at_k(chroma_res, baseline)

Deleted existing collection 'finance_chatbot'


In [26]:
import pandas as pd
from tabulate import tabulate

df = pd.DataFrame([
    ["FAISS", round(faiss_insert,4), round(faiss_search,4), round(faiss_mem,2), faiss_recall],
    ["ChromaDB", round(chroma_insert,4), round(chroma_search,4), round(chroma_mem,2), chroma_recall]
], columns=["System", "Insert Time (s)", "Search Time (s)", "Memory (MB)", "Recall@5"])

print(tabulate(df, headers="keys", tablefmt="fancy_grid"))


╒════╤══════════╤═══════════════════╤═══════════════════╤═══════════════╤════════════╕
│    │ System   │   Insert Time (s) │   Search Time (s) │   Memory (MB) │   Recall@5 │
╞════╪══════════╪═══════════════════╪═══════════════════╪═══════════════╪════════════╡
│  0 │ FAISS    │            0.0332 │            0.0055 │         23.34 │       0.04 │
├────┼──────────┼───────────────────┼───────────────────┼───────────────┼────────────┤
│  1 │ ChromaDB │            6.6706 │            0.0058 │        109.79 │       0    │
╘════╧══════════╧═══════════════════╧═══════════════════╧═══════════════╧════════════╛


In [27]:
context_docs = []
context_texts = []

# We'll build documents as: previous 1 turn context + current assistant response
for i in range(len(docs)):
    if docs[i]["role"] == "assistant":
        # Take previous user turn as context
        prev_context = ""
        if i > 0 and docs[i-1]["role"] == "user":
            prev_context = docs[i-1]["text"]
        combined_text = prev_context + " " + docs[i]["text"]
        context_docs.append(docs[i]["id"])
        context_texts.append(combined_text)


In [28]:
context_docs = []
context_texts = []

# We'll build documents as: previous 1 turn context + current assistant response
for i in range(len(docs)):
    if docs[i]["role"] == "assistant":
        # Take previous user turn as context
        prev_context = ""
        if i > 0 and docs[i-1]["role"] == "user":
            prev_context = docs[i-1]["text"]
        combined_text = prev_context + " " + docs[i]["text"]
        context_docs.append(docs[i]["id"])
        context_texts.append(combined_text)


In [29]:
context_embeddings = model.encode(context_texts, convert_to_numpy=True, show_progress_bar=True)
dim = context_embeddings.shape[1]


Batches:   0%|          | 0/114 [00:00<?, ?it/s]

In [30]:
user_queries = [
    "What is the current NAV of HDFC Balanced Advantage Fund?",
    "Should I continue my SIP in ICICI Prudential Bluechip Fund?",
    "How is my portfolio performing this month?",
    "Give me investment advice for SBI Small Cap Fund.",
    "What tax will I pay on my gains?"
]

query_embeds = model.encode(user_queries, convert_to_numpy=True).astype("float32")


In [31]:
baseline = [np.argsort(-np.dot(context_embeddings, q))[:5].tolist() for q in query_embeds]

def recall_at_k(hits, baseline, k=5):
    correct = sum(len(set(h[:k]) & set(b[:k])) for h, b in zip(hits, baseline))
    return round(correct / (len(hits) * k), 4)


In [32]:
import faiss
import psutil
import time

def get_mem_mb(): return psutil.Process().memory_info().rss / 1024 / 1024

start_mem = get_mem_mb()
t0 = time.time()

faiss_index = faiss.IndexFlatL2(dim)
faiss_index.add(context_embeddings.astype("float32"))

faiss_insert = time.time() - t0
faiss_mem = get_mem_mb() - start_mem

# Search
t1 = time.time()
_, faiss_res = faiss_index.search(query_embeds, k=5)
faiss_search = time.time() - t1
faiss_recall = recall_at_k(faiss_res.tolist(), baseline)


In [33]:
import chromadb

client = chromadb.Client()
collection = client.create_collection(name="finance_chatbot_context")

start_mem = get_mem_mb()
t0 = time.time()

collection.add(
    embeddings=context_embeddings.tolist(),
    documents=context_texts,
    ids=context_docs
)

chroma_insert = time.time() - t0
chroma_mem = get_mem_mb() - start_mem

# Search
t1 = time.time()
results = collection.query(query_embeddings=query_embeds.tolist(), n_results=5)
chroma_search = time.time() - t1

# Parse retrieved IDs
chroma_res = [[int(i) for i in r] for r in results["ids"]]
chroma_recall = recall_at_k(chroma_res, baseline)


In [34]:
import pandas as pd
from tabulate import tabulate

df = pd.DataFrame([
    ["FAISS", round(faiss_insert,4), round(faiss_search,4), round(faiss_mem,2), faiss_recall],
    ["ChromaDB", round(chroma_insert,4), round(chroma_search,4), round(chroma_mem,2), chroma_recall]
], columns=["System", "Insert Time (s)", "Search Time (s)", "Memory (MB)", "Recall@5"])

print(tabulate(df, headers="keys", tablefmt="fancy_grid"))


╒════╤══════════╤═══════════════════╤═══════════════════╤═══════════════╤════════════╕
│    │ System   │   Insert Time (s) │   Search Time (s) │   Memory (MB) │   Recall@5 │
╞════╪══════════╪═══════════════════╪═══════════════════╪═══════════════╪════════════╡
│  0 │ FAISS    │            0.0107 │            0.0024 │         10.24 │       0.92 │
├────┼──────────┼───────────────────┼───────────────────┼───────────────┼────────────┤
│  1 │ ChromaDB │            2.9617 │            0.007  │         15.59 │       0    │
╘════╧══════════╧═══════════════════╧═══════════════════╧═══════════════╧════════════╛
