In [1]:
import os
import requests
from rank_bm25 import BM25Okapi

from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from sentence_transformers import CrossEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [None]:

# --- Paths ---
# Directory scanned for source PDFs, and the on-disk location of the Chroma index.
DATASET_PATH = "datasetFED/"
PERSIST_DIR = "./FEDcoma_db"

# --- LLM endpoint ---
# URL that receives the chat-completion POST requests. It is read from the
# CHAT_URL environment variable so the real endpoint does not have to be
# committed with the notebook; the original "API" placeholder stays as default.
CHAT_URL = os.environ.get("CHAT_URL", "API")
LLM_MODEL = "QuantTrio/Qwen3-VL-32B-Instruct-AWQ"

# --- Chunking ---
# Passed to RecursiveCharacterTextSplitter as chunk_size / chunk_overlap.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

# --- Retrieval ---
# Candidates fetched per expanded query by the dense retriever and by BM25,
# and the number of chunks kept after cross-encoder reranking.
TOP_K_DENSE = 6
TOP_K_SPARSE = 6
FINAL_TOP_K = 5

In [11]:
print("Loading documents...")

# Load every file matching *.pdf in DATASET_PATH, one PyPDFLoader per file.
loader = DirectoryLoader(
    DATASET_PATH,
    glob="*.pdf",
    loader_cls=PyPDFLoader
)

documents = loader.load()

# Fail fast with a clear message: an empty corpus cannot be embedded or
# BM25-indexed, and the later cells would otherwise fail far from the cause.
if not documents:
    raise ValueError(
        f"No PDF documents were loaded from {DATASET_PATH!r}; "
        "check the path and that it contains *.pdf files."
    )

# Split the loaded documents into overlapping chunks for retrieval.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)

split_docs = text_splitter.split_documents(documents)

# Documents whose pages carry no extractable text produce no chunks at all.
if not split_docs:
    raise ValueError(
        f"Documents from {DATASET_PATH!r} produced no text chunks; "
        "the PDFs may contain no extractable text."
    )

print(f"Total chunks created: {len(split_docs)}")

Loading documents...


Multiple definitions in dictionary at byte 0x134c4f for key /Im7


Total chunks created: 41333


In [3]:
# Sentence-embedding model used both to build and to query the Chroma index.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# A bare directory is not proof of a usable index: an empty PERSIST_DIR (for
# example one left behind by an interrupted run) must trigger a rebuild
# instead of being loaded as an empty store.
has_persisted_index = os.path.isdir(PERSIST_DIR) and bool(os.listdir(PERSIST_DIR))

if not has_persisted_index:
    vectorstore = Chroma.from_documents(
        documents=split_docs,
        embedding=embedding_model,
        persist_directory=PERSIST_DIR
    )
    vectorstore.persist()
    print("Documents embedded and stored.")
else:
    # NOTE: the stored index is reused as-is and is not compared with the
    # current split_docs. Delete PERSIST_DIR after changing the dataset or the
    # chunking settings so the dense index matches the chunks BM25 is built on.
    vectorstore = Chroma(
        persist_directory=PERSIST_DIR,
        embedding_function=embedding_model
    )
    print("Loaded existing Chroma DB.")

# Dense retriever returning the TOP_K_DENSE nearest chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K_DENSE})


  vectorstore = Chroma(


Loaded existing Chroma DB.


In [12]:
# Sparse index: raw chunk texts, their whitespace tokenization, and the BM25
# model built over those token lists (positions line up with split_docs).
corpus = list(map(lambda chunk: chunk.page_content, split_docs))
tokenized_corpus = list(map(str.split, corpus))
bm25 = BM25Okapi(tokenized_corpus)

In [5]:
# Cross-encoder loaded once at notebook level; hybrid_retrieve_multi calls
# reranker.predict on (query, chunk text) pairs to order the merged candidates.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")



In [6]:
def _clean_query_line(line):
    """Strip list decoration from one line of the LLM reply.

    Removes surrounding whitespace, bullet characters, a leading "1." / "2)"
    style number and one pair of matching wrapping quotes. Returns an empty
    string when nothing but decoration was present.
    """
    text = line.strip().strip("-*• ").strip()

    # Drop list numbering such as "1. foo" or "2) foo". The marker must be
    # digits followed by "." or ")", so a query like "1099-K ..." is untouched.
    marker, _, remainder = text.partition(" ")
    if remainder and marker[-1:] in (".", ")") and marker[:-1].isdigit():
        text = remainder.strip()

    # Unwrap a fully quoted line; quotes would otherwise stick to the tokens.
    if len(text) >= 2 and text[0] == text[-1] and text[0] in "\"'":
        text = text[1:-1].strip()

    return text


def expand_query_multi(query):
    """Ask the chat model for alternative phrasings of a search query.

    Args:
        query: The user's original search query.

    Returns:
        A list of unique query strings in a stable order: the original query
        first, followed by the model's alternatives in the order returned.

    Raises:
        requests.HTTPError: If the chat endpoint answers with an error status.
    """

    prompt = f"""
Generate 3 alternative search queries for:

"{query}"

Make them more descriptive and include possible synonyms.
Return each on a new line only.
"""

    payload = {
        "model": LLM_MODEL,
        "messages": [
            {"role": "system", "content": "You improve search queries for document retrieval."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.2
    }

    r = requests.post(CHAT_URL, json=payload, timeout=60)
    r.raise_for_status()

    content = r.json()["choices"][0]["message"]["content"]

    # Filter after cleaning so decoration-only lines ("---", "-") do not turn
    # into empty queries.
    cleaned = (_clean_query_line(line) for line in content.splitlines())
    queries = [text for text in cleaned if text]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is deterministic and the original query always comes first.
    return list(dict.fromkeys([query] + queries))

In [19]:
def hybrid_retrieve_multi(query):
    """Retrieve the best chunks for a query with dense + BM25 search and reranking.

    The query is expanded into several phrasings; each phrasing is run through
    the dense retriever and the BM25 index, the hits are merged and
    de-duplicated by chunk text, and the merged pool is re-scored against the
    original query by the cross-encoder.

    Args:
        query: The user's question.

    Returns:
        Up to FINAL_TOP_K documents, best reranker score first. An empty list
        when no retriever produced a candidate.
    """

    expanded_queries = expand_query_multi(query)

    # Keyed by chunk text so identical chunks found by several queries or by
    # both retrievers are reranked only once.
    candidates = {}

    for q in expanded_queries:

        tokenized_query = q.split()
        if not tokenized_query:
            # A blank query carries no information for either retriever.
            continue

        # Dense retrieval
        for doc in retriever.invoke(q):
            candidates[doc.page_content] = doc

        # Sparse retrieval
        sparse_scores = bm25.get_scores(tokenized_query)

        top_sparse_idx = sorted(
            range(len(sparse_scores)),
            key=lambda i: sparse_scores[i],
            reverse=True
        )[:TOP_K_SPARSE]

        for i in top_sparse_idx:
            # A non-positive score is not treated as a hit (a chunk matching
            # no query term scores 0); without this check the top-k is padded
            # with arbitrary chunks when few or no chunks match.
            if sparse_scores[i] > 0:
                doc = split_docs[i]
                candidates[doc.page_content] = doc

    unique_docs = list(candidates.values())

    # Nothing to rerank; avoid handing the cross-encoder an empty batch.
    if not unique_docs:
        return []

    # Rerank against the original query, not the expansions.
    pairs = [(query, doc.page_content) for doc in unique_docs]
    scores = reranker.predict(pairs)

    ranked = sorted(
        zip(scores, unique_docs),
        reverse=True,
        key=lambda x: x[0]
    )

    return [doc for _, doc in ranked[:FINAL_TOP_K]]

In [9]:
def ask_llm(docs, query):
    """Answer a question with the chat model, restricted to the given chunks.

    Args:
        docs: Retrieved documents; their page_content values are joined with
            blank lines to form the context section of the prompt.
        query: The user's question.

    Returns:
        The model's reply text with surrounding whitespace removed.

    Raises:
        requests.HTTPError: If the chat endpoint answers with an error status.
    """
    context = "\n\n".join(doc.page_content for doc in docs)

    prompt = f"""
You are a document-based assistant.

Rules:
1. Answer ONLY using the provided context.
2. If the topic is discussed indirectly, summarize what the document explains.
3. If the topic is not mentioned at all, reply exactly:
   "Not found in the provided documents."
4. Do NOT add external knowledge.
5. Be clear and concise.

Context:
{context}

Question:
{query}

Answer:
"""

    system_message = {"role": "system", "content": "Answer strictly from document context."}
    user_message = {"role": "user", "content": prompt}

    response = requests.post(
        CHAT_URL,
        json={
            "model": LLM_MODEL,
            "messages": [system_message, user_message],
            "temperature": 0.2,
        },
        timeout=120,
    )
    response.raise_for_status()

    reply = response.json()
    first_choice = reply["choices"][0]
    return first_choice["message"]["content"].strip()


In [25]:
print("\nRAG ready. Type 'exit' to quit.")

# Interactive question/answer loop: read a question, retrieve supporting
# chunks, ask the LLM, print the answer. Runs until the user types "exit".
while True:

    try:
        # Strip so that " exit " still quits and blank lines are detectable.
        query = input("\nAsk a question: ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the user interrupted the prompt: end the session.
        break

    if query.lower() == "exit":
        break

    if not query:
        # Nothing to search for; do not spend a retrieval and an LLM call.
        continue

    print("\n--- Question ---")
    print(query)

    try:
        docs = hybrid_retrieve_multi(query)
        answer = ask_llm(docs, query)
    except requests.RequestException as exc:
        # A failed HTTP call (timeout, connection error, error status) should
        # not end the whole session; report it and prompt again.
        print(f"\nRequest to the LLM endpoint failed: {exc}")
        continue

    print("\nLLM Answer:\n")
    print(answer)
    print("\n")


RAG ready. Type 'exit' to quit.

--- Question ---
Updated Form 1099-K Reporting Thresholds

LLM Answer:

The updated Form 1099-K reporting thresholds require payment card companies, payment apps, and online marketplaces to send a Form 1099-K only if the amount of business transactions during the year meets or exceeds the reporting threshold. The document does not specify the exact dollar threshold, but it indicates that reporting is triggered based on the total amount of business transactions. Additionally, the form must be filed if the number of payment transactions (excluding refunds) processed through the payment card/third-party payer network meets certain criteria. The document also notes that the form and instructions are updated continuously and available at IRS.gov/Form1099K.


