In [None]:
import os
import tomli
import re
import time
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.vectorstores import Chroma
from chromadb import PersistentClient
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.documents import Document
from langchain_core.document_loaders import BaseLoader
from langchain_cohere import CohereEmbeddings,ChatCohere
from langchain_core.runnables import RunnablePassthrough

In [None]:
%load_ext dotenv
%dotenv
# API key from free tier provided, will be deactivated after one week.
# Loading configs and API Keys
cohere_api_key = os.getenv('COHERE_API_KEY')
with open("../parameters.toml", "rb") as params:
          config = tomli.load(params)

#### Data Ingestion and Vector store creation

In [3]:
vector_store_path = config["rag"]["vector_store_path"]

In [4]:
# Creating a Persistent Client
persistent_client = PersistentClient(path=vector_store_path)
# Modifiable according to session
collection_name = config["rag"]["collection_name"]

In [5]:
# Custom Document Loader
class FinancialDocumentLoader(BaseLoader):
    """
    Custom loader for parsing financial documents from a plain text file.

    Expected format:
        Document 1: <Title Line>
        <Content>
        Document 2: <Title Line>
        <Content>
        ...

    Attributes:
        file_path (str): Path to the text file containing the financial documents.

    Methods:
        load(): Parses the file and returns a list of `Document` objects, each containing
                the content and metadata (source and title) for a single financial organization.
    """
    
    def __init__(self, file_path):
        self.file_path = file_path

    def load(self):
        with open(self.file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Used regex to split by -> Document:X
        raw_docs = re.split(r'\nDocument \d+: ', text)
        documents = []

        for i, chunk in enumerate(raw_docs[1:], start=1):  # Skiped the first empty split
            title_end = chunk.find("\n")
            title = chunk[:title_end].strip()
            content = chunk[title_end+1:].strip()
            metadata = {"source": f"Document {i}", "title": title}

            documents.append(Document(page_content=content, metadata=metadata))

        return documents

In [6]:
# Ingest data
source_data_path = config["rag"]["source_file_location"]
loader = FinancialDocumentLoader(source_data_path)
docs = loader.load()

In [7]:
docs

[Document(metadata={'source': 'Document 1', 'title': 'Blue Horizon Investments'}, page_content='Description:\nBased in London, Blue Horizon Investments is known for its innovative portfolio strategies. However, irregular transaction patterns and rapid, unexplained fund movements have raised concerns about possible insider trading and manipulation.'),
 Document(metadata={'source': 'Document 2', 'title': 'Cascade Capital Management'}, page_content='Description:\nCascade Capital Management, a venture capital firm specializing in tech investments, has grown rapidly but relies on a complex network of subsidiary shell companies. This structure has attracted regulatory scrutiny regarding transparency and compliance.'),
 Document(metadata={'source': 'Document 3', 'title': 'Eclipse Global Holdings'}, page_content='Description:\nEclipse Global Holdings is a diversified conglomerate with interests in multiple sectors. Recent investigations have linked several of its subsidiaries to irregular cont

In [8]:
# Removing new line char to save processed tokens
for doc in docs:
    doc.page_content = ' '.join(doc.page_content.split())

In [9]:
# Embedder instantiated
embeddings = CohereEmbeddings(
    cohere_api_key=cohere_api_key,
    model=config["llm"]["embedding_model"],
)

In [10]:
# Vector store instantiated
vector_store_from_client = Chroma(
        client=persistent_client,
        collection_name=collection_name,
        embedding_function=embeddings,
    )

  vector_store_from_client = Chroma(


In [11]:
# Adding Documents into the vector store
vector_store_from_client.add_documents(documents=docs)

['bad90191-dbe2-4f2f-ad11-d08e8c1f5755',
 '9c9025b0-5580-468f-8b00-208b28522482',
 'c0056089-6439-463f-be75-1ae0417890e6',
 '5c09f301-477a-46a9-8095-a0a70e9a221d',
 'b7eeccf1-fff9-4f3d-9324-71cc3b631325',
 '696efc84-166a-49af-8520-91c93d90a0ab',
 'ac6915da-daf5-4a48-959c-886db74835bc',
 '7373d29a-a994-4eaf-baf4-aa29c6c8a964',
 '61392672-4d54-4537-a4c2-34218be61b8a',
 '24042249-85b3-4b73-a69d-69882b802cd9',
 'dcf2fefe-754c-42ef-bd10-720eae5dc161']

#### Creating three types of retrievers - MMR (Maximal Marginal Relevance), Similarity Score Threshold and Similarity Search 

In [12]:
# Creating the retriever (Maximal Marginal Relevance)
retriever_mmr = vector_store_from_client.as_retriever(
    search_type='mmr',
    search_kwargs={
        "k": 4,              # number of documents to return
        "fetch_k": 9,        # number of documents to consider before reranking with MMR
        "lambda_mult": 0.7   # high lambda focuses more on relevance than diversity
    }
)

In [13]:
# Creating the retriever (Similarity Score Threshold)
retriever_sst = vector_store_from_client.as_retriever(
    search_type="similarity_score_threshold", 
    search_kwargs={
        "score_threshold": 0.4
        }
)

In [14]:
# Creating the retriever (Similarity Search)
retriever_ss = vector_store_from_client.as_retriever(
    search_type='similarity',
    search_kwargs={
        "k": 4              
    }
)

### Comparing the Retrievers

In [15]:
# Assumes all retrievers return a list of Documents
def compare_retrievers(query, retrievers: dict, embedding_model):
    """
    Compare MMR, similarity, and threshold-based retrievers for a given query.
    
    Args:
        query (str): The user query.
        retrievers (dict): Dictionary of retriever_name -> retriever_instance.
        embedding_model: Embedding model (e.g., OpenAIEmbeddings()).
    
    Returns:
        pd.DataFrame: Comparison table of latency and relevance.
    """
    results = []
    for name, retriever in retrievers.items():
        start_time = time.time()
        docs = retriever.invoke(query)
        end_time = time.time()
        duration = end_time - start_time

        # Compute cosine similarity between query and each doc
        similarities = []
        for doc in docs:
            doc_embedding = embedding_model.embed_query(doc.page_content)  
            doc_vec = np.array(doc_embedding).reshape(1, -1)
            query_embedding = embedding_model.embed_query(query)
            query_vec = np.array(query_embedding).reshape(1, -1)
            sim = cosine_similarity(query_vec, doc_vec)[0][0]
            similarities.append(sim)

        avg_relevance = np.mean(similarities) if similarities else 0.0

        results.append({
            "Retriever": name,
            "Retrieved_Docs": len(docs),
            "Avg Relevance (Cosine)": round(avg_relevance, 4),
            "Retrieval Time (s)": round(duration, 4)
        })

    return pd.DataFrame(results)

In [None]:
# Define your retrievers
retrievers = {
    "MMR": retriever_mmr,
    "Similarity": retriever_ss,
    "Similarity + Threshold": retriever_sst
}

query = "Which organizations show signs of complex laundering structures?"

comparison_df = compare_retrievers(query, retrievers, embeddings)
comparison_df

No relevant docs were retrieved using the relevance score threshold 0.4


Unnamed: 0,Retriever,Retrieved_Docs,Avg Relevance (Cosine),Retrieval Time (s)
0,MMR,4,0.4525,0.377
1,Similarity,4,0.4596,0.3561
2,Similarity + Threshold,0,0.0,0.3606


In [17]:
retriever_ss.invoke("Which institutions demonstrate strong internal audits and transparent operations?")

[Document(metadata={'source': 'Document 7', 'title': 'Falcon Secure Bank'}, page_content='Description: Falcon Secure Bank is a digital-only institution acclaimed for its cutting-edge cybersecurity and rigorous internal audits. The bank consistently demonstrates strong compliance practices and transparent operations, earning the trust of its customers and regulators alike.'),
 Document(metadata={'source': 'Document 7', 'title': 'Falcon Secure Bank'}, page_content='Description: Falcon Secure Bank is a digital-only institution acclaimed for its cutting-edge cybersecurity and rigorous internal audits. The bank consistently demonstrates strong compliance practices and transparent operations, earning the trust of its customers and regulators alike.'),
 Document(metadata={'source': 'Document 7', 'title': 'Falcon Secure Bank'}, page_content='Description: Falcon Secure Bank is a digital-only institution acclaimed for its cutting-edge cybersecurity and rigorous internal audits. The bank consiste

In [18]:
retriever_mmr.invoke("Which institutions demonstrate strong internal audits and transparent operations?")

[Document(metadata={'source': 'Document 7', 'title': 'Falcon Secure Bank'}, page_content='Description: Falcon Secure Bank is a digital-only institution acclaimed for its cutting-edge cybersecurity and rigorous internal audits. The bank consistently demonstrates strong compliance practices and transparent operations, earning the trust of its customers and regulators alike.'),
 Document(metadata={'source': 'Document 7', 'title': 'Falcon Secure Bank'}, page_content='Description: Falcon Secure Bank is a digital-only institution acclaimed for its cutting-edge cybersecurity and rigorous internal audits. The bank consistently demonstrates strong compliance practices and transparent operations, earning the trust of its customers and regulators alike.'),
 Document(metadata={'source': 'Document 6', 'title': 'Delta Trade Corporation'}, page_content='Description: Delta Trade Corporation operates across Europe and Asia with a long-standing reputation for transparency. Its high-frequency trading is 

### Choosing maximal marginal relevance since 
- Upon multiple experiments the avg relevance was similar for both MMR and Similarity but the retrieval time is less in latter.
- But upon further experiments, it was noted that similarity search resulted in repeated retrievals, ignoring the relevance
- Similarity + Threshold as a high chance of missing documents (fetched only 1 relevant doc at threshold - 0.4)
- MMR chosen to avoid redudancy and maintain high relevance

### Prompt Engineering + LLM to construct coherent answers

In [20]:
chat = ChatCohere(cohere_api_key=cohere_api_key)

### Prompt Variations to test against - 
- Base Prompt (To establish a baseline for comparision)
- Risk Focused Framing (Emphasize the goal of risk detection for more focused and structured answers.)
- Regulatory tone with Evaluation Metric (Make the LLM evaluate)
- Chain-of-Thought Prompting (Encourage the LLM to reason)
- Focused Extraction (Improve precision by restricting to relevant names)
- Summarize + Analyze (Make the LLM reprocess the data to improve its understanding before answering.)

In [21]:
# TEMPLATES

# Base Prompt
TEMPLATE_base = """
Use the following context to answer the question:
{retrieved_documents}
Question: {query}
Answer:
"""

# Risk Focused Framing
TEMPLATE_risk_focused = """
You are a compliance analyst. Based on the context below, 
identify any signs of financial crime or compliance risks.
Context:
{retrieved_documents}
Query: {query}
Response:
"""

# Regulatory tone + eval metric
TEMPLATE_regu_eval = """
As a regulator, assess the following organizations based on context:
{retrieved_documents}

For each entity mentioned in the context:
- Assign a **Compliance Risk Level**: High, Medium, or Low
- Justify the rating in one sentence using evidence from the context.
Be precise, formal, and analytical.

Query: {query}
Response:
"""

# Chain of Thought
TEMPLATE_CoT = """
Analyze the context below step by step and determine
if any organizations show patterns related to financial misconduct.
Context:
{retrieved_documents}
Query: {query}
Step-by-step analysis:
"""

# Focused Extraction
TEMPLATE_focus_extract = """
Read the context below and extract only the organization names that 
match the criteria in the query.
Context:
{retrieved_documents}
Query: {query}
Matching organizations:
"""

# Summarize + Analyse
TEMPLATE_summarize = """
Summarize the key characteristics of each organization in the context, 
then answer the query.
Context:
{retrieved_documents}
Summary + Answer:
"""

In [22]:
prompt_template = PromptTemplate.from_template(TEMPLATE_regu_eval)
chain = ({'retrieved_documents':retriever_ss,
         'query':RunnablePassthrough()} | prompt_template | chat)

In [23]:
res = chain.invoke("Which organisations show signs of potential money laundering through complex structures")

In [24]:
print(res.content)

**Gemini Asset Management**  
**Compliance Risk Level**: High  
**Justification**: Gemini Asset Management exhibits high compliance risk due to unusually high commissions and inconsistent portfolio reporting, which are red flags for potential money laundering and fraudulent practices.  

**Eclipse Global Holdings**  
**Compliance Risk Level**: High  
**Justification**: Eclipse Global Holdings faces high compliance risk due to irregular contract awards and suspected kickback schemes across its subsidiaries, indicating weak internal controls and potential illicit financial activities.  

**Organizations showing signs of potential money laundering through complex structures**:  
- **Gemini Asset Management**: The anomalies in commissions and portfolio reporting suggest potential layering or integration of illicit funds through complex financial transactions.  
- **Eclipse Global Holdings**: The involvement of multiple subsidiaries in irregular contract awards and kickback schemes indicate

### Comparing results from prompt variations

In [25]:
prompt_templates = {
    "base": TEMPLATE_base,
    "risk_focused": TEMPLATE_risk_focused,
    "regulatory tone with evaluation" : TEMPLATE_regu_eval,
    "chain_of_thought": TEMPLATE_CoT,
    "focused_extraction": TEMPLATE_focus_extract,
    "summarize_analyze": TEMPLATE_summarize
}

In [26]:
def test_prompts(prompt_templates, query):
    """
    Evaluates multiple prompt templates for a given query and returns performance metrics.

    This function runs a set of prompt templates through a predefined LLM chain (including a retriever
    and chat model) and evaluates their performance based on token usage and response time.
    It returns the outputs along with performance metrics in a pandas DataFrame.

    Args:
        prompt_templates (dict): 
            A dictionary where keys are the names of the prompt templates and values are template strings.
        query (str): 
            The user query to be evaluated across all prompt templates.

    Returns:
        pd.DataFrame: 
            A DataFrame containing:
                - 'Prompt Name': Name of the prompt template
                - 'Output': LLM's generated response
                - 'Tokens Count - Input': Number of tokens in the prompt input
                - 'Tokens Count - Output': Number of tokens in the generated output
                - 'Tokens Consumed': Total tokens used (input + output)
                - 'Response Time (s)': Time taken to generate the response (in seconds)

    Note:
        - The function assumes existence of a `retriever_mmr` object for document retrieval 
          and a `chat` object representing the LLM pipeline.
    """
    results = []

    for name, template in prompt_templates.items():
        prompt_template = PromptTemplate.from_template(template)
        chain = ({'retrieved_documents':retriever_mmr,
         'query':RunnablePassthrough()} | prompt_template | chat)

        start_time = time.time()
        response = chain.invoke(query)
        elapsed_time = time.time() - start_time

        token_usage = response.additional_kwargs['token_count']

        results.append({
            "Prompt Name": name,
            "Output": response.content,
            "Tokens Count - Input": token_usage['input_tokens'],
            "Tokens Count - Output": token_usage['output_tokens'],
            "Tokens Consumed": sum(token_usage.values()),
            "Response Time (s)": round(elapsed_time, 2),
        })

    return pd.DataFrame(results)

In [27]:
results = test_prompts(prompt_templates,"Which organisations show signs of potential money laundering through complex structures")

### Initial observations 
- Depending on the usecase prompt variation can be chosen 
- If the task is focused solely on entity extraction, focused_extraction is good with response time of 3.10 seconds. 
- It is still required to dive deeper into each prompt variation before finalising since we still have no idea if the answers generated are relevant (Precision, Recall) 
- Map them to a clarity score. 
- In the end a note will be provided to clarify where to use which variant.

## Clarity Mapping
### A clear LLM output:

-Directly mentions the relevant organizations (by name).

-Avoids overly verbose or vague phrasing.

-Uses fewer unnecessary filler words.

-Is grammatically correct and coherent.

-Keeps token usage efficient

#### Two Functions created to measure LLM output performance - 
- calculate_precision_recall for retriever
- compute_clarity for LLM response

these functions will help us better understand which prompt variation is best in terms of finding the best answer with good token efficiency

In [28]:
test_set_ = { 'queries' : [
    "Which organizations show signs of potential money laundering?",
    "Identify firms flagged for insider trading or suspicious market manipulation.",
    "Which firms operate through complex shell company structures that may obscure transparency?",
    "Which institutions demonstrate strong internal audits and transparent operations?",
    "Find organizations that rely on opaque financial structures or mechanisms.",
    "Identify low-risk institutions with a clean track record in compliance."
],
             'expected_outputs' : [
    "Aurora Financial Services,Gemini Asset Management",
    "Blue Horizon Investments",
    "Cascade Capital Management",
    "Falcon Secure Bank,Jupiter Commodities Exchange,Kepler Financial Innovations,Lunar Investment Group",
    "Aurora Financial Services,Cascade Capital Management",
    "Delta Trade Corporation,Falcon Secure Bank,Lunar Investment Group,Kepler Financial Innovations,Ionex Brokerage Services"
             ]

}

In [29]:
def calculate_precision_recall(query, expected_orgs):
    """
    Calculates precision and recall for retrieved documents based on expected organization matches.

    This function uses a retriever to fetch documents based on a query and compares the titles
    of the retrieved documents against a list of expected organization names. It calculates:
    - Precision: Proportion of relevant organizations among the retrieved ones.
    - Recall: Proportion of relevant organizations successfully retrieved.
    - True Positives: Set of organizations correctly identified as relevant.

    Args:
        query (str): 
            The user query to retrieve documents for.
        expected_orgs (str): 
            A comma-separated string of expected organization names.

    Returns:
        tuple:
            - precision (float): Proportion of relevant documents among retrieved ones.
            - recall (float): Proportion of relevant documents that were successfully retrieved.
            - true_positives (set): Set of organization names that match both expected and retrieved titles.

    Note:
        - The function assumes the presence of a `retriever_mmr` object that returns documents
          with metadata containing a 'title' field representing the organization name.
    """
    retrieved_companies = []
    retrieved_docs = retriever_mmr.invoke(query)
    for i in retrieved_docs:
        retrieved_companies.append(i.metadata['title'])
    expected_orgs = expected_orgs.split(",")
    expected_set = set(expected_orgs)
    retrieved_set = set(retrieved_companies)

    true_positives = expected_set.intersection(retrieved_set)
      
    precision = len(true_positives) / len(retrieved_companies) if retrieved_companies else 0
    recall = len(true_positives) / len(expected_orgs) if expected_orgs else 0

    return round(precision, 2), round(recall, 2), true_positives     


In [30]:
def compute_clarity(response, matches):
    """
    Computes a clarity score for an LLM-generated response based on named entity density 
    and lexical repetition.

    Clarity is evaluated as a combination of:
    - Named Entity Density: The proportion of matched organization names
      relative to the total word count in the response.
    - Repetition Penalty: Penalizes excessive repetition of words in the output to 
      reward more informative and diverse content.

    Args:
        response (object): 
            An LLM response object containing the generated text.
        matches (list of str): 
            A list of organization names (matched entities) found in the expected output.

    Returns:
        float:
            A clarity score, where a higher value indicates clearer, denser, and 
            less repetitive output.

    Notes:
        - The function performs a basic cleaning by removing commas and line breaks before analysis.
        - A higher named entity density and lower repetition lead to a higher clarity score.
    """
    llm_output_clean = response.content.lower().replace(",", " ").replace("\n", " ")
    # Named Entity Density
    delimiter_space = " "
    org_density = len(delimiter_space.join(matches).split()) / len(llm_output_clean.split())

    # Repetition Penalty: count repeating words
    words = re.findall(r'\b\w+\b', llm_output_clean)
    unique_words = set(words)
    repetition_ratio = 1 - (len(unique_words) / len(words)) if words else 0

    # Clarity = density * (1 - repetition penalty)
    clarity_score = org_density * (1 - repetition_ratio)

    return round(clarity_score * 100, 2)

In [31]:
# Let's examine which prompt variant provides most relevance
def llm_judge(query,expected_output,template_type):
    """
    Evaluates a single LLM response pipeline using a given prompt template to determine 
    retrieval relevance, output clarity, and overall response quality.

    This function:
    - Runs a retrieval-augmented generation (RAG) pipeline using the specified query and prompt template.
    - Measures response latency and token usage.
    - Calculates retrieval precision and recall by comparing retrieved company names with the expected output.
    - Computes clarity of the LLM's textual output based on named entity density and repetition.
    - Prints a detailed breakdown of the performance, including matched companies and clarity score.

    Args:
        query (str): 
            The input query to evaluate (e.g., "Which organizations show signs of money laundering?").
        expected_output (str): 
            A comma-separated string of expected organization names.
        template_type (str): 
            The prompt template text used to format the query before passing it to the LLM.

    Returns:
        None: 
            The function prints a detailed analysis to the console, including:
                - Retrieved companies
                - LLM output
                - Token consumption
                - Precision and recall metrics
                - Matched organizations
                - Clarity score
                - Response time in seconds

    """
    prompt_template = PromptTemplate.from_template(template_type)
    chain = ({'retrieved_documents':retriever_mmr,
         'query':RunnablePassthrough()} | prompt_template | chat)
    start_time = time.time()
    response = chain.invoke(query)
    elapsed_time = time.time() - start_time
    tokens_consumed = sum(response.additional_kwargs['token_count'].values())
    retrieved_companies = retriever_mmr.invoke(query)
    precision,recall,matches = calculate_precision_recall(query,expected_output)
    print(f"QUERY -> {query}")
    print("Retriever retrieved :")
    for i in retrieved_companies:
        print(i.metadata['title'])
    print(f"""
    ------------------------------------      
    expected companies = {expected_output}
    ------------------------------------      
    response : 
    {response.content}
    ------------------------------------
    total_tokens_consumed = {tokens_consumed}
    ------------------------------------
    retrieval precision = {precision}
    retrieval recall = {recall}
    ----------------------------------- 
    matched companies = {matches}
    -----------------------------------
    clarity_score = {compute_clarity(response,matches)}
    ---------------------------------
    Response Time (s): {round(elapsed_time, 2)} seconds
    """)

In [32]:
prompt_templates

{'base': '\nUse the following context to answer the question:\n{retrieved_documents}\nQuestion: {query}\nAnswer:\n',
 'risk_focused': '\nYou are a compliance analyst. Based on the context below, \nidentify any signs of financial crime or compliance risks.\nContext:\n{retrieved_documents}\nQuery: {query}\nResponse:\n',
 'regulatory tone with evaluation': '\nAs a regulator, assess the following organizations based on context:\n{retrieved_documents}\n\nFor each entity mentioned in the context:\n- Assign a **Compliance Risk Level**: High, Medium, or Low\n- Justify the rating in one sentence using evidence from the context.\nBe precise, formal, and analytical.\n\nQuery: {query}\nResponse:\n',
 'chain_of_thought': '\nAnalyze the context below step by step and determine\nif any organizations show patterns related to financial misconduct.\nContext:\n{retrieved_documents}\nQuery: {query}\nStep-by-step analysis:\n',
 'focused_extraction': '\nRead the context below and extract only the organizati

In [None]:
def generate_result_tables(prompt_templates, query, expected_output):
    """
    Runs a retrieval-augmented generation (RAG) pipeline for multiple prompt templates and 
    evaluates the performance of each variant on a given query.

    This function:
    - Executes an LLM chain for each prompt template using the specified query.
    - Measures token usage and response time.
    - Computes retrieval precision and recall against an expected list of organizations.
    - Calculates clarity of the LLM output using named entity density and repetition penalty.
    - Collects retrieved company titles and matched entities.

    Args:
        prompt_templates (dict): 
            A dictionary where keys are template names and values are prompt template strings.
        query (str): 
            The input query to be evaluated.
        expected_output (str): 
            A comma-separated string of expected organization names.

    Returns:
        pd.DataFrame: 
            A DataFrame where each row corresponds to a prompt template variant, 
            containing the following columns:
                - "Prompt Name"
                - "Output"
                - "Tokens Count - Input"
                - "Tokens Count - Output"
                - "Tokens Consumed"
                - "Retrieval Precision"
                - "Retrieval Recall"
                - "Retrieved Companies"
                - "matches"
                - "clarity"
                - "Response Time (s)"

    """
    results = []

    for name, template in prompt_templates.items():
        prompt_template = PromptTemplate.from_template(template)
        # not added string output parser, to access the response object metadata
        chain = ({'retrieved_documents':retriever_mmr,
         'query':RunnablePassthrough()} | prompt_template | chat)

        start_time = time.time()
        response = chain.invoke(query)
        elapsed_time = time.time() - start_time

        token_usage = response.additional_kwargs['token_count']

        retrieved_docs = retriever_mmr.invoke(query)
        retrieved_companies = []
        for i in retrieved_docs:
            retrieved_companies.append(i.metadata['title'])
            
        precision,recall,matches = calculate_precision_recall(query,expected_output)

        results.append({
            "Prompt Name": name,
            "Output": response.content,
            "Tokens Count - Input": token_usage['input_tokens'],
            "Tokens Count - Output": token_usage['output_tokens'],
            "Tokens Consumed": sum(token_usage.values()),
            "Retrieval Precision": precision,
            "Retrieval Recall": recall,
            "Retrieved Companies": retrieved_companies,
            "matches": matches,
            "clarity": compute_clarity(response,matches),
            "Response Time (s)": round(elapsed_time, 2),
        })

    return pd.DataFrame(results)

#### Test Query 1 - Which organizations show signs of potential money laundering?
 Answer - 'Aurora Financial Services,Gemini Asset Management'

In [35]:
q1_results = generate_result_tables(prompt_templates, test_set_["queries"][0], test_set_["expected_outputs"][0])

In [36]:
q1_results

Unnamed: 0,Prompt Name,Output,Tokens Count - Input,Tokens Count - Output,Tokens Consumed,Retrieval Precision,Retrieval Recall,Retrieved Companies,matches,clarity,Response Time (s)
0,base,"Based on the provided context, **Gemini Asset ...",786.0,136.0,922.0,0.25,0.5,"[Gemini Asset Management, Gemini Asset Managem...",{Gemini Asset Management},1.86,4.31
1,risk_focused,"Based on the provided context, **Gemini Asset ...",804.0,186.0,990.0,0.25,0.5,"[Gemini Asset Management, Gemini Asset Managem...",{Gemini Asset Management},1.6,5.02
2,regulatory tone with evaluation,### Compliance Risk Level Assessment:\n\n1. **...,839.0,209.0,1048.0,0.25,0.5,"[Gemini Asset Management, Gemini Asset Managem...",{Gemini Asset Management},1.3,5.8
3,chain_of_thought,To determine which organizations show signs of...,805.0,362.0,1167.0,0.25,0.5,"[Gemini Asset Management, Gemini Asset Managem...",{Gemini Asset Management},0.54,8.96
4,focused_extraction,"Based on the context provided, the organizatio...",801.0,73.0,874.0,0.25,0.5,"[Gemini Asset Management, Gemini Asset Managem...",{Gemini Asset Management},4.09,2.49
5,summarize_analyze,### Summary of Key Characteristics:\n\n1. **Ge...,789.0,242.0,1031.0,0.25,0.5,"[Gemini Asset Management, Gemini Asset Managem...",{Gemini Asset Management},1.13,6.63


#### Test Query 2 - Identify firms flagged for insider trading or suspicious market manipulation.
 Answer - 'Blue Horizon Investments'

In [37]:
q2_results = generate_result_tables(prompt_templates, test_set_["queries"][1], test_set_["expected_outputs"][1])

In [38]:
q2_results

Unnamed: 0,Prompt Name,Output,Tokens Count - Input,Tokens Count - Output,Tokens Consumed,Retrieval Precision,Retrieval Recall,Retrieved Companies,matches,clarity,Response Time (s)
0,base,"Based on the provided context, the firm flagge...",784.0,54.0,838.0,0.25,1.0,"[Eclipse Global Holdings, Eclipse Global Holdi...",{Blue Horizon Investments},5.25,1.98
1,risk_focused,"Based on the provided context, the firm flagge...",802.0,55.0,857.0,0.25,1.0,"[Eclipse Global Holdings, Eclipse Global Holdi...",{Blue Horizon Investments},6.19,2.05
2,regulatory tone with evaluation,### Compliance Risk Level Assessments:\n\n1. *...,837.0,185.0,1022.0,0.25,1.0,"[Eclipse Global Holdings, Eclipse Global Holdi...",{Blue Horizon Investments},1.5,4.97
3,chain_of_thought,To determine if any organizations show pattern...,803.0,449.0,1252.0,0.25,1.0,"[Eclipse Global Holdings, Eclipse Global Holdi...",{Blue Horizon Investments},0.44,10.85
4,focused_extraction,"Based on the query criteria, the organization ...",799.0,28.0,827.0,0.25,1.0,"[Eclipse Global Holdings, Eclipse Global Holdi...",{Blue Horizon Investments},14.21,1.38
5,summarize_analyze,### Summary of Key Characteristics:\n\n1. **Ec...,785.0,227.0,1012.0,0.25,1.0,"[Eclipse Global Holdings, Eclipse Global Holdi...",{Blue Horizon Investments},1.17,5.96


#### Test Query 3 - Which firms operate through complex shell company structures that may obscure transparency?
 Answer - 'Cascade Capital Management'

In [39]:
q3_results = generate_result_tables(prompt_templates, test_set_["queries"][2], test_set_["expected_outputs"][2])

In [40]:
q3_results

Unnamed: 0,Prompt Name,Output,Tokens Count - Input,Tokens Count - Output,Tokens Consumed,Retrieval Precision,Retrieval Recall,Retrieved Companies,matches,clarity,Response Time (s)
0,base,"Based on the provided context, **Cascade Capit...",787.0,62.0,849.0,0.25,1.0,"[Cascade Capital Management, Cascade Capital M...",{Cascade Capital Management},5.25,2.07
1,risk_focused,"Based on the provided context, **Cascade Capit...",805.0,214.0,1019.0,0.25,1.0,"[Cascade Capital Management, Cascade Capital M...",{Cascade Capital Management},1.3,5.8
2,regulatory tone with evaluation,**Compliance Risk Assessment:**\n\n1. **Cascad...,840.0,182.0,1022.0,0.25,1.0,"[Cascade Capital Management, Cascade Capital M...",{Cascade Capital Management},1.83,5.2
3,chain_of_thought,**Step 1: Identify the organizations mentioned...,806.0,341.0,1147.0,0.25,1.0,"[Cascade Capital Management, Cascade Capital M...",{Cascade Capital Management},0.52,9.79
4,focused_extraction,"Based on the provided context, the organizatio...",802.0,31.0,833.0,0.25,1.0,"[Cascade Capital Management, Cascade Capital M...",{Cascade Capital Management},11.86,1.42
5,summarize_analyze,### Summary of Key Characteristics:\n\n1. **Ca...,786.0,282.0,1068.0,0.25,1.0,"[Cascade Capital Management, Cascade Capital M...",{Cascade Capital Management},0.91,7.86


#### Test Query 4 - Which institutions demonstrate strong internal audits and transparent operations?
 Answer - 'Falcon Secure Bank,Jupiter Commodities Exchange,Kepler Financial Innovations,Lunar Investment Group'

In [41]:
q4_results = generate_result_tables(prompt_templates, test_set_["queries"][3], test_set_["expected_outputs"][3])

In [42]:
q4_results

Unnamed: 0,Prompt Name,Output,Tokens Count - Input,Tokens Count - Output,Tokens Consumed,Retrieval Precision,Retrieval Recall,Retrieved Companies,matches,clarity,Response Time (s)
0,base,"Based on the provided context, the institution...",795.0,184.0,979.0,0.5,0.5,"[Falcon Secure Bank, Falcon Secure Bank, Delta...","{Falcon Secure Bank, Kepler Financial Innovati...",2.29,4.62
1,risk_focused,"Based on the provided context, the following i...",813.0,232.0,1045.0,0.5,0.5,"[Falcon Secure Bank, Falcon Secure Bank, Delta...","{Falcon Secure Bank, Kepler Financial Innovati...",1.87,5.57
2,regulatory tone with evaluation,### Compliance Risk Level Assessment:\n\n1. **...,848.0,221.0,1069.0,0.5,0.5,"[Falcon Secure Bank, Falcon Secure Bank, Delta...","{Falcon Secure Bank, Kepler Financial Innovati...",2.08,5.57
3,chain_of_thought,To determine which institutions demonstrate st...,814.0,462.0,1276.0,0.5,0.5,"[Falcon Secure Bank, Falcon Secure Bank, Delta...","{Falcon Secure Bank, Kepler Financial Innovati...",0.84,12.18
4,focused_extraction,"Based on the provided context, the organizatio...",810.0,52.0,862.0,0.5,0.5,"[Falcon Secure Bank, Falcon Secure Bank, Delta...","{Falcon Secure Bank, Kepler Financial Innovati...",12.96,1.87
5,summarize_analyze,### Summary of Key Characteristics:\n\n1. **Fa...,797.0,273.0,1070.0,0.5,0.5,"[Falcon Secure Bank, Falcon Secure Bank, Delta...","{Falcon Secure Bank, Kepler Financial Innovati...",2.39,7.06


#### Test Query 5 - Find organizations that rely on opaque financial structures or mechanisms.
 Answer - 'Aurora Financial Services,Cascade Capital Management'

In [43]:
q5_results = generate_result_tables(prompt_templates, test_set_["queries"][4], test_set_["expected_outputs"][4])

In [44]:
q5_results

Unnamed: 0,Prompt Name,Output,Tokens Count - Input,Tokens Count - Output,Tokens Consumed,Retrieval Precision,Retrieval Recall,Retrieved Companies,matches,clarity,Response Time (s)
0,base,"Based on the provided context, the organizatio...",790.0,144.0,934.0,0.25,0.5,"[Eclipse Global Holdings, Eclipse Global Holdi...",{Cascade Capital Management},1.75,4.08
1,risk_focused,"Based on the provided context, the following o...",808.0,320.0,1128.0,0.25,0.5,"[Eclipse Global Holdings, Eclipse Global Holdi...",{Cascade Capital Management},0.68,7.96
2,regulatory tone with evaluation,**Assessment of Organizations Based on Complia...,843.0,231.0,1074.0,0.25,0.5,"[Eclipse Global Holdings, Eclipse Global Holdi...",{Cascade Capital Management},1.1,5.57
3,chain_of_thought,To determine if any organizations show pattern...,809.0,425.0,1234.0,0.25,0.5,"[Eclipse Global Holdings, Eclipse Global Holdi...",{Cascade Capital Management},0.43,9.24
4,focused_extraction,"Based on the provided context, the organizatio...",805.0,74.0,879.0,0.25,0.5,"[Eclipse Global Holdings, Eclipse Global Holdi...",{Cascade Capital Management},5.33,2.46
5,summarize_analyze,### Summary of Key Characteristics:\n\n1. **Ec...,791.0,220.0,1011.0,0.25,0.5,"[Eclipse Global Holdings, Eclipse Global Holdi...",{Cascade Capital Management},1.14,5.95


#### Test Query 6 - Identify low-risk institutions with a clean track record in compliance.
 Answer - 'Delta Trade Corporation,Falcon Secure Bank,Lunar Investment Group,Kepler Financial Innovations,Ionex Brokerage Services'

In [45]:
q6_results = generate_result_tables(prompt_templates, test_set_["queries"][5], test_set_["expected_outputs"][5])

In [46]:
q6_results

Unnamed: 0,Prompt Name,Output,Tokens Count - Input,Tokens Count - Output,Tokens Consumed,Retrieval Precision,Retrieval Recall,Retrieved Companies,matches,clarity,Response Time (s)
0,base,"Based on the provided context, the following i...",794.0,181.0,975.0,0.75,0.6,"[Delta Trade Corporation, Delta Trade Corporat...","{Delta Trade Corporation, Falcon Secure Bank, ...",5.08,4.47
1,risk_focused,"Based on the provided context, the following i...",812.0,248.0,1060.0,0.75,0.6,"[Delta Trade Corporation, Delta Trade Corporat...","{Delta Trade Corporation, Falcon Secure Bank, ...",3.13,6.56
2,regulatory tone with evaluation,**Delta Trade Corporation** \n**Compliance Ri...,847.0,229.0,1076.0,0.75,0.6,"[Delta Trade Corporation, Delta Trade Corporat...","{Delta Trade Corporation, Falcon Secure Bank, ...",3.49,5.47
3,chain_of_thought,To determine if any organizations show pattern...,813.0,544.0,1357.0,0.75,0.6,"[Delta Trade Corporation, Delta Trade Corporat...","{Delta Trade Corporation, Falcon Secure Bank, ...",0.95,13.77
4,focused_extraction,Based on the query to identify low-risk instit...,809.0,89.0,898.0,0.75,0.6,"[Delta Trade Corporation, Delta Trade Corporat...","{Delta Trade Corporation, Falcon Secure Bank, ...",10.55,2.89
5,summarize_analyze,### Summary of Key Characteristics:\n\n1. **De...,793.0,270.0,1063.0,0.75,0.6,"[Delta Trade Corporation, Delta Trade Corporat...","{Delta Trade Corporation, Falcon Secure Bank, ...",3.02,7.06


### Final Observations: These four are reliable with their own pros and cons, Chose as per the usecase :

#### Chain-of-Thought 

- It consistently produces the most detailed responses (highest output tokens) but also has the longest response times, going up to 15.88 seconds .
- Excellent for nuanced analysis, but trade-off is speed and token consumption.

#### Focused Extraction 

- Despite being the most concise (lowest output tokens), it shines in precision when targeting specific entities.
- Clarity ratings are  high, implying the response is crisp and easy to interpret despite being short.

#### Regulatory Tone with Evaluation -  Balances Formality & Utility

- Strong middle ground across precision, clarity, and response time.
- Presents data in a compliance-report format, ideal for risk evaluation dashboards.

#### Summarize_Analyze 

- Moderate in everything— token usage, response time, clarity, and entity retrieval.
- Great default mode for dashboards that need balanced insights without extremes.

### Some output examples -

In [48]:
llm_judge(test_set_["queries"][0],test_set_["expected_outputs"][0],prompt_templates['risk_focused'])

QUERY -> Which organizations show signs of potential money laundering?
Retriever retrieved :
Gemini Asset Management
Gemini Asset Management
Eclipse Global Holdings
Cascade Capital Management

    ------------------------------------      
    expected companies = Aurora Financial Services,Gemini Asset Management
    ------------------------------------      
    response : 
    Based on the provided context, **Gemini Asset Management** shows signs of potential money laundering. The key indicators include:

1. **Unusually High Commissions**: Excessive or unexplained commissions can be a red flag for money laundering, as they may be used to disguise illicit funds as legitimate income.  
2. **Inconsistent Portfolio Reporting**: Inconsistencies in reporting can indicate attempts to obfuscate the true source or movement of funds, a common tactic in money laundering schemes.  

These anomalies have sparked concerns over potential money laundering and fraudulent practices, making Gemini Asse

In [49]:
llm_judge(test_set_["queries"][1],test_set_["expected_outputs"][1],prompt_templates['chain_of_thought'])

QUERY -> Identify firms flagged for insider trading or suspicious market manipulation.
Retriever retrieved :
Eclipse Global Holdings
Eclipse Global Holdings
Gemini Asset Management
Blue Horizon Investments

    ------------------------------------      
    expected companies = Blue Horizon Investments
    ------------------------------------      
    response : 
    To determine if any organizations show patterns related to financial misconduct, specifically insider trading or suspicious market manipulation, let's analyze the provided context step by step:

---

### **Step 1: Review Each Document for Relevant Information**
1. **Eclipse Global Holdings (Document 3)**  
   - Linked to irregular contract awards and suspected kickback schemes.  
   - Concerns about internal controls.  
   - **No mention of insider trading or market manipulation.**  

2. **Gemini Asset Management (Document 4)**  
   - Spotlighted for unusually high commissions and inconsistent portfolio reporting.  
   - 

In [50]:
llm_judge(test_set_["queries"][3],test_set_["expected_outputs"][3],prompt_templates['focused_extraction'])

QUERY -> Which institutions demonstrate strong internal audits and transparent operations?
Retriever retrieved :
Falcon Secure Bank
Falcon Secure Bank
Delta Trade Corporation
Kepler Financial Innovations

    ------------------------------------      
    expected companies = Falcon Secure Bank,Jupiter Commodities Exchange,Kepler Financial Innovations,Lunar Investment Group
    ------------------------------------      
    response : 
    Based on the provided context, the organization that demonstrates strong internal audits and transparent operations is:

- **Falcon Secure Bank**
    ------------------------------------
    total_tokens_consumed = 838.0
    ------------------------------------
    retrieval precision = 0.5
    retrieval recall = 0.5
    ----------------------------------- 
    matched companies = {'Falcon Secure Bank', 'Kepler Financial Innovations'}
    -----------------------------------
    clarity_score = 28.42
    ---------------------------------
    Response 

In [None]:
# Summarising to compare different prompt variations average performance
df = [q1_results,q2_results,q3_results,q4_results,q5_results,q6_results]
full_df = pd.concat(df, ignore_index=True)
summary = full_df.groupby('Prompt Name').agg({
        'Tokens Count - Input': 'mean',
        'Tokens Count - Output': 'mean',
        'Tokens Consumed': 'mean',
        'Retrieval Precision': 'mean',
        'Retrieval Recall': 'mean',
        'clarity': 'mean',
        'Response Time (s)': 'mean'
    }).reset_index()
summary = summary.round(2)
summary.columns = [
        'Prompt Name',
        'Average Tokens Count - Input',
        'Average Tokens Count - Output',
        'Average Tokens Consumed',
        'Average Precision',
        'Average Recall',
        'Average Clarity Score',
        'Average Response Time (s)'
    ]

In [54]:
summary

Unnamed: 0,Prompt Name,Average Tokens Count - Input,Average Tokens Count - Output,Average Tokens Consumed,Average Precision,Average Recall,Average Clarity Score,Average Response Time (s)
0,base,789.33,126.83,916.17,0.38,0.68,3.58,3.59
1,chain_of_thought,808.33,430.5,1238.83,0.38,0.68,0.62,10.8
2,focused_extraction,804.33,57.83,862.17,0.38,0.68,9.83,2.08
3,regulatory tone with evaluation,842.33,209.5,1051.83,0.38,0.68,1.88,5.43
4,risk_focused,807.33,209.17,1016.5,0.38,0.68,2.46,5.49
5,summarize_analyze,790.17,252.33,1042.5,0.38,0.68,1.63,6.75
