### Importing all the libraries

In [16]:
import pandas as pd
import os
import gzip
import json
from langchain_text_splitters import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    Language,
)
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from tqdm import tqdm
from typing import List, Dict, Optional
from sentence_transformers import CrossEncoder
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import logging
logging.disable(logging.CRITICAL)

### Reading the corpus data

In [17]:
# Add parent directory to Python path to find the financerag package
import sys
from pathlib import Path

# Add the parent directory (FinanceRAG root) to the path
project_root = Path().resolve().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

In [18]:
data_dir = "../data"
records = []

file_path = os.path.join(data_dir, "financebench_corpus.jsonl", "corpus.jsonl")
with open(file_path, "rt", encoding="utf-8") as f:
    for line in f:
        data = json.loads(line)
        data["source_file"] = "financebench_corpus.jsonl"
        records.append(data)

# Convert all records to a pandas DataFrame
df = pd.DataFrame(records)

In [19]:
# unique records
print(len(df['_id'].unique()))

# null records
print(len(df.isna().sum()))

df.dropna(inplace=True)
df.drop_duplicates('_id',inplace=True)
print(len(df['_id'].unique()))

180
4
180


In [20]:
df.head()

Unnamed: 0,_id,title,text,source_file
0,dd2af2336,PEPSICO_2022_10K,"6) Africa, Middle East and South Asia (AMESA),...",financebench_corpus.jsonl
1,dd2acf5c0,BOEING_2022_10K,We derive a significant portion of our revenue...,financebench_corpus.jsonl
2,dd2ad12e4,COCACOLA_2022_10K,THE COCA-COLA COMPANY AND SUBSIDIARIES\nCONSOL...,financebench_corpus.jsonl
3,dd2af3272,PEPSICO_2022_10K,Note 3 Restructuring and Impairment Charges\n2...,financebench_corpus.jsonl
4,dd2ade412,LOCKHEEDMARTIN_2020_10K,Table of Contents\nLockheed Martin Corporation...,financebench_corpus.jsonl


### Breaking Texts to Chunks

In [21]:
CHAR_SIZES = [64, 128, 256, 368, 512]
RECURSIVE_SIZES = CHAR_SIZES
RECURSIVE_OVERLAP = 20

In [22]:
def rowdict_iter(df: pd.DataFrame):
    cols = list(df.columns)
    for vals in df.itertuples(index=False, name=None):
        yield dict(zip(cols, vals))

def _coerce_text(x: Optional[str]) -> str:
    if x is None:
        return ""
    if isinstance(x, float) and pd.isna(x):
        return ""
    return str(x)

def _emit_rows(base_row: Dict, splitter_name: str, chunk_size: int, chunk_overlap: int, chunks: List[str]) -> List[Dict]:
    return [
        {
            **base_row,
            "splitter": splitter_name,
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "chunk_index": i,
            "chunk_text": ch,
        }
        for i, ch in enumerate(chunks)
    ]

def _chunk_all_rows_with_splitter(df: pd.DataFrame, splitter, splitter_name: str, size: int, overlap: int) -> List[Dict]:
    rows: List[Dict] = []
    for rd in rowdict_iter(df):
        base = {
            "_id": rd.get("_id"),
            "title": rd.get("title"),
            "source_file": rd.get("source_file"),
        }
        text = _coerce_text(rd.get("text", ""))
        if not text:
            continue
        chunks = splitter.split_text(text)
        rows.extend(_emit_rows(base, f"{splitter_name}_{size}", size, overlap, chunks))
    return rows

def make_all_chunks_with_docs(df: pd.DataFrame) -> pd.DataFrame:
    if df.index.name and df.index.name not in df.columns:
        df = df.reset_index()

    all_rows: List[Dict] = []
    steps = ["character", "recursive"]

    with tqdm(total=len(steps), desc="Chunking pipeline", ncols=100) as pbar:
        for size in CHAR_SIZES:
            s = CharacterTextSplitter(chunk_size=size, chunk_overlap=0)
            all_rows.extend(_chunk_all_rows_with_splitter(df, s, "character", size, 0))
        pbar.update(1)

        for size in RECURSIVE_SIZES:
            s = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=RECURSIVE_OVERLAP)
            all_rows.extend(_chunk_all_rows_with_splitter(df, s, "recursive", size, RECURSIVE_OVERLAP))
        pbar.update(1)

    chunks_df = pd.DataFrame(all_rows)
    
    cols = ["_id", "title", "source_file", "splitter", "chunk_size", "chunk_overlap", "chunk_index", "chunk_text"]
    chunks_df = chunks_df[[c for c in cols if c in chunks_df.columns] + [c for c in chunks_df.columns if c not in cols]]
    return chunks_df

In [23]:
df.head()

Unnamed: 0,_id,title,text,source_file
0,dd2af2336,PEPSICO_2022_10K,"6) Africa, Middle East and South Asia (AMESA),...",financebench_corpus.jsonl
1,dd2acf5c0,BOEING_2022_10K,We derive a significant portion of our revenue...,financebench_corpus.jsonl
2,dd2ad12e4,COCACOLA_2022_10K,THE COCA-COLA COMPANY AND SUBSIDIARIES\nCONSOL...,financebench_corpus.jsonl
3,dd2af3272,PEPSICO_2022_10K,Note 3 Restructuring and Impairment Charges\n2...,financebench_corpus.jsonl
4,dd2ade412,LOCKHEEDMARTIN_2020_10K,Table of Contents\nLockheed Martin Corporation...,financebench_corpus.jsonl


In [24]:
chunks_df = make_all_chunks_with_docs(df)

Chunking pipeline: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2/2 [00:00<00:00,  5.94it/s]


In [25]:
chunks_df.groupby("splitter").size().sort_values(ascending=True)

splitter
character_512     284
character_368     310
character_256     346
character_128     440
character_64      572
recursive_512     628
recursive_368     846
recursive_256    1246
recursive_128    2671
recursive_64     5766
dtype: int64

In [26]:
chunks_df.head()

Unnamed: 0,_id,title,source_file,splitter,chunk_size,chunk_overlap,chunk_index,chunk_text
0,dd2af2336,PEPSICO_2022_10K,financebench_corpus.jsonl,character_64,64,0,0,"6) Africa, Middle East and South Asia (AMESA),..."
1,dd2acf5c0,BOEING_2022_10K,financebench_corpus.jsonl,character_64,64,0,0,We derive a significant portion of our revenue...
2,dd2ad12e4,COCACOLA_2022_10K,financebench_corpus.jsonl,character_64,64,0,0,THE COCA-COLA COMPANY AND SUBSIDIARIES\nCONSOL...
3,dd2af3272,PEPSICO_2022_10K,financebench_corpus.jsonl,character_64,64,0,0,Note 3 Restructuring and Impairment Charges\n2...
4,dd2ade412,LOCKHEEDMARTIN_2020_10K,financebench_corpus.jsonl,character_64,64,0,0,Table of Contents\nLockheed Martin Corporation...


### Creating Vector DB

In [27]:
PARENT_DIR = "./vectordbs"

EMBED_MODEL_NAME = "intfloat/e5-small-v2"
embeddings = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL_NAME,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"batch_size": 64}
)

In [None]:
def build_chroma_for_splitter(
    chunks_df: pd.DataFrame,
    splitter_name: str,
    parent_dir: str = PARENT_DIR,
) -> Chroma:
    sub = chunks_df[chunks_df["splitter"] == splitter_name].copy()
    sub = sub[sub["chunk_text"].notna() & (sub["chunk_text"].str.len() > 0)]
    texts = sub["chunk_text"].astype(str).tolist()
    metadatas: List[Dict] = sub[["_id", "source_file", "splitter", "chunk_index"]].to_dict("records")

    persist_dir = os.path.join(parent_dir, f"chroma_{splitter_name}")
    os.makedirs(persist_dir, exist_ok=True)

    db = Chroma.from_texts(
        texts=texts,
        embedding=embeddings,
        metadatas=metadatas,
        persist_directory=persist_dir,
        collection_name=f"col_{splitter_name}"
    )
    db.persist()
    return db

def load_chroma(splitter_name: str, parent_dir: str = PARENT_DIR) -> Chroma:
    persist_dir = os.path.join(parent_dir, f"chroma_{splitter_name}")
    return Chroma(
        persist_directory=persist_dir,
        collection_name=f"col_{splitter_name}",
        embedding_function=embeddings
    )

splitters_to_build = sorted(chunks_df["splitter"].unique().tolist())

built = {}
for sp in splitters_to_build:
    print(f"ðŸ”§ Building Chroma for splitter: {sp}")
    built[sp] = build_chroma_for_splitter(chunks_df, sp, parent_dir=PARENT_DIR)

In [None]:
search_text = "How does Boeing's effective tax rate in FY2022 compare to FY2021?"
splitters = ["character_512", "recursive_512"]

def search_chroma(splitter):
    db = Chroma(
        persist_directory=f"{PARENT_DIR}/chroma_{splitter}",
        collection_name=f"col_{splitter}",
        embedding_function=embeddings,
    )
    retriever = db.as_retriever(search_kwargs={"k": 5})
    return retriever.get_relevant_documents(search_text)

for sp in splitters:
    print(f"\n=== {sp} ===")
    for i, d in enumerate(search_chroma(sp), 1):
        meta = d.metadata or {}
        print(f"[{i}] {meta.get('_id')} | {meta.get('source_file')} | {meta.get('chunk_index')}")
        print(d.page_content[:250].replace("\n", " ") + "...\n")

### Filtering Top 5 results with reranker

In [None]:
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L6-v2", device="cpu")

def rerank_top_k(query, docs, top_n=5):
    pairs = [(query, d.page_content) for d in docs]
    scores = reranker.predict(pairs)
    ranked = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True)[:top_n]
    out = []
    for d, s in ranked:
        meta = d.metadata or {}
        out.append({
            "score": float(s),
            "_id": meta.get("_id"),
            "source_file": meta.get("source_file"),
            "chunk_index": meta.get("chunk_index"),
            "text": d.page_content
        })
    return out

for sp in splitters: 
    print(f"\n===== {sp} | Reranked Top 5 =====")
    retrieved = search_chroma(sp)
    top5 = rerank_top_k(search_text, retrieved, top_n=5)
    for i, r in enumerate(top5, 1):
        print(f"[{i}] score={r['score']:.3f}  id={r['_id']}  chunk={r['chunk_index']}  file={r['source_file']}")
        print(r["text"][:300].replace("\n", " ") + ("..." if len(r["text"]) > 300 else ""))
        print()

### Evalaution vs Comparative Analysis

In [None]:
data_dir = "data"
records = []

file_path = os.path.join(data_dir, "financebench_queries.jsonl.gz")
with gzip.open(file_path, "rt", encoding="utf-8") as f:
    for line in f:
        data = json.loads(line)
        records.append(data)

# Convert all records to a pandas DataFrame
df_test = pd.DataFrame(records)
df_test = df_test[['_id','text']]
df_test.columns = ['query_id','text']

In [None]:
df_eval = pd.read_csv('FinanceBench_qrels.tsv', sep='\t')

In [None]:
len(df_eval['query_id'].unique())

In [None]:
df_eval = df_eval.merge(df_test,on='query_id',how='left')

In [None]:
df_eval.head()

In [None]:
SPLITTERS = splitters_to_build
TOP_K_RETRIEVE = 10
TOP_K_RERANK = 5

def load_chroma(splitter):
    return Chroma(
        persist_directory=f"{PARENT_DIR}/chroma_{splitter}",
        collection_name=f"col_{splitter}",
        embedding_function=embeddings,
    )

def retrieve_docs(db, query_text, k=TOP_K_RETRIEVE):
    retriever = db.as_retriever(search_kwargs={"k": k})
    return retriever.get_relevant_documents(query_text)

def rerank_docs(query_text, docs, top_n=TOP_K_RERANK):
    pairs = [(query_text, d.page_content) for d in docs]
    scores = reranker.predict(pairs)
    ranked = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True)
    return ranked[:top_n]

results = []

for splitter in SPLITTERS:
    print(f"\nðŸ”Ž Evaluating splitter: {splitter}")
    db = load_chroma(splitter)
    labels = []

    for _, row in tqdm(df_eval.iterrows(), total=len(df_eval)):
        corpus_id = row["corpus_id"]
        query_text = row["text"]

        retrieved_docs = retrieve_docs(db, query_text, k=TOP_K_RETRIEVE)

        reranked = rerank_docs(query_text, retrieved_docs, top_n=TOP_K_RERANK)

        top_ids = [d.metadata.get("_id") for d, _ in reranked if d.metadata and "_id" in d.metadata]

        label = 1 if corpus_id in top_ids else 0
        labels.append(label)

    df_eval[f"label_{splitter}_rerank"] = labels
    results.append((splitter, sum(labels), len(labels), sum(labels) / len(labels)))

In [None]:
df_summary = pd.DataFrame(results, columns=["splitter", "correct", "total", "accuracy"])

In [None]:
df_summary

In [None]:
chunks_df = chunks_df.groupby("splitter").size().sort_values(ascending=True).reset_index()

In [None]:
chunks_df.columns = ['splitter','chunkSize']

In [None]:
df_summary = chunks_df.merge(df_summary[['splitter','accuracy']],on='splitter',how='left')

In [None]:
df_summary['accuracy'] = round(df_summary['accuracy'] * 100,1)

In [None]:
df_summary

In [None]:
df_summary['splitter_type'] = df_summary['splitter'].apply(lambda x: 'character' if 'character' in x else 'recursive')
df_summary['char_size'] = df_summary['splitter'].str.extract(r'(\d+)').astype(int)

sns.relplot(data=df_summary, x='chunkSize', y='accuracy', hue='char_size',
            col='splitter_type', kind='scatter', palette='viridis', height=4, aspect=1.2)

plt.gca().invert_xaxis()
plt.show()

#### Character Splitters

Accuracy remains stable between 72â€“76% across all chunk sizes (64â€“512).

Shows minimal sensitivity to chunk size â€” even smaller chunks retain strong performance.

Indicates consistent context preservation and balanced granularity.

Ideal for CPU-based or small-scale RAG systems where simplicity and stability are preferred.

#### Recursive Splitters

Accuracy drops sharply for smaller chunk sizes (64 â†’ ~32%).

Performs best at larger chunk sizes (368â€“512) with accuracy near 76%.

Shows high sensitivity to chunk size â€” small chunks cause excessive fragmentation.

Likely struggles with structured content (e.g., tables, lists) when chunks are too fine-grained.

#### Overall Insights

CharacterTextSplitter is more robust and reliable across configurations.

RecursiveCharacterTextSplitter can perform well but requires careful tuning of chunk size.

For general RAG use cases, character splitters provide better trade-off between accuracy, speed, and stability.