In [1]:
# Pipeline Configuration
CONFIG = {
    # Paths
    'data_dir': '../data',
    'output_file': 'submission.csv',
    
    # Datasets to process (7 financial datasets)
    'datasets': [
        'convfinqa',
        'financebench', 
        'finder',
        'finqa',
        'finqabench',
        'multiheirtt',
        'tatqa'
    ],
    
    # Model names
    'embedding_model': 'BAAI/bge-m3',  # Supports up to 8192 tokens, good for financial documents
    'reranker_model': 'cross-encoder/ms-marco-MiniLM-L-6-v2',  # Fast and effective
    
    # Retrieval parameters
    'top_k_retrieval': 50,  # Retrieve top-50 candidates
    'top_k_final': 10,       # Rerank to top-10 for submission
    
    # Batch sizes (reduced to prevent OOM)
    'embed_batch_size': 8,   # Reduced from 32 - financial docs are very long
    'rerank_batch_size': 8,  # Reduced from 16
    
    # Text length limit (to prevent OOM)
    'max_length': 4096,      # Truncate very long documents (BGE-M3 supports 8192 but uses too much memory)
}

print("Configuration loaded:")
for key, value in CONFIG.items():
    if key != 'datasets':
        print(f"  {key}: {value}")
print(f"  datasets: {len(CONFIG['datasets'])} datasets")

Configuration loaded:
  data_dir: ../data
  output_file: submission.csv
  embedding_model: BAAI/bge-m3
  reranker_model: cross-encoder/ms-marco-MiniLM-L-6-v2
  top_k_retrieval: 50
  top_k_final: 10
  embed_batch_size: 8
  rerank_batch_size: 8
  max_length: 4096
  datasets: 7 datasets


# FinanceRAG - Full Retrieval Pipeline

**Objective:** Retrieve top 10 most relevant documents for each query across 7 financial datasets.

**Strategy:** 
1. Process each dataset separately (Divide & Conquer)
2. Use Bi-Encoder (BGE-M3) for fast retrieval (Top-50)
3. Use Cross-Encoder (BGE-Reranker) for precise reranking (Top-10)
4. Combine all results into submission file

**No Generation needed** - Pure Retrieval Task!

In [2]:
# Core Libraries
import os
import pandas as pd
import json
import numpy as np
from tqdm.auto import tqdm
from typing import List, Dict, Tuple

# Embedding & Retrieval
from sentence_transformers import SentenceTransformer
import faiss

# Reranking
from sentence_transformers import CrossEncoder

# Utils
import warnings
warnings.filterwarnings('ignore')

import logging
logging.disable(logging.CRITICAL)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Check GPU availability and set device
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
    print("GPU:", torch.cuda.get_device_name(0))
    device = 'cuda'
else:
    print("Running on CPU")
    device = 'cpu'

print(f"\nUsing device: {device}")

CUDA available: False
Running on CPU

Using device: cpu


## Configuration

## Helper Functions

In [4]:
def load_jsonl_data(dataset_name: str, data_dir: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Load corpus and queries for a given dataset.
    
    Note: In this project, corpus files are in subfolders:
    e.g., data/financebench_corpus.jsonl/corpus.jsonl
    """
    # Construct paths (corpus is in subfolder)
    corpus_path = os.path.join(data_dir, f"{dataset_name}_corpus.jsonl", "corpus.jsonl")
    queries_path = os.path.join(data_dir, f"{dataset_name}_queries.jsonl", "queries.jsonl")
    
    # Check if files exist
    if not os.path.exists(corpus_path):
        raise FileNotFoundError(f"Corpus not found: {corpus_path}")
    if not os.path.exists(queries_path):
        raise FileNotFoundError(f"Queries not found: {queries_path}")
    
    # Load data
    corpus_df = pd.read_json(corpus_path, lines=True)
    queries_df = pd.read_json(queries_path, lines=True)
    
    print(f"  Loaded {len(corpus_df)} corpus documents, {len(queries_df)} queries")
    
    return corpus_df, queries_df


def prepare_texts(df: pd.DataFrame, combine_title: bool = True) -> List[str]:
    """
    Prepare text from dataframe for embedding.
    
    For financial documents, combining title + text is crucial:
    - Title often contains company name, year, report type
    - Helps disambiguate between similar documents
    """
    if combine_title and 'title' in df.columns:
        # Combine title and text with proper formatting
        texts = []
        for _, row in df.iterrows():
            title = str(row.get('title', '')).strip()
            text = str(row.get('text', '')).strip()
            if title and text:
                combined = f"{title}. {text}" 
            elif title:
                combined = title
            else:
                combined = text
            texts.append(combined)
        return texts
    else:
        return df['text'].astype(str).tolist()


def build_faiss_index(embeddings: np.ndarray, use_gpu: bool = False) -> faiss.Index:
    """
    Build FAISS index for fast similarity search.
    
    Using IndexFlatIP (Inner Product) because BGE models output normalized vectors.
    """
    dimension = embeddings.shape[0]
    
    # Create index
    index = faiss.IndexFlatIP(dimension)
    
    # Move to GPU if available and requested
    if use_gpu and faiss.get_num_gpus() > 0:
        print("  Using GPU for FAISS")
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)
    
    return index

## Load Models (Once for All Datasets)

In [5]:
print("Loading models... This may take a few minutes on first run.")

# 1. Bi-Encoder for Retrieval (Fast, captures semantic similarity)
print(f"\n1. Loading embedding model: {CONFIG['embedding_model']}")
embed_model = SentenceTransformer(
    CONFIG['embedding_model'], 
    device=device,
    trust_remote_code=True  # Allow loading custom model architectures
)
print(f"   Model loaded on {device}")

# 2. Cross-Encoder for Reranking (Slower but more accurate)
print(f"\n2. Loading reranker model: {CONFIG['reranker_model']}")
reranker = CrossEncoder(
    CONFIG['reranker_model'], 
    device=device, 
    max_length=512
)
print(f"   Reranker loaded on {device}")

print("\n‚úÖ All models loaded successfully!")

Loading models... This may take a few minutes on first run.

1. Loading embedding model: BAAI/bge-m3
   Model loaded on cpu

2. Loading reranker model: cross-encoder/ms-marco-MiniLM-L-6-v2
   Reranker loaded on cpu

‚úÖ All models loaded successfully!


## Main Processing Function

In [6]:
def process_dataset(dataset_name: str, config: Dict) -> pd.DataFrame:
    """
    Complete pipeline for one dataset:
    1. Load data
    2. Embed corpus
    3. Build FAISS index
    4. Retrieve top-K candidates
    5. Rerank to top-10
    6. Return results
    """
    print(f"\n{'='*60}")
    print(f"Processing: {dataset_name.upper()}")
    print(f"{'='*60}")
    
    # --- STEP 1: Load Data ---
    corpus_df, queries_df = load_jsonl_data(dataset_name, config['data_dir'])
    
    # Prepare texts for embedding
    corpus_texts = prepare_texts(corpus_df, combine_title=True)
    corpus_ids = corpus_df['_id'].tolist()
    
    query_texts = prepare_texts(queries_df, combine_title=False)  # Queries don't have titles
    query_ids = queries_df['_id'].tolist()
    
    # --- STEP 2: Embed Corpus ---
    print(f"\nüìä Embedding {len(corpus_texts)} documents...")
    max_length = config.get('max_length', 4096)
    corpus_embeddings = embed_model.encode(
        corpus_texts,
        batch_size=config['embed_batch_size'],
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
        max_length=max_length
    )
    
    # --- STEP 3: Build FAISS Index ---
    print(f"\nüîç Building FAISS index...")
    dimension = corpus_embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(corpus_embeddings.astype('float32'))
    print(f"   Index built with {index.ntotal} vectors")
    
    # Free memory
    del corpus_embeddings
    if device == 'cuda':
        import torch
        torch.cuda.empty_cache()
    
    # --- STEP 4: Retrieve Top-K Candidates ---
    print(f"\nüéØ Retrieving top-{config['top_k_retrieval']} candidates for {len(query_texts)} queries...")
    query_embeddings = embed_model.encode(
        query_texts,
        batch_size=config['embed_batch_size'],
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
        max_length=max_length
    )
    
    # Search FAISS index
    distances, indices = index.search(
        query_embeddings.astype('float32'),
        config['top_k_retrieval']
    )
    
    # Free memory
    del query_embeddings
    if device == 'cuda':
        import torch
        torch.cuda.empty_cache()
    
    # --- STEP 5: Rerank to Top-10 ---
    print(f"\n‚ö° Reranking to top-{config['top_k_final']}...")
    results = []
    rerank_batch_size = config.get('rerank_batch_size', 8)
    
    for i, query_id in enumerate(tqdm(query_ids, desc="Reranking")):
        query_text = query_texts[i]
        
        # Get candidates from retrieval step
        candidate_indices = indices[i]
        candidate_texts = [corpus_texts[idx] for idx in candidate_indices]
        candidate_ids = [corpus_ids[idx] for idx in candidate_indices]
        
        # Truncate candidate texts to prevent OOM in reranker
        max_rerank_len = 512
        candidate_texts = [text[:max_rerank_len*4] for text in candidate_texts]
        
        # Create pairs for reranker
        pairs = [[query_text, doc_text] for doc_text in candidate_texts]
        
        # Get reranking scores
        scores = reranker.predict(pairs, show_progress_bar=False, batch_size=rerank_batch_size)
        
        # Sort by score and take top-10
        scored_candidates = list(zip(candidate_ids, scores))
        scored_candidates.sort(key=lambda x: x[1], reverse=True)
        top_10 = scored_candidates[:config['top_k_final']]
        
        # Store results
        for corpus_id, score in top_10:
            results.append({
                'query_id': query_id,
                'corpus_id': corpus_id,
                'score': float(score)
            })
    
    results_df = pd.DataFrame(results)
    print(f"\n‚úÖ Completed {dataset_name}: {len(results_df)} results")
    
    # Clear GPU cache
    if device == 'cuda':
        import torch
        torch.cuda.empty_cache()
        print(f"   GPU cache cleared")
    
    return results_df

## Run Pipeline for All Datasets

## Generate Submission File

In [7]:
all_results = []
failed_datasets = []

print(f"Starting pipeline for {len(CONFIG['datasets'])} datasets...\n")

for dataset_name in CONFIG['datasets']:
    try:
        df_results = process_dataset(dataset_name, CONFIG)
        all_results.append(df_results)
    except Exception as e:
        print(f"\n‚ùå Error processing {dataset_name}: {str(e)}")
        failed_datasets.append(dataset_name)
        continue

print(f"\n{'='*60}")
print(f"Pipeline completed!")
print(f"  Successful: {len(all_results)}/{len(CONFIG['datasets'])} datasets")
if failed_datasets:
    print(f"  Failed: {', '.join(failed_datasets)}")
print(f"{'='*60}")

Starting pipeline for 7 datasets...


Processing: CONVFINQA
  Loaded 2066 corpus documents, 421 queries

üìä Embedding 2066 documents...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 259/259 [1:30:12<00:00, 20.90s/it]



üîç Building FAISS index...
   Index built with 2066 vectors

üéØ Retrieving top-50 candidates for 421 queries...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 53/53 [00:22<00:00,  2.31it/s]



‚ö° Reranking to top-10...


Reranking: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 421/421 [18:58<00:00,  2.70s/it]



‚úÖ Completed convfinqa: 4210 results

Processing: FINANCEBENCH
  Loaded 180 corpus documents, 150 queries

üìä Embedding 180 documents...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23/23 [02:42<00:00,  7.06s/it]



üîç Building FAISS index...
   Index built with 180 vectors

üéØ Retrieving top-50 candidates for 150 queries...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 19/19 [00:16<00:00,  1.17it/s]



‚ö° Reranking to top-10...


Reranking: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 150/150 [06:54<00:00,  2.77s/it]



‚úÖ Completed financebench: 1500 results

Processing: FINDER
  Loaded 13867 corpus documents, 216 queries

üìä Embedding 13867 documents...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1734/1734 [1:37:13<00:00,  3.36s/it] 



üîç Building FAISS index...
   Index built with 13867 vectors

üéØ Retrieving top-50 candidates for 216 queries...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 27/27 [00:09<00:00,  2.87it/s]



‚ö° Reranking to top-10...


Reranking: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 216/216 [07:01<00:00,  1.95s/it]



‚úÖ Completed finder: 2160 results

Processing: FINQA
  Loaded 2789 corpus documents, 1147 queries

üìä Embedding 2789 documents...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 349/349 [1:55:54<00:00, 19.93s/it]  



üîç Building FAISS index...
   Index built with 2789 vectors

üéØ Retrieving top-50 candidates for 1147 queries...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 144/144 [01:11<00:00,  2.02it/s]



‚ö° Reranking to top-10...


Reranking: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1147/1147 [52:20<00:00,  2.74s/it]



‚úÖ Completed finqa: 11470 results

Processing: FINQABENCH
  Loaded 92 corpus documents, 100 queries

üìä Embedding 92 documents...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [01:48<00:00,  9.07s/it]



üîç Building FAISS index...
   Index built with 92 vectors

üéØ Retrieving top-50 candidates for 100 queries...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13/13 [00:07<00:00,  1.78it/s]



‚ö° Reranking to top-10...


Reranking: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [04:14<00:00,  2.54s/it]



‚úÖ Completed finqabench: 1000 results

Processing: MULTIHEIRTT
  Loaded 10475 corpus documents, 974 queries

üìä Embedding 10475 documents...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1310/1310 [7:31:57<00:00, 20.70s/it]   



üîç Building FAISS index...
   Index built with 10475 vectors

üéØ Retrieving top-50 candidates for 974 queries...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 122/122 [01:07<00:00,  1.80it/s]



‚ö° Reranking to top-10...


Reranking: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 974/974 [45:52<00:00,  2.83s/it]



‚úÖ Completed multiheirtt: 9740 results

Processing: TATQA
  Loaded 2756 corpus documents, 1663 queries

üìä Embedding 2756 documents...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 345/345 [57:11<00:00,  9.95s/it] 



üîç Building FAISS index...
   Index built with 2756 vectors

üéØ Retrieving top-50 candidates for 1663 queries...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 208/208 [01:27<00:00,  2.36it/s]



‚ö° Reranking to top-10...


Reranking: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1663/1663 [2:10:52<00:00,  4.72s/it]    


‚úÖ Completed tatqa: 16630 results

Pipeline completed!
  Successful: 7/7 datasets





In [8]:
# Combine all results
if all_results:
    final_df = pd.concat(all_results, ignore_index=True)
    
    # Submission format: query_id, corpus_id (no score column)
    submission_df = final_df[['query_id', 'corpus_id']]
    
    # Save to CSV
    output_path = CONFIG['output_file']
    submission_df.to_csv(output_path, index=False)
    
    print(f"\n‚úÖ Submission file saved: {output_path}")
    print(f"   Total entries: {len(submission_df)}")
    print(f"   Unique queries: {submission_df['query_id'].nunique()}")
    print(f"   Expected format: query_id, corpus_id")
    
    # Show sample
    print(f"\nüìã First 10 rows:")
    print(submission_df.head(10))
    
    # Validation checks
    print(f"\nüîç Validation:")
    print(f"   - Each query should have 10 results: {submission_df.groupby('query_id').size().value_counts().to_dict()}")
    print(f"   - No null values: {submission_df.isnull().sum().sum() == 0}")
else:
    print("\n‚ùå No results to save. All datasets failed.")


‚úÖ Submission file saved: submission.csv
   Total entries: 46710
   Unique queries: 4671
   Expected format: query_id, corpus_id

üìã First 10 rows:
    query_id  corpus_id
0  qd4982518  dd4bb5506
1  qd4982518  dd4bb016e
2  qd4982518  dd4b9f7f6
3  qd4982518  dd4bf5c14
4  qd4982518  dd4971510
5  qd4982518  dd4be45d6
6  qd4982518  dd4bf6f9c
7  qd4982518  dd4c4f7aa
8  qd4982518  dd4bf1060
9  qd4982518  dd4b87d18

üîç Validation:
   - Each query should have 10 results: {10: 4671}
   - No null values: True


## Statistics & Analysis (Optional)

In [None]:
# Analyze results per dataset
if all_results and 'score' in final_df.columns:
    print("\nüìä Statistics by Dataset:")
    print("="*60)
    
    # Add dataset identifier (extract from query_id pattern)
    # Query IDs typically start with dataset name prefix
    
    for i, df in enumerate(all_results):
        dataset = CONFIG['datasets'][i] if i < len(CONFIG['datasets']) else f"Dataset_{i}"
        print(f"\n{dataset.upper()}:")
        print(f"  Queries: {df['query_id'].nunique()}")
        print(f"  Total results: {len(df)}")
        print(f"  Avg rerank score: {df['score'].mean():.4f}")
        print(f"  Min/Max score: {df['score'].min():.4f} / {df['score'].max():.4f}")
        
        # Check if all queries have exactly 10 results
        counts = df.groupby('query_id').size()
        if (counts == 10).all():
            print(f"  ‚úÖ All queries have exactly 10 results")
        else:
            print(f"  ‚ö†Ô∏è  Some queries don't have 10 results: {counts.value_counts().to_dict()}")


üìä Statistics by Dataset:

CONVFINQA:
  Queries: 421
  Total results: 4210
  Avg rerank score: 1.0205
  Min/Max score: -10.9718 / 8.8219
  ‚úÖ All queries have exactly 10 results

FINANCEBENCH:
  Queries: 150
  Total results: 1500
  Avg rerank score: -5.1912
  Min/Max score: -11.2769 / 8.8339
  ‚úÖ All queries have exactly 10 results

FINDER:
  Queries: 216
  Total results: 2160
  Avg rerank score: 0.8377
  Min/Max score: -10.2328 / 10.2082
  ‚úÖ All queries have exactly 10 results

FINQA:
  Queries: 1147
  Total results: 11470
  Avg rerank score: 1.0698
  Min/Max score: -10.9311 / 9.1062
  ‚úÖ All queries have exactly 10 results

FINQABENCH:
  Queries: 100
  Total results: 1000
  Avg rerank score: -0.9521
  Min/Max score: -11.2623 / 10.8378
  ‚úÖ All queries have exactly 10 results

MULTIHEIRTT:
  Queries: 974
  Total results: 9740
  Avg rerank score: 0.4115
  Min/Max score: -11.2165 / 9.1521
  ‚úÖ All queries have exactly 10 results

TATQA:
  Queries: 1663
  Total results: 16630
 

: 

---

## üéØ Next Steps for Improvement

Based on this baseline, you can improve performance by:

### 1. **Model Selection**
- Try `BAAI/bge-large-en-v1.5` for better quality (but slower)
- Try `BAAI/bge-reranker-v2-m3` for better reranking (needs FlagEmbedding library)
- Fine-tune models on financial domain data

### 2. **Text Preprocessing**
- Extract tables separately and format them better
- Handle multi-modal content (text + tables)
- Clean HTML artifacts, special characters

### 3. **Chunking Strategy**
- For long documents, split into chunks
- Use sliding window with overlap
- Aggregate scores from multiple chunks

### 4. **Retrieval Tuning**
- Adjust `top_k_retrieval` (try 100 instead of 50)
- Use hybrid search (BM25 + Dense)
- Add query expansion

### 5. **Reranking Optimization**
- Ensemble multiple rerankers
- Use domain-specific reranker
- Adjust reranking batch size for speed

### 6. **Post-Processing**
- Remove duplicate documents
- Apply business rules (e.g., prefer recent documents)
- Use metadata filtering

### 7. **Evaluation**
- Use qrels files to compute NDCG@10 locally
- Analyze failure cases
- Create validation split for tuning