## üì¶ Installation

In [1]:
# Install required packages
!pip install -q sentence-transformers faiss-cpu FlagEmbedding rank-bm25

## üìö Imports

In [2]:
# Core Libraries
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from typing import List, Dict, Tuple
import re

# Embedding & Retrieval
from sentence_transformers import SentenceTransformer
import faiss

# Reranking
from FlagEmbedding import FlagReranker

# BM25
from rank_bm25 import BM25Okapi

# Utils
import warnings
warnings.filterwarnings('ignore')
import logging
logging.disable(logging.CRITICAL)

In [4]:
# Check GPU
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    device = 'cuda'
else:
    print("Running on CPU")
    device = 'cpu'

CUDA available: True
GPU: NVIDIA GeForce RTX 3050 Laptop GPU


## ‚öôÔ∏è Configuration

In [5]:
CONFIG = {
    'data_dir': '../data',
    'output_file': 'submission_improved.csv',
    
    'datasets': [
        'convfinqa', 'financebench', 'finder',
        'finqa', 'finqabench', 'multiheirtt', 'tatqa'
    ],
    
    # Models - UPGRADED
    'embedding_model': 'BAAI/bge-large-en-v1.5',
    'reranker_model': 'BAAI/bge-reranker-v2-m3',
    
    # Chunking - NEW
    'use_chunking': True,
    'chunk_size': 512,
    'chunk_overlap': 128,
    'chunk_aggregation': 'max',
    'preserve_tables': True,
    
    # Hybrid - NEW
    'use_hybrid': True,
    'hybrid_alpha': 0.6,
    
    # Parameters - INCREASED
    'top_k_retrieval': 100,  # from 50
    'top_k_rerank': 50,
    'top_k_final': 10,
    
    'embed_batch_size': 16,
    'rerank_batch_size': 16,
    'max_length': 512,
    
    'eval_on_qrels': True,
}

print("‚úÖ Configuration loaded")
for k, v in CONFIG.items():
    if k != 'datasets':
        print(f"  {k}: {v}")

‚úÖ Configuration loaded
  data_dir: ../data
  output_file: submission_improved.csv
  embedding_model: BAAI/bge-large-en-v1.5
  reranker_model: BAAI/bge-reranker-v2-m3
  use_chunking: True
  chunk_size: 512
  chunk_overlap: 128
  chunk_aggregation: max
  preserve_tables: True
  use_hybrid: True
  hybrid_alpha: 0.6
  top_k_retrieval: 100
  top_k_rerank: 50
  top_k_final: 10
  embed_batch_size: 16
  rerank_batch_size: 16
  max_length: 512
  eval_on_qrels: True


## üîß Helper Functions

In [6]:
def load_jsonl_data(dataset_name: str, data_dir: str):
    """Load corpus, queries, and qrels"""
    corpus_path = os.path.join(data_dir, f"{dataset_name}_corpus.jsonl", "corpus.jsonl")
    queries_path = os.path.join(data_dir, f"{dataset_name}_queries.jsonl", "queries.jsonl")
    qrels_path = os.path.join(data_dir, f"{dataset_name}_qrels.tsv")
    
    corpus_df = pd.read_json(corpus_path, lines=True)
    queries_df = pd.read_json(queries_path, lines=True)
    
    qrels_df = None
    if os.path.exists(qrels_path):
        qrels_df = pd.read_csv(qrels_path, sep='\t')
    
    print(f"  Loaded {len(corpus_df)} docs, {len(queries_df)} queries")
    return corpus_df, queries_df, qrels_df

In [7]:
def detect_tables(text: str) -> List[Tuple[int, int]]:
    """Detect table regions using heuristics"""
    lines = text.split('\n')
    table_regions = []
    in_table = False
    table_start = 0
    
    for i, line in enumerate(lines):
        is_table = (
            line.count('|') >= 2 or
            line.count('\t') >= 2 or
            len(re.findall(r'\s{3,}', line)) >= 2
        )
        
        if is_table and not in_table:
            in_table = True
            table_start = max(0, i - 1)
        elif not is_table and in_table:
            in_table = False
            table_end = min(len(lines), i + 1)
            if table_end - table_start >= 3:
                table_regions.append((table_start, table_end))
    
    if in_table:
        table_regions.append((table_start, len(lines)))
    
    return table_regions


def chunk_text_simple(text: str, chunk_size: int, overlap: int) -> List[str]:
    """Sliding window chunking"""
    words = text.split()
    if len(words) <= chunk_size:
        return [text]
    
    chunks = []
    step = chunk_size - overlap
    for i in range(0, len(words), step):
        chunk = ' '.join(words[i:i + chunk_size])
        if len(words[i:i + chunk_size]) >= 50:
            chunks.append(chunk)
    return chunks


def chunk_document_smart(doc_id: str, title: str, text: str, 
                        chunk_size: int, overlap: int, preserve_tables: bool):
    """Smart chunking with table preservation"""
    chunks = []
    
    if not preserve_tables or len(text) < 500:
        for i, chunk_text in enumerate(chunk_text_simple(text, chunk_size, overlap)):
            chunks.append({
                'chunk_id': f"{doc_id}_c{i}",
                'text': f"[{title}] {chunk_text}",
                'doc_id': doc_id
            })
        return chunks
    
    lines = text.split('\n')
    table_regions = detect_tables(text)
    
    if not table_regions:
        for i, chunk_text in enumerate(chunk_text_simple(text, chunk_size, overlap)):
            chunks.append({
                'chunk_id': f"{doc_id}_c{i}",
                'text': f"[{title}] {chunk_text}",
                'doc_id': doc_id
            })
        return chunks
    
    chunk_idx = 0
    prev_end = 0
    
    for table_start, table_end in table_regions:
        # Text before table
        if table_start > prev_end:
            before = '\n'.join(lines[prev_end:table_start])
            if before.strip():
                for chunk_text in chunk_text_simple(before, chunk_size, overlap):
                    chunks.append({
                        'chunk_id': f"{doc_id}_c{chunk_idx}",
                        'text': f"[{title}] {chunk_text}",
                        'doc_id': doc_id
                    })
                    chunk_idx += 1
        
        # Table as single chunk
        table_text = '\n'.join(lines[table_start:table_end])
        chunks.append({
            'chunk_id': f"{doc_id}_t{chunk_idx}",
            'text': f"[TABLE from {title}]\n{table_text}",
            'doc_id': doc_id
        })
        chunk_idx += 1
        prev_end = table_end
    
    # Text after table
    if prev_end < len(lines):
        after = '\n'.join(lines[prev_end:])
        if after.strip():
            for chunk_text in chunk_text_simple(after, chunk_size, overlap):
                chunks.append({
                    'chunk_id': f"{doc_id}_c{chunk_idx}",
                    'text': f"[{title}] {chunk_text}",
                    'doc_id': doc_id
                })
                chunk_idx += 1
    
    return chunks

In [8]:
def normalize_scores(scores: np.ndarray) -> np.ndarray:
    """Normalize to [0, 1]"""
    if scores.max() == scores.min():
        return np.ones_like(scores)
    return (scores - scores.min()) / (scores.max() - scores.min())


def hybrid_search(query_emb, query_text, faiss_index, bm25, corpus_texts, top_k, alpha=0.6):
    """Hybrid: Dense + BM25"""
    # Dense
    dense_scores, indices = faiss_index.search(
        query_emb.reshape(1, -1).astype('float32'), top_k * 2
    )
    dense_scores = dense_scores[0]
    indices = indices[0]
    
    # BM25
    query_tokens = query_text.lower().split()
    bm25_scores = bm25.get_scores(query_tokens)
    bm25_subset = bm25_scores[indices]
    
    # Normalize
    dense_norm = normalize_scores(dense_scores)
    bm25_norm = normalize_scores(bm25_subset)
    
    # Combine
    hybrid = alpha * dense_norm + (1 - alpha) * bm25_norm
    
    # Re-sort
    sorted_idx = np.argsort(hybrid)[::-1][:top_k]
    return hybrid[sorted_idx], indices[sorted_idx]


def aggregate_chunk_scores(chunk_scores: Dict, method='max') -> Dict:
    """Aggregate chunk scores to doc scores"""
    aggregated = {}
    for doc_id, scores in chunk_scores.items():
        if method == 'max':
            aggregated[doc_id] = max(scores)
        elif method == 'mean':
            aggregated[doc_id] = np.mean(scores)
        elif method == 'weighted':
            weights = np.array([1.0 / (i + 1) for i in range(len(scores))])
            weights = weights / weights.sum()
            aggregated[doc_id] = np.dot(scores, weights)
        else:
            aggregated[doc_id] = max(scores)
    return aggregated

In [9]:
def compute_ndcg(qrels: Dict, results: Dict, k=10) -> float:
    """Compute NDCG@k"""
    ndcg_scores = []
    for query_id, retrieved in results.items():
        if query_id not in qrels:
            continue
        
        relevant = qrels[query_id]
        retrieved_k = retrieved[:k]
        
        # DCG
        dcg = sum(relevant.get(doc_id, 0) / np.log2(i + 2) 
                  for i, doc_id in enumerate(retrieved_k))
        
        # IDCG
        ideal = sorted(relevant.values(), reverse=True)[:k]
        idcg = sum(rel / np.log2(i + 2) for i, rel in enumerate(ideal))
        
        if idcg > 0:
            ndcg_scores.append(dcg / idcg)
    
    return np.mean(ndcg_scores) if ndcg_scores else 0.0


def evaluate_results(results_df, qrels_df, k=10):
    """Evaluate with qrels"""
    qrels = qrels_df.groupby('query_id').apply(
        lambda x: dict(zip(x['corpus_id'], x['score']))
    ).to_dict()
    
    results = results_df.groupby('query_id')['corpus_id'].apply(list).to_dict()
    
    ndcg = compute_ndcg(qrels, results, k)
    return {'NDCG@10': ndcg, 'num_queries': len(results), 'num_qrels': len(qrels)}

## ü§ñ Load Models

In [10]:
print("Loading models...")

# Embedding
print(f"\n1. Loading: {CONFIG['embedding_model']}")
embed_model = SentenceTransformer(CONFIG['embedding_model'], device=device)
print("   ‚úÖ Done")

# Reranker
print(f"\n2. Loading: {CONFIG['reranker_model']}")
reranker = FlagReranker(CONFIG['reranker_model'], use_fp16=(device=='cuda'))
print("   ‚úÖ Done")

print("\n‚úÖ All models loaded!")

Loading models...

1. Loading: BAAI/bge-large-en-v1.5
   ‚úÖ Done

2. Loading: BAAI/bge-reranker-v2-m3
   ‚úÖ Done

‚úÖ All models loaded!


## üîÑ Main Pipeline

In [11]:
def process_dataset_improved(dataset_name: str, config: Dict):
    """Improved pipeline"""
    print(f"\n{'='*60}")
    print(f"Processing: {dataset_name.upper()}")
    print(f"{'='*60}")
    
    # Load
    corpus_df, queries_df, qrels_df = load_jsonl_data(dataset_name, config['data_dir'])
    
    # Chunk
    print(f"\nüìÑ Chunking...")
    all_chunks = []
    chunk_to_doc = {}
    
    if config['use_chunking']:
        for _, row in tqdm(corpus_df.iterrows(), total=len(corpus_df), desc="Chunk"):
            chunks = chunk_document_smart(
                row['_id'], str(row.get('title', '')), str(row.get('text', '')),
                config['chunk_size'], config['chunk_overlap'], config['preserve_tables']
            )
            for c in chunks:
                all_chunks.append(c)
                chunk_to_doc[c['chunk_id']] = c['doc_id']
        print(f"   {len(all_chunks)} chunks from {len(corpus_df)} docs")
    else:
        for _, row in corpus_df.iterrows():
            doc_id = row['_id']
            text = f"[{row.get('title', '')}] {row.get('text', '')}"
            all_chunks.append({'chunk_id': doc_id, 'text': text, 'doc_id': doc_id})
            chunk_to_doc[doc_id] = doc_id
    
    chunk_texts = [c['text'] for c in all_chunks]
    chunk_ids = [c['chunk_id'] for c in all_chunks]
    
    # Embed
    print(f"\nüî¢ Embedding...")
    chunk_embeddings = embed_model.encode(
        chunk_texts, batch_size=config['embed_batch_size'],
        show_progress_bar=True, convert_to_numpy=True,
        normalize_embeddings=True, max_length=config['max_length']
    )
    
    # FAISS
    print(f"\nüîç Building FAISS...")
    index = faiss.IndexFlatIP(chunk_embeddings.shape[1])
    index.add(chunk_embeddings.astype('float32'))
    
    # BM25
    bm25 = None
    if config['use_hybrid']:
        print(f"\nüî§ Building BM25...")
        tokenized = [t.lower().split() for t in chunk_texts]
        bm25 = BM25Okapi(tokenized)
    
    del chunk_embeddings
    if device == 'cuda':
        torch.cuda.empty_cache()
    
    # Queries
    print(f"\nüéØ Processing queries...")
    query_texts = [str(r.get('text', '')) for _, r in queries_df.iterrows()]
    query_ids = queries_df['_id'].tolist()
    
    query_embeddings = embed_model.encode(
        query_texts, batch_size=config['embed_batch_size'],
        show_progress_bar=True, convert_to_numpy=True,
        normalize_embeddings=True, max_length=config['max_length']
    )
    
    # Retrieve
    results = []
    for i, query_id in enumerate(tqdm(query_ids, desc="Retrieve+Rerank")):
        query_emb = query_embeddings[i]
        query_text = query_texts[i]
        
        # Hybrid or dense
        if config['use_hybrid'] and bm25:
            scores, chunk_indices = hybrid_search(
                query_emb, query_text, index, bm25, chunk_texts,
                config['top_k_retrieval'], config['hybrid_alpha']
            )
        else:
            scores, chunk_indices = index.search(
                query_emb.reshape(1, -1).astype('float32'),
                config['top_k_retrieval']
            )
            scores, chunk_indices = scores[0], chunk_indices[0]
        
        # Aggregate to docs
        doc_scores = {}
        for idx, score in zip(chunk_indices, scores):
            doc_id = chunk_to_doc[chunk_ids[idx]]
            if doc_id not in doc_scores:
                doc_scores[doc_id] = []
            doc_scores[doc_id].append(float(score))
        
        doc_agg = aggregate_chunk_scores(doc_scores, config['chunk_aggregation'])
        sorted_docs = sorted(doc_agg.items(), key=lambda x: x[1], reverse=True)[:config['top_k_rerank']]
        
        # Rerank
        candidate_ids = [d[0] for d in sorted_docs]
        candidate_texts = [
            str(corpus_df[corpus_df['_id']==d]['text'].values[0])[:2048]
            for d in candidate_ids
        ]
        
        pairs = [[query_text, t] for t in candidate_texts]
        rerank_scores = reranker.compute_score(pairs)
        
        if not isinstance(rerank_scores, list):
            rerank_scores = [rerank_scores]
        
        scored = list(zip(candidate_ids, rerank_scores))
        scored.sort(key=lambda x: x[1], reverse=True)
        
        for doc_id, score in scored[:config['top_k_final']]:
            results.append({
                'query_id': query_id,
                'corpus_id': doc_id,
                'score': float(score)
            })
    
    results_df = pd.DataFrame(results)
    print(f"\n‚úÖ Done: {len(results_df)} results")
    
    # Evaluate
    eval_metrics = {}
    if config['eval_on_qrels'] and qrels_df is not None:
        print(f"\nüìä Evaluating...")
        eval_metrics = evaluate_results(results_df, qrels_df)
        print(f"   NDCG@10: {eval_metrics['NDCG@10']:.4f}")
    
    del query_embeddings, index
    if device == 'cuda':
        torch.cuda.empty_cache()
    
    return results_df, eval_metrics

## üöÄ Run Pipeline

In [12]:
all_results = []
all_eval = {}
failed = []

for dataset in CONFIG['datasets']:
    try:
        df_res, metrics = process_dataset_improved(dataset, CONFIG)
        all_results.append(df_res)
        if metrics:
            all_eval[dataset] = metrics
    except Exception as e:
        print(f"\n‚ùå Error: {dataset}: {e}")
        import traceback
        traceback.print_exc()
        failed.append(dataset)

print(f"\n{'='*60}")
print(f"‚úÖ Done: {len(all_results)}/{len(CONFIG['datasets'])}")
if failed:
    print(f"‚ùå Failed: {failed}")


Processing: CONVFINQA
  Loaded 2066 docs, 421 queries

üìÑ Chunking...


Chunk: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2066/2066 [00:00<00:00, 2088.73it/s]


   7473 chunks from 2066 docs

üî¢ Embedding...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 468/468 [08:23<00:00,  1.08s/it]



üîç Building FAISS...

üî§ Building BM25...

üéØ Processing queries...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 27/27 [00:02<00:00, 12.51it/s]
Retrieve+Rerank: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 421/421 [23:15<00:00,  3.31s/it]



‚úÖ Done: 4210 results

üìä Evaluating...
   NDCG@10: 0.4830

Processing: FINANCEBENCH
  Loaded 180 docs, 150 queries

üìÑ Chunking...


Chunk: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 180/180 [00:00<00:00, 3806.58it/s]


   183 chunks from 180 docs

üî¢ Embedding...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [00:15<00:00,  1.29s/it]



üîç Building FAISS...

üî§ Building BM25...

üéØ Processing queries...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:01<00:00,  5.38it/s]
Retrieve+Rerank: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 150/150 [08:11<00:00,  3.28s/it]



‚úÖ Done: 1500 results

üìä Evaluating...
   NDCG@10: 0.3439

Processing: FINDER
  Loaded 13867 docs, 216 queries

üìÑ Chunking...


Chunk: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13867/13867 [00:01<00:00, 12599.14it/s]


   13929 chunks from 13867 docs

üî¢ Embedding...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 871/871 [11:57<00:00,  1.21it/s]



üîç Building FAISS...

üî§ Building BM25...

üéØ Processing queries...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14/14 [00:00<00:00, 16.64it/s]
Retrieve+Rerank: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 216/216 [11:18<00:00,  3.14s/it]



‚úÖ Done: 2160 results

üìä Evaluating...
   NDCG@10: 0.3612

Processing: FINQA
  Loaded 2789 docs, 1147 queries

üìÑ Chunking...


Chunk: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2789/2789 [00:00<00:00, 3890.86it/s]


   10106 chunks from 2789 docs

üî¢ Embedding...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 632/632 [12:24<00:00,  1.18s/it]



üîç Building FAISS...

üî§ Building BM25...

üéØ Processing queries...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 72/72 [00:07<00:00, 10.11it/s]
Retrieve+Rerank: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1147/1147 [1:03:44<00:00,  3.33s/it]



‚úÖ Done: 11470 results

üìä Evaluating...
   NDCG@10: 0.4382

Processing: FINQABENCH
  Loaded 92 docs, 100 queries

üìÑ Chunking...


Chunk: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 92/92 [00:00<00:00, 5641.46it/s]


   115 chunks from 92 docs

üî¢ Embedding...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 [00:06<00:00,  1.16it/s]



üîç Building FAISS...

üî§ Building BM25...

üéØ Processing queries...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00,  9.41it/s]
Retrieve+Rerank: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [05:28<00:00,  3.28s/it]



‚úÖ Done: 1000 results

üìä Evaluating...
   NDCG@10: 0.8662

Processing: MULTIHEIRTT
  Loaded 10475 docs, 974 queries

üìÑ Chunking...


Chunk: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10475/10475 [00:01<00:00, 5316.53it/s]


   23084 chunks from 10475 docs

üî¢ Embedding...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1443/1443 [32:12<00:00,  1.34s/it]



üîç Building FAISS...

üî§ Building BM25...

üéØ Processing queries...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 61/61 [00:06<00:00,  9.55it/s]
Retrieve+Rerank: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 974/974 [56:43<00:00,  3.49s/it]



‚úÖ Done: 9740 results

üìä Evaluating...
   NDCG@10: 0.1467

Processing: TATQA
  Loaded 2756 docs, 1663 queries

üìÑ Chunking...


Chunk: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2756/2756 [00:00<00:00, 5197.36it/s]


   5760 chunks from 2756 docs

üî¢ Embedding...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 360/360 [06:41<00:00,  1.12s/it]



üîç Building FAISS...

üî§ Building BM25...

üéØ Processing queries...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 104/104 [00:08<00:00, 12.82it/s]
Retrieve+Rerank: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1663/1663 [5:49:41<00:00, 12.62s/it]  


‚úÖ Done: 16630 results

üìä Evaluating...
   NDCG@10: 0.4935

‚úÖ Done: 7/7





## üìä Evaluation Summary

In [13]:
if all_eval:
    print("\nüìä Local Evaluation (30% qrels):")
    print("="*60)
    
    total_ndcg = 0
    total_queries = 0
    
    for ds, m in all_eval.items():
        print(f"\n{ds.upper()}: NDCG@10 = {m['NDCG@10']:.4f}")
        total_ndcg += m['NDCG@10'] * m['num_qrels']
        total_queries += m['num_qrels']
    
    if total_queries > 0:
        avg_ndcg = total_ndcg / total_queries
        print(f"\n{'='*60}")
        print(f"üìà AVERAGE NDCG@10: {avg_ndcg:.4f}")
        print(f"{'='*60}")
        
        baseline = 0.328
        gain = avg_ndcg - baseline
        gain_pct = (gain / baseline) * 100
        
        print(f"\nüéØ vs Baseline:")
        print(f"   Baseline: {baseline:.4f}")
        print(f"   Improved: {avg_ndcg:.4f}")
        print(f"   Gain: +{gain:.4f} ({gain_pct:+.1f}%)")
        
        if avg_ndcg >= 0.58:
            print(f"\nüèÜ Likely TOP 3!")
        elif avg_ndcg >= 0.50:
            print(f"\n‚úÖ Good progress, tune more!")
        else:
            print(f"\n‚ö†Ô∏è Need more work")


üìä Local Evaluation (30% qrels):

CONVFINQA: NDCG@10 = 0.4830

FINANCEBENCH: NDCG@10 = 0.3439

FINDER: NDCG@10 = 0.3612

FINQA: NDCG@10 = 0.4382

FINQABENCH: NDCG@10 = 0.8662

MULTIHEIRTT: NDCG@10 = 0.1467

TATQA: NDCG@10 = 0.4935

üìà AVERAGE NDCG@10: 0.4037

üéØ vs Baseline:
   Baseline: 0.3280
   Improved: 0.4037
   Gain: +0.0757 (+23.1%)

‚ö†Ô∏è Need more work


## üíæ Generate Submission

In [14]:
if all_results:
    final_df = pd.concat(all_results, ignore_index=True)
    submission_df = final_df[['query_id', 'corpus_id']]
    
    submission_df.to_csv(CONFIG['output_file'], index=False)
    
    print(f"\n‚úÖ Saved: {CONFIG['output_file']}")
    print(f"   Entries: {len(submission_df)}")
    print(f"   Queries: {submission_df['query_id'].nunique()}")
    
    print(f"\nüìã Sample:")
    print(submission_df.head(10))
    
    counts = submission_df.groupby('query_id').size()
    print(f"\nüîç Validation:")
    print(f"   Per query: {counts.value_counts().to_dict()}")
    if (counts == 10).all():
        print(f"   ‚úÖ All queries have 10 results")
else:
    print("\n‚ùå No results")


‚úÖ Saved: submission_improved.csv
   Entries: 46710
   Queries: 4671

üìã Sample:
    query_id  corpus_id
0  qd4982518  dd4c4f7aa
1  qd4982518  dd4bb016e
2  qd4982518  dd4b9f7f6
3  qd4982518  dd4bb5506
4  qd4982518  dd4bbdb16
5  qd4982518  dd4b87d18
6  qd4982518  dd4be45d6
7  qd4982518  dd4bd3790
8  qd4982518  dd4c0119a
9  qd4982518  dd4b89cbc

üîç Validation:
   Per query: {10: 4671}
   ‚úÖ All queries have 10 results


## üéØ Summary

In [None]:
print("\n" + "="*60)
print("üéâ IMPROVED PIPELINE COMPLETED!")
print("="*60)

print("\n‚úÖ Improvements:")
print("   1. Table-aware chunking")
print("   2. BGE-reranker-v2-m3 (SOTA)")
print("   3. Hybrid retrieval (BM25+Dense)")
print("   4. Local evaluation")
print("   5. Optimized parameters")

print("\nüíæ Next: Submit to Kaggle!")
print("="*60)


üéâ IMPROVED PIPELINE COMPLETED!

‚úÖ Improvements:
   1. Table-aware chunking
   2. BGE-reranker-v2-m3 (SOTA)
   3. Hybrid retrieval (BM25+Dense)
   4. Local evaluation
   5. Optimized parameters

üíæ Next: Submit to Kaggle!


: 