# Evaluate bo767 retrieval recall accuracy with NV-Ingest and Milvus

In this notebook, we'll use NV-Ingest and LlamaIndex to measure the end-to-end recall accuracy of a retrieval pipeline made up of NV-Ingest's extraction and embedding tasks and a Milvus vector database (VDB).

In [46]:
from pymilvus import MilvusClient

# Connect to Milvus and drop the four collections this notebook works with,
# so that they can be created again from scratch further down.
milvus_client = MilvusClient("http://localhost:19530")
for stale_collection in ('text', 'tables', 'charts', 'multimodal'):
    milvus_client.drop_collection(collection_name=stale_collection)

In [None]:
from nv_ingest_client.client import Ingestor

# Build the ingestion job one stage at a time:
# input files -> extraction options -> embedding.
ingestor = Ingestor(message_client_hostname="localhost").files("../data/bo767/*.pdf")
ingestor = ingestor.extract(
    extract_text=True,
    extract_tables=True,
    extract_charts=True,
    extract_images=False,
    text_depth="page",
)
ingestor = ingestor.embed()

# Run the job; `results` is what the cells below save, split and upload.
results = ingestor.ingest()

In [None]:
# Optional: save results
import pickle

# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes; with a bare open() the handle stays open and the pickle on
# disk may still be incomplete when it is read back.
with open('bo767_results.obj', 'wb') as filehandler:
    pickle.dump(results, filehandler)

In [26]:
# Optional: load results
import pickle

# NOTE: pickle.load can execute arbitrary code while deserializing. Only load a
# 'bo767_results.obj' that you produced yourself with the save cell.
# The context manager closes the file handle once loading is done.
with open('bo767_results.obj', 'rb') as filehandler:
    results = pickle.load(filehandler)

In [37]:
# Sanity check: number of entries in `results` (the cells below treat each
# entry as a list of extracted elements).
len(results)

767

In [52]:
from nv_ingest_client.util.milvus import write_to_nvingest_collection, create_nvingest_collection, nvingest_retrieval

# Settings shared by collection creation, upload and retrieval.
sparse = False
milvus_hostname = "localhost"

# Create one collection per modality plus a combined "multimodal" one.
for new_collection in ("text", "tables", "charts", "multimodal"):
    create_nvingest_collection(
        new_collection,
        f"http://{milvus_hostname}:19530",
        sparse=sparse,
        gpu_search=True,
    )

In [39]:
# Split every entry of `results` by modality. The outer list structure is kept
# (one inner list per entry of `results`); only the elements inside are filtered.
text_results = [
    [item for item in doc_elements if item['document_type'] == 'text']
    for doc_elements in results
]
table_results = [
    [item for item in doc_elements if item['metadata']['content_metadata']['subtype'] == 'table']
    for doc_elements in results
]
chart_results = [
    [item for item in doc_elements if item['metadata']['content_metadata']['subtype'] == 'chart']
    for doc_elements in results
]

In [53]:
# Upload each subset to its own collection; the unfiltered `results` go to
# "multimodal". The order matches the collection creation order.
for payload, target_collection in (
    (text_results, "text"),
    (table_results, "tables"),
    (chart_results, "charts"),
    (results, "multimodal"),
):
    write_to_nvingest_collection(
        payload,
        target_collection,
        sparse=sparse,
        milvus_uri=f"http://{milvus_hostname}:19530",
        minio_endpoint="localhost:9000",
    )

Wrote data to: [['0ca30e66-ca9e-4875-bab6-66535bee39ea/1.parquet']]
Start time: 2025-01-24 06:17:49
Imported row count: 45816
Bulk text upload took 54.15791320800781 s
Wrote data to: [['7f738ffc-7fd5-4ce3-86dd-b9ab9c515ddc/1.parquet']]
Start time: 2025-01-24 06:19:11
Imported row count: 27193
Bulk tables upload took 38.110397815704346 s
Wrote data to: [['e5460db7-5d86-47bc-981b-46c810cc9d45/1.parquet']]
Start time: 2025-01-24 06:20:12
Imported row count: 5667
Bulk charts upload took 17.05130910873413 s
Wrote data to: [['c61b35ef-4224-467d-ba98-fd5e5659b6b1/1.parquet']]
Start time: 2025-01-24 06:21:10
Imported row count: 78676
Bulk multimodal upload took 87.26382780075073 s


In [35]:
from collections import defaultdict
from tqdm import tqdm
import os
import numpy as np

def get_recall_scores(query_df, collection_name):
    """Print recall@k (k = 1, 3, 5, 10) for the queries in ``query_df``.

    Every query is sent to ``nvingest_retrieval`` against ``collection_name``
    with ``top_k=10``. A query counts as a hit at ``k`` when its expected
    ``"<pdf>_<page>"`` key appears among the first ``k`` retrieved results.

    Parameters
    ----------
    query_df : pandas.DataFrame
        Must provide a ``query`` column (the query text) and a ``pdf_page``
        column (the expected ``"<pdf>_<page>"`` key). Rows are read by
        position, so the frame's index does not have to be 0..n-1.
    collection_name : str
        Name of the collection to search.

    Returns
    -------
    None
        The scores are printed, one line per ``k``.

    Notes
    -----
    The module-level ``sparse`` flag is passed to the retrieval call as
    ``hybrid``.
    """
    # Use the frame that was passed in (not a notebook-level global) and read
    # it positionally so that filtered or re-indexed frames work as well.
    queries = query_df["query"].to_list()
    expected_pdf_pages = query_df["pdf_page"].to_list()

    if not queries:
        print("  - No queries to evaluate")
        return

    all_answers = nvingest_retrieval(
        queries,
        collection_name,
        hybrid=sparse,
        embedding_endpoint="http://localhost:8012/v1",
        model_name="nvidia/llama-3.2-nv-embedqa-1b-v2",
        top_k=10,
        gpu_search=True,
    )

    hits = defaultdict(list)
    # NOTE(review): assumes one result list per query, in query order — confirm
    # against nvingest_retrieval.
    for expected_pdf_page, retrieved_answers in zip(expected_pdf_pages, all_answers):
        retrieved_pdf_pages = []
        for result in retrieved_answers:
            entity = result['entity']
            # File name of the source document up to its first dot.
            pdf = os.path.basename(entity['source']['source_id']).split('.')[0]
            page = entity['content_metadata']['page_number']
            retrieved_pdf_pages.append(f"{pdf}_{page}")

        for k in [1, 3, 5, 10]:
            hits[k].append(expected_pdf_page in retrieved_pdf_pages[:k])

    for k in hits:
        print(f'  - Recall @{k}: {np.mean(hits[k]) :.3f}')

## Text Recall

In [65]:
import pandas as pd

# Text ground truth: strip the ".pdf" suffix from the document name and build
# the "<pdf>_<page>" key that retrieved hits are compared against.
df_query = (
    pd.read_csv('../data/text_query_answer_gt_page.csv')
    .assign(pdf=lambda frame: frame['pdf'].map(lambda name: name.replace('.pdf', '')))
    .assign(pdf_page=lambda frame: frame.apply(lambda row: f"{row['pdf']}_{row['gt_page']}", axis=1))
)
df_query

Unnamed: 0,pdf,query,answer,gt_page,pdf_page
0,1102434,How much was the ARtillery Intelligence projec...,$4.2 billion,19,1102434_19
1,1102434,How much revenue of AR advertising is expected...,$8.8 billion,3,1102434_3
2,1096078,What types of statistics were utilized by Rein...,descriptive statistics,3,1096078_3
3,1054125,What was the maximum amount requested for cond...,"$35,000.00",1,1054125_1
4,1246906,What is the median household income for the Ci...,"$53,278",7,1246906_7
...,...,...,...,...,...
483,2089825,Under the Climate Action and Low Carbon Develo...,Denis Naughten TD,0,2089825_0
484,2089825,How many organizations make up Stop Climate Ch...,30,5,2089825_5
485,2098077,What is the maximum length of Sai Yok bent-­to...,2.4 inches,1,2098077_1
486,2098077,What characteristic sets the Sai Yok Bent-toed...,enlarged thigh scales,1,2098077_1


In [55]:
# Recall@k of the "text" collection on the text ground-truth queries loaded above.
get_recall_scores(df_query, "text")

  - Recall @1: 0.627
  - Recall @3: 0.826
  - Recall @5: 0.877
  - Recall @10: 0.914


## Table Recall

In [64]:
# Table ground truth: keep the columns of interest and build the
# "<pdf>_<page>" key that retrieved hits are compared against.
df_query = (
    pd.read_csv('../data/table_queries_cleaned_235.csv')[['query', 'pdf', 'page', 'table']]
    .assign(pdf_page=lambda frame: frame.apply(lambda row: f"{row['pdf']}_{row['page']}", axis=1))
)
df_query

Unnamed: 0,query,pdf,page,table,pdf_page
0,How much did Pendleton County spend out of the...,1003421,2,1003421_2_0,1003421_2
1,How many units are occupied by single families...,1008059,6,1008059_6_1,1008059_6
2,"In the Klamath county, what is the total valua...",1008059,6,1008059_6_1,1008059_6
3,How much did Nalco pay GRIDCO for electricity ...,1011810,21,1011810_21_0,1011810_21
4,How much coal is used at Alumina refinery of N...,1011810,21,1011810_21_2,1011810_21
...,...,...,...,...,...
230,How much is the rental income from water plant...,2407280,30,2407280_30_0,2407280_30
231,In 2020 how much were the supplemental taxes f...,2415001,65,not detected,2415001_65
232,"As of 2020, what is the total of collections a...",2415001,65,not detected,2415001_65
233,What was the net gain from the operations of t...,2416020,84,2416020_84_0,2416020_84


In [57]:
# Recall@k of the "tables" collection on the table ground-truth queries loaded above.
get_recall_scores(df_query, "tables")

  - Recall @1: 0.502
  - Recall @3: 0.732
  - Recall @5: 0.787
  - Recall @10: 0.855


## Chart Recall

In [58]:
# Chart ground truth. Page numbers in this CSV start at 1, so shift them down
# by one before building the "<pdf>_<page>" key.
df_query = (
    pd.read_csv('../data/charts_with_page_num_fixed.csv')[['query', 'pdf', 'page']]
    .assign(page=lambda frame: frame['page'] - 1)
    .assign(pdf_page=lambda frame: frame.apply(lambda row: f"{row['pdf']}_{row['page']}", axis=1))
)
df_query

Unnamed: 0,query,pdf,page,pdf_page
0,What are the top three consumer complaint cate...,1009210,11,1009210_11
1,Which 3 categories did extremely well in terms...,1009210,11,1009210_11
2,What's the longest recent US recession?,1010876,0,1010876_0
3,Is the 12-Month default rate usually higher th...,1010876,0,1010876_0
4,Which allegation is submitted highest to RTAs ...,1014669,0,1014669_0
...,...,...,...,...
263,"After the 2008 recession, what percentage of p...",2384395,6,2384395_6
264,what were the top 3 major religious groups in ...,2392676,5,2392676_5
265,What percentage of people in the world identif...,2392676,5,2392676_5
266,"Between 2003 and 2019, has the household mortg...",2410699,189,2410699_189


In [59]:
# Recall@k of the "charts" collection on the chart ground-truth queries loaded above.
get_recall_scores(df_query, "charts")

  - Recall @1: 0.612
  - Recall @3: 0.743
  - Recall @5: 0.795
  - Recall @10: 0.817


## Multimodal Recall

In [60]:
# Text queries: rename gt_page -> page so all three sets share one schema, and
# strip the ".pdf" suffix from the document name.
df_query = (
    pd.read_csv('../data/text_query_answer_gt_page.csv')
    .rename(columns={'gt_page': 'page'})[['query', 'pdf', 'page']]
    .assign(
        pdf=lambda frame: frame['pdf'].map(lambda name: name.replace('.pdf', '')),
        modality='text',
    )
)

# Table queries.
df_query2 = (
    pd.read_csv('../data/table_queries_cleaned_235.csv')[['query', 'pdf', 'page']]
    .assign(modality='table')
)

# Chart queries. Page numbers in this CSV start at 1, so shift them down by one.
df_query3 = (
    pd.read_csv('../data/charts_with_page_num_fixed.csv')[['query', 'pdf', 'page']]
    .assign(page=lambda frame: frame['page'] - 1, modality='chart')
)

# Stack the three sets with a fresh 0..n-1 index, then build the
# "<pdf>_<page>" key that retrieved hits are compared against.
df_query = pd.concat([df_query, df_query2, df_query3], ignore_index=True)
df_query = df_query.assign(
    pdf_page=lambda frame: frame.apply(lambda row: f"{row['pdf']}_{row['page']}", axis=1)
)
df_query

Unnamed: 0,query,pdf,page,modality,pdf_page
0,How much was the ARtillery Intelligence projec...,1102434,19,text,1102434_19
1,How much revenue of AR advertising is expected...,1102434,3,text,1102434_3
2,What types of statistics were utilized by Rein...,1096078,3,text,1096078_3
3,What was the maximum amount requested for cond...,1054125,1,text,1054125_1
4,What is the median household income for the Ci...,1246906,7,text,1246906_7
...,...,...,...,...,...
986,"After the 2008 recession, what percentage of p...",2384395,6,chart,2384395_6
987,what were the top 3 major religious groups in ...,2392676,5,chart,2392676_5
988,What percentage of people in the world identif...,2392676,5,chart,2392676_5
989,"Between 2003 and 2019, has the household mortg...",2410699,189,chart,2410699_189


In [61]:
# Recall@k of the "multimodal" collection on the combined text + table + chart queries built above.
get_recall_scores(df_query, "multimodal")

  - Recall @1: 0.554
  - Recall @3: 0.746
  - Recall @5: 0.807
  - Recall @10: 0.857
