# Recall accuracy with NV-Ingest and LlamaIndex

### Tables

Flush vector database otherwise every document you've uploaded previously will be there and could interfere with the results

from pymilvus import MilvusClient

milvus_client = MilvusClient("http://localhost:19530")
milvus_client.drop_collection('nv_ingest_collection')

In [1]:
from pymilvus import MilvusClient

milvus_client = MilvusClient("http://localhost:19530")
milvus_client.list_collections()
milvus_client.get_collection_stats(collection_name='nv_ingest_collection')
# milvus_client.describe_collection(collection_name='nv_ingest_collection')

{'row_count': 5665}

In [16]:
res

{'delete_count': 0, 'cost': 0}

In [124]:
milvus_client.query(collection_name='nv_ingest_collection', filter="text like \"river\"", limit=1)


data: [] 

In [119]:
milvus_client.query(collection_name='nv_ingest_collection', output_fields=["source"], limit=1)


data: ["{'source': {'access_level': 1, 'collection_id': '', 'date_created': '2016-05-05T19:09:58', 'last_modified': '2016-05-05T19:09:58', 'partition_id': -1, 'source_id': '/datasets/bo767/1210913.pdf', 'source_location': '', 'source_name': '/datasets/bo767/1210913.pdf', 'source_type': 'PDF', 'summary': ''}, 'pk': 453232080492956107}"] 

Run NV-Ingest on bo767 and get and store embeddings

In [72]:
lst = sorted(glob.glob('../processed_docs/bo767/structured/*.pdf.metadata.json'))
len(lst)

662

In [73]:
dfs = []
for f in tqdm(lst):
    df = pd.read_json(f)
    df['pdf'] = int(os.path.basename(f).split('.')[0])
    dfs.append(df)
dfs = pd.concat(dfs)[['pdf','metadata']]
dfs.shape

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 662/662 [00:21<00:00, 30.50it/s]


(32863, 2)

In [86]:
dfs['type'] = dfs.metadata.apply(lambda x:x['content_metadata']['subtype'])
dfs['page'] = dfs.metadata.apply(lambda x:x['content_metadata']['page_number'])
dfs['table_metadata'] = dfs.metadata.apply(lambda x:x['table_metadata'])
dfs['table_content'] = dfs.table_metadata.apply(lambda x:x['table_content'])

In [87]:
 dfs.type.value_counts()

type
table    27198
chart     5665
Name: count, dtype: int64

In [3]:
import os
import glob
import logging
import time
import json
from tqdm import tqdm
from collections import defaultdict
import numpy as np
import pandas as pd

from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.embeddings.nvidia import NVIDIAEmbedding

# TODO: Add your NVIDIA API key here
os.environ['NVIDIA_API_KEY'] = '<YOUR_NVIDIA_API_KEY>'


Get test queries with expected result pdf and page

In [2]:
df_query = pd.read_csv('table_queries_cleaned_235.csv')[['query','pdf','page','table']]
df_query['pdf_page'] = df_query.apply(lambda x: f"{x.pdf}_{x.page}", axis=1)
df_query

Unnamed: 0,query,pdf,page,table,pdf_page
0,How much did Pendleton County spend out of the...,1003421,2,1003421_2_0,1003421_2
1,How many units are occupied by single families...,1008059,6,1008059_6_1,1008059_6
2,"In the Klamath county, what is the total valua...",1008059,6,1008059_6_1,1008059_6
3,How much did Nalco pay GRIDCO for electricity ...,1011810,21,1011810_21_0,1011810_21
4,How much coal is used at Alumina refinery of N...,1011810,21,1011810_21_2,1011810_21
...,...,...,...,...,...
230,How much is the rental income from water plant...,2407280,30,2407280_30_0,2407280_30
231,In 2020 how much were the supplemental taxes f...,2415001,65,not detected,2415001_65
232,"As of 2020, what is the total of collections a...",2415001,65,not detected,2415001_65
233,What was the net gain from the operations of t...,2416020,84,2416020_84_0,2416020_84


Connect LlamaIndex to our milvus microservice and create a retriever

In [3]:
embed_model = NVIDIAEmbedding(model="nvidia/nv-embedqa-e5-v5")

vector_store = MilvusVectorStore(
    uri="http://localhost:19530",
    collection_name="nv_ingest_collection",
    doc_id_field="pk",
    embedding_field="vector",
    text_key="text",
    dim=1024,
    output_fields=["source", "content_metadata"],
    overwrite=False
)
index = VectorStoreIndex.from_vector_store(vector_store=vector_store, embed_model=embed_model)
retriever = index.as_retriever(similarity_top_k=10)

In [130]:
retriever._vector_store

MilvusVectorStore(stores_text=True, is_embedding_query=True, stores_node=True, uri='./milvus_llamaindex.db', token='', collection_name='nv_ingest_collection', dim=1024, embedding_field='vector', doc_id_field='pk', similarity_metric='IP', consistency_level='Session', overwrite=False, text_key='text', output_fields=['source', 'content_metadata'], index_config={}, search_config={}, collection_properties=None, batch_size=100, enable_sparse=False, sparse_embedding_field='sparse_embedding', sparse_embedding_function=None, hybrid_ranker='RRFRanker', hybrid_ranker_params={}, index_management=<IndexManagement.CREATE_IF_NOT_EXISTS: 'create_if_not_exists'>, scalar_field_names=None, scalar_field_types=None)

In [64]:
help(retriever.retrieve)

Help on method retrieve in module llama_index.core.base.base_retriever:

retrieve(str_or_query_bundle: Union[str, llama_index.core.schema.QueryBundle]) -> List[llama_index.core.schema.NodeWithScore] method of llama_index.core.indices.vector_store.retrievers.retriever.VectorIndexRetriever instance
    Retrieve nodes given query.
    
    Args:
        str_or_query_bundle (QueryType): Either a query string or
            a QueryBundle object.



Use the retriever to calculate recall scores

In [4]:
hits = defaultdict(list)

for i in tqdm(range(len(df_query))):
    query = df_query['query'][i]
    expected_pdf_page = df_query['pdf_page'][i]
    retrieved_answers = retriever.retrieve(query)
    retrieved_pdfs = [os.path.basename(json.loads(node.json())["node"]["metadata"]["source"]["source_id"]).split('.')[0] for node in retrieved_answers]
    retrieved_pages = [str(json.loads(node.json())["node"]["metadata"]["content_metadata"]["page_number"]) for node in retrieved_answers]
    retrieved_pdf_pages = [f"{pdf}_{page}" for pdf, page in zip(retrieved_pdfs, retrieved_pages)]

    for k in [1, 3, 5, 10]:
        hits[k].append(expected_pdf_page in retrieved_pdf_pages[:k])

for k in hits:
    print(f'  - Recall @{k}: {np.mean(hits[k]) :.3f}')

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 235/235 [01:49<00:00,  2.15it/s]

  - Recall @1: 0.447
  - Recall @3: 0.651
  - Recall @5: 0.736
  - Recall @10: 0.796





In [99]:

for i in range(len(df_query)):
    query = df_query['query'][i]
    retrieved_answers = retriever.retrieve(query)

    if len(retrieved_answers) < 10:
        print(len(retrieved_answers))


8
8
8
7
8
9
9
5
5
9
8
9
9
9
9
6
9
9
8
8


### Charts

In [None]:
nv-ingest-cli \
  --doc=/datasets/bo767/*.pdf \
  --output_directory=./processed_docs/bo767_10_14/ \
  --task='extract:{"document_type": "pdf", "extract_method": "pdfium", "extract_text": "false", "extract_tables": "false", "extract_charts": "true", "extract_images": "false"}' \
  --task='embed:{"text": "false", "tables": "true"}' \
  --task='vdb_upload' \
  --client_host=localhost \
  --client_port=7670

In [5]:
df_query = pd.read_csv('charts_with_page_num_fixed.csv')[['query','pdf','page']]
df_query['page'] = df_query['page']-1 # page -1 because the page number starts with 1 in that csv
df_query['pdf_page'] = df_query.apply(lambda x: f"{x.pdf}_{x.page}", axis=1) 
df_query

Unnamed: 0,query,pdf,page,pdf_page
0,What are the top three consumer complaint cate...,1009210,11,1009210_11
1,Which 3 categories did extremely well in terms...,1009210,11,1009210_11
2,What's the longest recent US recession?,1010876,0,1010876_0
3,Is the 12-Month default rate usually higher th...,1010876,0,1010876_0
4,Which allegation is submitted highest to RTAs ...,1014669,0,1014669_0
...,...,...,...,...
263,"After the 2008 recession, what percentage of p...",2384395,6,2384395_6
264,what were the top 3 major religious groups in ...,2392676,5,2392676_5
265,What percentage of people in the world identif...,2392676,5,2392676_5
266,"Between 2003 and 2019, has the household mortg...",2410699,189,2410699_189


In [6]:
embed_model = NVIDIAEmbedding(model="NV-Embed-QA")

vector_store = MilvusVectorStore(
    uri="http://localhost:19530",
    collection_name="nv_ingest_collection",
    doc_id_field="pk",
    embedding_field="vector",
    text_key="text",
    dim=1024,
    output_fields=["source", "content_metadata"],
    overwrite=False
)
index = VectorStoreIndex.from_vector_store(vector_store=vector_store, embed_model=embed_model)
retriever = index.as_retriever(similarity_top_k=10)

In [7]:
hits = defaultdict(list)

for i in tqdm(range(len(df_query))):
    query = df_query['query'][i]
    expected_answer = df_query['pdf_page'][i]
    retrieved_answers = [
        os.path.basename(json.loads(node.json())["node"]["metadata"]["source"]["source_id"]).split('.')[0] + "_" + str(json.loads(node.json())["node"]["metadata"]["content_metadata"]["page_number"]) for node in retriever.retrieve(query)
    ]

    for k in [1, 3, 5, 10]:
        hits[k].append(expected_answer in retrieved_answers[:k])

for k in hits:
    print(f'  - Recall @{k}: {np.mean(hits[k]) :.3f}')

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 268/268 [02:16<00:00,  1.96it/s]

  - Recall @1: 0.593
  - Recall @3: 0.713
  - Recall @5: 0.757
  - Recall @10: 0.810





In [104]:
retriever

10