# Recall accuracy with NV-Ingest and LlamaIndex

### Tables

Flush the vector database first; otherwise, every document you've uploaded previously will still be there and could interfere with the results.

In [None]:
from pymilvus import MilvusClient

milvus_client = MilvusClient("http://localhost:19530")

# Drop the collection only when it is actually present, so this cell can be
# re-run safely (for example against a fresh Milvus instance or right after a
# previous flush) without depending on how the server treats a missing
# collection.
if milvus_client.has_collection('nv_ingest_collection'):
    milvus_client.drop_collection('nv_ingest_collection')

Run NV-Ingest on the bo767 dataset to extract tables, then generate and store their embeddings.

In [None]:
# Run nv-ingest-cli over the PDFs matching /datasets/bo767/*.pdf with only
# table extraction switched on (text, charts and images are set to "false"),
# request embeddings for the extracted tables, and add a vdb_upload task.
# Output goes to ./processed_docs/bo767_10_14/; the client connects to
# localhost:7670 (presumably the NV-Ingest service endpoint).
nv-ingest-cli \
  --doc=/datasets/bo767/*.pdf \
  --output_directory=./processed_docs/bo767_10_14/ \
  --task='extract:{"document_type": "pdf", "extract_method": "pdfium", "extract_text": "false", "extract_tables": "true", "extract_charts": "false", "extract_images": "false"}' \
  --task='embed:{"text": "false", "tables": "true"}' \
  --task='vdb_upload' \
  --client_host=localhost \
  --client_port=7670

In [None]:
import os
import logging
import time
import json
from tqdm import tqdm
from collections import defaultdict
import numpy as np
import pandas as pd

from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.embeddings.nvidia import NVIDIAEmbedding

# Prefer an NVIDIA API key that is already exported in the environment, and
# only prompt (without echoing the input) when none is set. This keeps the key
# out of the notebook source and avoids overwriting a valid exported key with a
# placeholder.
if not os.environ.get('NVIDIA_API_KEY'):
    from getpass import getpass  # local import: only needed on this path

    os.environ['NVIDIA_API_KEY'] = getpass('Enter your NVIDIA API key: ')


Load the test queries together with the PDF and page on which each answer is expected to be found.

In [None]:
# Load the table queries, keep only the columns used below, and add a
# "<pdf>_<page>" key column that identifies the expected page for each query.
df_query = (
    pd.read_csv('table_queries_cleaned_235.csv')[['query', 'pdf', 'page', 'table']]
    .assign(
        pdf_page=lambda frame: [
            f"{pdf}_{page}" for pdf, page in zip(frame['pdf'], frame['page'])
        ]
    )
)
df_query

Connect LlamaIndex to our Milvus microservice and create a retriever.

In [None]:
# Embedding model handed to the index below.
embed_model = NVIDIAEmbedding(model="NV-Embed-QA")

# Vector store bound to the "nv_ingest_collection" collection on the local
# Milvus endpoint. The "source" and "content_metadata" fields are requested as
# output fields; the recall computation reads them from each result's metadata.
vector_store = MilvusVectorStore(
    collection_name="nv_ingest_collection",
    uri="http://localhost:19530",
    dim=1024,
    text_key="text",
    embedding_field="vector",
    doc_id_field="pk",
    output_fields=["source", "content_metadata"],
    overwrite=False,
)

# Build the index on top of the vector store, then a retriever that asks for
# the 10 most similar entries per query.
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=embed_model,
)
retriever = index.as_retriever(similarity_top_k=10)

Use the retriever to calculate recall scores

In [None]:
# Recall@k bookkeeping: hits[k] collects one boolean per query, True when the
# expected "<pdf>_<page>" key appears among the first k retrieved results.
hits = defaultdict(list)

# Iterate over the column values directly so the loop does not rely on the
# DataFrame having a default 0..n-1 index (label lookup with a positional
# counter breaks after filtering or re-indexing).
for query, expected_answer in tqdm(
    zip(df_query['query'], df_query['pdf_page']), total=len(df_query)
):
    retrieved_answers = []
    for node in retriever.retrieve(query):
        # Serialize each retrieved node once and reuse the parsed metadata for
        # both the source file name and the page number.
        metadata = json.loads(node.json())["node"]["metadata"]
        pdf_name = os.path.basename(metadata["source"]["source_id"]).split('.')[0]
        page_number = metadata["content_metadata"]["page_number"]
        retrieved_answers.append(f"{pdf_name}_{page_number}")

    for k in [1, 3, 5, 10]:
        hits[k].append(expected_answer in retrieved_answers[:k])

# Report the fraction of queries whose expected page was found within the top k.
for k in hits:
    print(f'  - Recall @{k}: {np.mean(hits[k]) :.3f}')

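### Charts

Flush the vector database again before ingesting charts (re-run the `drop_collection` cell from the Tables section); otherwise the table embeddings uploaded above remain in the collection and could interfere with the chart results.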

In [None]:
# Run nv-ingest-cli over the PDFs matching /datasets/bo767/*.pdf with only
# chart extraction switched on (text, tables and images are set to "false"),
# then add embed and vdb_upload tasks. Output goes to
# ./processed_docs/bo767_10_14/; the client connects to localhost:7670
# (presumably the NV-Ingest service endpoint).
# NOTE(review): the embed task enables "tables" although only charts are
# extracted here. Presumably chart content is embedded under that flag; confirm.
nv-ingest-cli \
  --doc=/datasets/bo767/*.pdf \
  --output_directory=./processed_docs/bo767_10_14/ \
  --task='extract:{"document_type": "pdf", "extract_method": "pdfium", "extract_text": "false", "extract_tables": "false", "extract_charts": "true", "extract_images": "false"}' \
  --task='embed:{"text": "false", "tables": "true"}' \
  --task='vdb_upload' \
  --client_host=localhost \
  --client_port=7670

In [None]:
# Load the chart queries, shift the page numbers, and add a "<pdf>_<page>" key
# column that identifies the expected page for each query.
df_query = (
    pd.read_csv('charts_with_page_num_fixed.csv')[['query', 'pdf', 'page']]
    # Subtract 1 because the page number starts with 1 in that csv.
    .assign(page=lambda frame: frame['page'] - 1)
    .assign(
        pdf_page=lambda frame: [
            f"{pdf}_{page}" for pdf, page in zip(frame['pdf'], frame['page'])
        ]
    )
)
df_query

In [None]:
# Embedding model handed to the index below.
embed_model = NVIDIAEmbedding(model="NV-Embed-QA")

# Vector store bound to the "nv_ingest_collection" collection on the local
# Milvus endpoint, requesting the "source" and "content_metadata" fields that
# the recall computation reads from each result's metadata.
vector_store = MilvusVectorStore(
    uri="http://localhost:19530",
    collection_name="nv_ingest_collection",
    output_fields=["source", "content_metadata"],
    text_key="text",
    embedding_field="vector",
    doc_id_field="pk",
    dim=1024,
    overwrite=False,
)

# Index over the vector store, and a retriever asking for the 10 most similar
# entries per query.
index = VectorStoreIndex.from_vector_store(
    embed_model=embed_model,
    vector_store=vector_store,
)
retriever = index.as_retriever(similarity_top_k=10)

In [None]:
# Recall@k bookkeeping: hits[k] collects one boolean per query, True when the
# expected "<pdf>_<page>" key appears among the first k retrieved results.
hits = defaultdict(list)

# Iterate over the column values directly so the loop does not rely on the
# DataFrame having a default 0..n-1 index (label lookup with a positional
# counter breaks after filtering or re-indexing).
for query, expected_answer in tqdm(
    zip(df_query['query'], df_query['pdf_page']), total=len(df_query)
):
    retrieved_answers = []
    for node in retriever.retrieve(query):
        # Serialize each retrieved node once and reuse the parsed metadata for
        # both the source file name and the page number.
        metadata = json.loads(node.json())["node"]["metadata"]
        pdf_name = os.path.basename(metadata["source"]["source_id"]).split('.')[0]
        page_number = metadata["content_metadata"]["page_number"]
        retrieved_answers.append(f"{pdf_name}_{page_number}")

    for k in [1, 3, 5, 10]:
        hits[k].append(expected_answer in retrieved_answers[:k])

# Report the fraction of queries whose expected page was found within the top k.
for k in hits:
    print(f'  - Recall @{k}: {np.mean(hits[k]) :.3f}')