# Evaluate earnings consulting retrieval recall accuracy with NV-Ingest and Milvus

In this notebook, we'll use NV-Ingest to measure the end-to-end recall accuracy of a retrieval pipeline made up of NV-Ingest's extraction and embedding tasks and a Milvus vector database (VDB).

In [89]:
from pymilvus import MilvusClient

# Client handle for the Milvus instance listening on localhost:19530.
milvus_client = MilvusClient(uri="http://localhost:19530")
# Uncomment to drop the 'earnings' collection before re-running the notebook:
# milvus_client.drop_collection(collection_name='earnings')

In [3]:
from nv_ingest_client.client import Ingestor

# Extraction switches handed to Ingestor.extract(): everything except images
# is enabled, and text_depth is set to "page".
extract_options = {
    "extract_text": True,
    "extract_tables": True,
    "extract_charts": True,
    "extract_infographics": True,
    "extract_images": False,
    "text_depth": "page",
}

# Build the job step by step: source files -> extraction -> embedding.
ingestor = Ingestor(message_client_hostname="localhost")
ingestor = ingestor.files("../data/earnings_consulting/*/*.pdf")
ingestor = ingestor.extract(**extract_options)
ingestor = ingestor.embed()

# Run the configured job; `results` is consumed by the cells below.
results = ingestor.ingest(show_progress=True)

In [5]:
# Number of entries returned by ingestor.ingest()
# (presumably one per ingested PDF — TODO confirm).
len(results)

514

In [4]:
# Optional: save results
import pickle

with open('earnings_results.obj', 'wb') as f:
    pickle.dump(results, f)

In [13]:
# Optional: load results
import pickle

with open('earnings_results.obj', 'rb') as f:
    results = pickle.load(f)

Filter out elements that have no embedding

In [14]:
# Keep only the elements that carry an embedding; the outer list keeps one
# (possibly empty) inner list per original entry.
embedded_results = []
for document_elements in results:
    kept_elements = []
    for element in document_elements:
        if element['metadata']['embedding'] is None:
            continue
        kept_elements.append(element)
    embedded_results.append(kept_elements)
results = embedded_results

Filter out duplicate PDFs

In [15]:
# File names (last path component of source_id) that are treated as duplicate
# PDFs and excluded from the collection. Built once instead of per iteration.
duplicate_pdf_names = {
    'overview-evidence-review-of-data-and-information_1.pdf',
    'corporateventuresinsweden-190228155032_1.pdf',
}

# The source file of a document is read from its first element. A document
# whose elements were all removed by the embedding filter is an empty list, so
# `result[0]` would raise IndexError; such documents have nothing to index and
# are dropped here as well.
results = [
    result
    for result in results
    if result
    and result[0]['metadata']['source_metadata']['source_id'].split('/')[-1] not in duplicate_pdf_names
]

In [10]:
from nv_ingest_client.util.milvus import write_to_nvingest_collection, create_nvingest_collection, nvingest_retrieval

# Settings shared by the collection-creation, write and recall cells.
milvus_hostname = "localhost"
sparse = False

# Create the "earnings" collection on the Milvus instance at port 19530.
create_nvingest_collection(
    "earnings",
    "http://" + milvus_hostname + ":19530",
    sparse=sparse,
)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /raid/cjarrett/miniconda3/envs/nv-ingest-
[nltk_data]     dev-4/lib/python3.10/site-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Write the filtered ingest results into the "earnings" collection.
write_to_nvingest_collection(
    results,
    "earnings",
    milvus_uri="http://" + milvus_hostname + ":19530",
    minio_endpoint="localhost:9000",
    sparse=sparse,
)

In [1]:
import pandas as pd

# Load the evaluation queries and derive two helper columns:
#   pdf_page - "<pdf>_<page>" key built from the `pdf` and `page` columns
#   query_id - running integer id, 0..len-1
df_query = pd.read_csv('../data/earnings_consulting_multimodal.csv')
df_query = df_query.assign(
    pdf_page=[f"{pdf}_{page}" for pdf, page in zip(df_query['pdf'], df_query['page'])],
    query_id=range(len(df_query)),
)
df_query.head()

Unnamed: 0,dir,pdf,page,query,answer,modality,xywh,pdf_page,query_id
0,amazon_earnings_call,004e15aa-5d50-4fb3-9e2f-0ad36639778f,7,How many shares of Rivian’s Class A stock are ...,158 million,text,,004e15aa-5d50-4fb3-9e2f-0ad36639778f_7,0
1,amazon_earnings_call,004e15aa-5d50-4fb3-9e2f-0ad36639778f,9,What is fair value?,the price that would be received to sell an as...,text,,004e15aa-5d50-4fb3-9e2f-0ad36639778f_9,1
2,amazon_earnings_call,004e15aa-5d50-4fb3-9e2f-0ad36639778f,33,What are the the key business and industry cha...,"we have many competitors across geographies, i...",text,,004e15aa-5d50-4fb3-9e2f-0ad36639778f_33,2
3,amazon_earnings_call,06c52e44-38c4-42b0-90c4-14916435e83e,22,What was the percentage increase in Amazon's s...,15%,text,,06c52e44-38c4-42b0-90c4-14916435e83e_22,3
4,amazon_earnings_call,06c52e44-38c4-42b0-90c4-14916435e83e,30,What risk do we face with foreign exchange rat...,"foreign exchange rates vary, net sales and oth...",text,,06c52e44-38c4-42b0-90c4-14916435e83e_30,4


In [2]:
# Number of evaluation queries per modality.
df_query['modality'].value_counts()

modality
text           242
table          157
chart          129
infographic    100
Name: count, dtype: int64

In [3]:
from recall_utils import get_recall_scores

# Compute recall for every query in df_query against the "earnings" collection;
# the call prints Recall @1/@5/@10 and returns a per-query frame (see outputs below).
# NOTE(review): this rebinds `results`, which earlier in the notebook held the
# ingest output — consider a distinct name.
results = get_recall_scores(df_query, "earnings", sparse)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /raid/cjarrett/miniconda3/envs/nv-ingest-
[nltk_data]     dev-4/lib/python3.10/site-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Package punkt_tab is already up-to-date!


  - Recall @1: 0.387
  - Recall @5: 0.629
  - Recall @10: 0.734


In [4]:
# Display the per-query frame returned by get_recall_scores.
results

Unnamed: 0,query_id,query_text,expected_pdf_page,expected_pdf_text,retrieved_pdf_pages,retrieved_pdf_texts
0,0,How many shares of Rivian’s Class A stock are ...,004e15aa-5d50-4fb3-9e2f-0ad36639778f_7,158 million,"[c7c14359-36fa-40c3-b3ca-5bf7f3fa0b96_45, d2fd...",[Table of Contents\r\nIncluded in other income...
1,1,What is fair value?,004e15aa-5d50-4fb3-9e2f-0ad36639778f_9,the price that would be received to sell an as...,"[_10-Q-Q1-2021-As-Filed_13, 2022-alphabet-annu...",[Fair Value Hedges\r\nFair value hedge gains a...
2,2,What are the the key business and industry cha...,004e15aa-5d50-4fb3-9e2f-0ad36639778f_33,"we have many competitors across geographies, i...","[64aef25f-17ea-4c46-985a-814cb89f6182_41, 004e...",[Table of Contents\r\n• fluctuations in the st...
3,3,What was the percentage increase in Amazon's s...,06c52e44-38c4-42b0-90c4-14916435e83e_22,15%,"[Q3-2021-Earnings-Release_0, Q3-2022-Amazon-Ea...",[AMAZON.COM ANNOUNCES THIRD QUARTER RESULTS\r\...
4,4,What risk do we face with foreign exchange rat...,06c52e44-38c4-42b0-90c4-14916435e83e_30,"foreign exchange rates vary, net sales and oth...","[20201030-alphabet-10q_51, 56f9e4fc-22e2-46e8-...",[Foreign Currency Exchange Risk\r\nWe transact...
...,...,...,...,...,...,...
623,623,"Per Nvidia's Oct 2023 investor presentation, w...",ndr_presentation_oct_2023_final_13,$2.5B,"[NVDA-Company-Overview-2024-02-21_14, nvda-f3q...",[The High ROI of High Compute Performance\r\nR...
624,624,How did NVIDIA's diluted EPS change in Q1 FY24...,nvda-f1q24-investor-presentation-final_5,28%,"[nvda-f3q24-investor-presentation-final_5, nvd...",[Q3 FY24 Financial Summary Q3 FY23 - Q4 FY23...
625,625,What chips has NVIDIA produced?,nvda-f2q24-investor-presentation-final_31,GPU CPU DPU,"[NVDA-F2Q23-Investor-Presentation-FINAL-1_23, ...",[NVIDIA At a Glance\r\nAccelerated computing p...
626,626,What was NVIDIA's year-over-year percentage in...,nvda-f3q24-investor-presentation-final_9,4%,"[nvda-f3q24-investor-presentation-final_9, nvd...",[$251\r\n$294 $296\r\n$253\r\n$261\r\nQ3 FY23 ...


In [86]:
# Report recall separately for each modality present in the query set.
for modality in df_query.modality.unique():
    print(modality)
    # Select rows with a boolean mask instead of an f-string-built .query()
    # expression, which would break on a modality value containing a quote.
    modality_queries = df_query[df_query.modality == modality].reset_index(drop=True)
    get_recall_scores(modality_queries, "earnings", sparse)

text
  - Recall @1: 0.198
  - Recall @5: 0.521
  - Recall @10: 0.682
table
  - Recall @1: 0.248
  - Recall @5: 0.465
  - Recall @10: 0.573
chart
  - Recall @1: 0.612
  - Recall @5: 0.806
  - Recall @10: 0.860
infographic
  - Recall @1: 0.760
  - Recall @5: 0.910
  - Recall @10: 0.940
