In [56]:
from pymilvus import MilvusClient

# Open a client against the Milvus server listening on localhost:19530.
milvus_client = MilvusClient(uri="http://localhost:19530")

# To start over with a clean collection, uncomment the next line:
# milvus_client.drop_collection(collection_name='earnings')

In [None]:
from nv_ingest_client.client import Ingestor

# Configure the ingestion job one step at a time: connect to the local
# message client, select the PDFs, choose what to extract, request embeddings.
# NOTE(review): this glob is relative to "data/", while the query CSV is read
# from "../data/" further down; confirm the intended working directory.
ingestor = Ingestor(message_client_hostname="localhost")
ingestor = ingestor.files("data/earnings_consulting/*/*.pdf")
ingestor = ingestor.extract(
    extract_text=True,
    extract_tables=True,
    extract_charts=True,
    extract_infographics=True,
    extract_images=True,
    text_depth="page",
)
ingestor = ingestor.embed()

# Run the job; `results` is consumed by the cells that follow.
results = ingestor.ingest()

In [13]:
# Number of top-level entries currently held in `results`.
len(results)

514

In [None]:
# Optional: save results
import pickle

# The context manager guarantees the file is flushed and closed, even if
# pickling fails part-way through.
with open('earnings_results.obj', 'wb') as filehandler:
    pickle.dump(results, filehandler)

In [9]:
# Optional: load results
import pickle

# NOTE: unpickling can execute arbitrary code; only load a file that was
# produced by the "save results" cell of this notebook.
# The context manager closes the file as soon as loading finishes.
with open('earnings_results.obj', 'rb') as filehandler:
    results = pickle.load(filehandler)

In [68]:
from nv_ingest_client.util.milvus import (
    create_nvingest_collection,
    nvingest_retrieval,
    write_to_nvingest_collection,
)

# `sparse` and `milvus_hostname` are shared settings: they are reused when
# writing to the collection and (as `hybrid`) when querying it.
sparse = False
milvus_hostname = "localhost"

# Create the "earnings" collection on the Milvus server.
create_nvingest_collection(
    "earnings",
    f"http://{milvus_hostname}:19530",
    sparse=sparse,
)

Filter out elements that have no embedding

In [48]:
# Keep only the elements that carry an embedding. A document whose elements
# were all removed is dropped entirely, so later cells can safely read
# `result[0]` without hitting an IndexError on an empty list.
results = [
    kept
    for kept in (
        [element for element in result if element['metadata'].get('embedding') is not None]
        for result in results
    )
    if kept
]

Filter out duplicate PDFs (identified by file name)

In [54]:
# File names of the PDFs to exclude as duplicates. Built once here rather
# than on every iteration of the filter below.
duplicate_pdfs = {
    'overview-evidence-review-of-data-and-information_1.pdf',
    'corporateventuresinsweden-190228155032_1.pdf',
}

# The source file name is read from the first element of each result.
# The leading `result and` skips empty results instead of raising IndexError.
results = [
    result
    for result in results
    if result
    and result[0]['metadata']['source_metadata']['source_id'].split('/')[-1] not in duplicate_pdfs
]

In [69]:
# Write the filtered results into the "earnings" collection.
write_to_nvingest_collection(
    results,
    "earnings",
    sparse=sparse,
    milvus_uri=f"http://{milvus_hostname}:19530",
    minio_endpoint="localhost:9000",
)

Wrote data to: [['1d95012b-e11a-410f-b6f6-32c07b8592ab/1.parquet']]
Start time: 2025-02-27 11:05:14
Imported row count: 17430
Bulk earnings upload took 47.1400249004364 s


In [101]:
from collections import defaultdict
from tqdm import tqdm
import os
import numpy as np

def get_recall_scores(query_df, collection_name):
    """Print recall@1, recall@5 and recall@10 for a set of queries.

    Every query in ``query_df["query"]`` is sent to ``nvingest_retrieval``
    (top 10 hits). A query counts as a hit at ``k`` when its expected
    ``"<pdf>_<page>"`` key (``query_df["pdf_page"]``) appears among the first
    ``k`` retrieved ``"<pdf>_<page>"`` keys.

    Args:
        query_df: DataFrame with a ``query`` column and a ``pdf_page`` column.
            Rows are read positionally, so any index is accepted.
        collection_name: Name of the collection to search.

    Returns:
        None. The scores are printed.

    Note:
        Uses the notebook-level ``sparse`` flag (passed as ``hybrid``) and the
        notebook-level ``nvingest_retrieval`` import.
    """
    hits = defaultdict(list)
    all_answers = nvingest_retrieval(
        query_df["query"].to_list(),
        collection_name,
        hybrid=sparse,
        embedding_endpoint="http://localhost:8012/v1",
        model_name="nvidia/llama-3.2-nv-embedqa-1b-v2",
        top_k=10,
    )

    # Read the expected keys by position. Label-based `query_df['pdf_page'][i]`
    # raises KeyError (or silently picks the wrong row) whenever the frame has
    # been filtered or otherwise does not carry a 0..n-1 index.
    expected_pdf_pages = query_df["pdf_page"].to_list()

    for i, expected_pdf_page in enumerate(expected_pdf_pages):
        retrieved_pdf_pages = []
        for answer in all_answers[i]:
            entity = answer['entity']
            # File name up to the first dot, e.g. ".../abc.pdf" -> "abc".
            pdf = os.path.basename(entity['source']['source_id']).split('.')[0]
            page = str(entity['content_metadata']['page_number'])
            retrieved_pdf_pages.append(f"{pdf}_{page}")

        for k in [1, 5, 10]:
            hits[k].append(expected_pdf_page in retrieved_pdf_pages[:k])

    for k in hits:
        print(f'  - Recall @{k}: {np.mean(hits[k]):.3f}')

In [71]:
import pandas as pd

df_query = pd.read_csv('../data/earnings_consulting_multimodal.csv')

# Build the "<pdf>_<page>" key that retrieval hits are compared against.
df_query['pdf_page'] = df_query['pdf'].astype(str) + "_" + df_query['page'].astype(str)
df_query.head()

Unnamed: 0,dir,pdf,page,query,answer,modality,xywh,pdf_page
0,amazon_earnings_call,004e15aa-5d50-4fb3-9e2f-0ad36639778f,7,How many shares of Rivian’s Class A stock are ...,158 million,text,,004e15aa-5d50-4fb3-9e2f-0ad36639778f_7
1,amazon_earnings_call,004e15aa-5d50-4fb3-9e2f-0ad36639778f,9,What is fair value?,the price that would be received to sell an as...,text,,004e15aa-5d50-4fb3-9e2f-0ad36639778f_9
2,amazon_earnings_call,004e15aa-5d50-4fb3-9e2f-0ad36639778f,33,What are the the key business and industry cha...,"we have many competitors across geographies, i...",text,,004e15aa-5d50-4fb3-9e2f-0ad36639778f_33
3,amazon_earnings_call,06c52e44-38c4-42b0-90c4-14916435e83e,22,What was the percentage increase in Amazon's s...,15%,text,,06c52e44-38c4-42b0-90c4-14916435e83e_22
4,amazon_earnings_call,06c52e44-38c4-42b0-90c4-14916435e83e,30,What risk do we face with foreign exchange rat...,"foreign exchange rates vary, net sales and oth...",text,,06c52e44-38c4-42b0-90c4-14916435e83e_30


In [72]:
# How many queries there are per modality.
df_query["modality"].value_counts()

modality
text           242
table          157
chart          129
infographic    100
Name: count, dtype: int64

In [73]:
# Recall over the full query set.
get_recall_scores(query_df=df_query, collection_name="earnings")

  - Recall @1: 0.390
  - Recall @5: 0.619
  - Recall @10: 0.736


In [103]:
# Recall broken down by query modality.
for modality in df_query.modality.unique():
    print(modality)
    # A boolean mask replaces the string-built `.query()` expression, which
    # would break on a modality value containing a quote character.
    modality_queries = df_query[df_query["modality"] == modality]
    get_recall_scores(modality_queries.reset_index(drop=True), "earnings")

text
  - Recall @1: 0.194
  - Recall @5: 0.517
  - Recall @10: 0.678
table
  - Recall @1: 0.255
  - Recall @5: 0.465
  - Recall @10: 0.592
chart
  - Recall @1: 0.628
  - Recall @5: 0.791
  - Recall @10: 0.868
infographic
  - Recall @1: 0.770
  - Recall @5: 0.890
  - Recall @10: 0.930
