In [1]:
from datasets import load_dataset
from dotenv import load_dotenv
from tqdm import tqdm
import os
import random
import json
import pandas as pd
import hashlib
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor

from datasets import load_dataset

# Load variables from a local .env file (if one exists) into the process
# environment before anything reads OPENAI_API_KEY. By default load_dotenv()
# does not override variables that are already set, so an exported key wins.
load_dotenv()

# Financial Q&A records; each row has question / answer / context / ticker /
# filing fields (see the sample record displayed in the next cell).
ds = load_dataset("virattt/financial-qa-10K")

API_KEY = os.getenv("OPENAI_API_KEY")

# Passing api_key=None is equivalent to OpenAI(): the client then falls back to
# the OPENAI_API_KEY environment variable and raises if it is missing.
client = OpenAI(api_key=API_KEY)

In [2]:
# Peek at one raw training record (index 20) to see which fields are available.
ds['train'][20]

{'question': 'How does the company support multi-billion-dollar end markets with their technology?',
 'answer': 'The company supports multi-billion-dollar end markets by using a variety of software stacks developed either internally or by third-party developers and partners, utilizing a shared underlying technology across all these markets.',
 'context': 'investments in research and development: we can support several multi-billion-dollar end markets with shared underlying technology by using a variety of software stacks developed either internally or by third-party developers and partners. We utilize this platform approach in each of our target markets.',
 'ticker': 'NVDA',
 'filing': '2023_10K'}

In [3]:
def generate_document_id(doc):
    """Derive a short, deterministic identifier for a Q&A record.

    The id is the first 8 hex characters of the MD5 digest of the question
    joined by a hyphen to the first 10 characters of the context. Records that
    share a question and the same 10-character context prefix therefore get
    the same id.

    Args:
        doc: Mapping that provides 'question' and 'context' entries.

    Returns:
        An 8-character lowercase hexadecimal string.
    """
    question = doc['question']
    context_prefix = doc['context'][:10]
    fingerprint_source = "{}-{}".format(question, context_prefix)
    digest = hashlib.md5(fingerprint_source.encode()).hexdigest()
    return digest[:8]

In [4]:
# Reshape every training row into the record layout used by the rest of the
# notebook. Note that the dataset's *answer* text is what gets stored under
# 'context', and the ticker symbol becomes 'company_id'.
documents = [
    {
        'question': row['question'],
        'context': row['answer'],
        'company_id': row['ticker'],
        'doc_id': generate_document_id(row),
    }
    for row in ds['train']
]

In [5]:
# Spot-check one reshaped record (index 20), including its generated doc_id.
documents[20]

{'question': 'How does the company support multi-billion-dollar end markets with their technology?',
 'context': 'The company supports multi-billion-dollar end markets by using a variety of software stacks developed either internally or by third-party developers and partners, utilizing a shared underlying technology across all these markets.',
 'company_id': 'NVDA',
 'doc_id': '3a0e16eb'}

In [6]:
# Persist the id-tagged documents as pretty-printed JSON. Serialising first
# means the file is only opened once the full payload is ready.
serialized_documents = json.dumps(documents, indent=2)

with open('documents-with-ids.json', 'wt') as f_out:
    f_out.write(serialized_documents)

## GENERATING GROUND TRUTH DATA

In [7]:
# Prompt sent to the chat model once per document. It is filled in with
# str.format(**doc), so {question} and {company_id} are the only placeholders
# used; any literal curly brace added to this text would have to be doubled
# ({{ }}) to survive formatting. The model is asked to answer with a bare JSON
# array of three strings, which a later cell parses with json.loads.
prompt_template = """
You emulate someone who is asking financial questions. 
Based on the following original question and the specified company financial quote, generate 3 different but similar questions that a user might ask about the same topic.
Questions generated should contain company original name not quote.

ORIGINAL QUESTION: {question} 
COMPANY QUOTE: {company_id} 

Provide the output in parsable JSON format without using code blocks. Below is the right output format:

["question1", "question2", "question3"]
""".strip()

In [8]:
# Shared worker pool used for the concurrent API calls below.
pool = ThreadPoolExecutor(max_workers=4)


def map_progress(pool, seq, f):
    """Apply ``f`` to every element of ``seq`` on ``pool`` with a progress bar.

    All tasks are submitted up front; the bar advances by one each time a task
    finishes. Results are gathered in submission order, so the returned list
    lines up index-for-index with ``seq``.

    Args:
        pool: Executor exposing ``submit`` (a ThreadPoolExecutor here).
        seq: Sized sequence of inputs; ``len(seq)`` sets the bar's total.
        f: Callable taking one element of ``seq``.

    Returns:
        List of ``f(element)`` results in the same order as ``seq``.

    Raises:
        Whatever ``f`` raises: the first failing task (in submission order)
        re-raises from ``future.result()``.
    """
    with tqdm(total=len(seq)) as progress:

        def _advance(_finished):
            # The finished future itself is not needed, only the tick.
            progress.update()

        pending = []
        for item in seq:
            task = pool.submit(f, item)
            task.add_done_callback(_advance)
            pending.append(task)

        # Block on each task in submission order to keep results aligned.
        return [task.result() for task in pending]

In [9]:
def generate_questions(doc, model='gpt-3.5-turbo-1106'):
    """Ask the chat model for paraphrased questions about one document.

    Args:
        doc: Mapping used to fill ``prompt_template``; it must provide at
            least 'question' and 'company_id'. Extra keys are ignored by
            ``str.format``.
        model: Chat model name to query. Defaults to the snapshot this
            notebook was originally run with, so existing one-argument calls
            behave exactly as before.

    Returns:
        The raw message content of the first choice. The prompt asks for a
        JSON array, but the text is returned unparsed and is not guaranteed
        to be valid JSON.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [10]:
# Smoke-test the prompt on a single document; the result is the raw reply text.
generate_questions(documents[20])

'[\n  "How does NVIDIA support multi-billion-dollar end markets with their technology?",\n  "What strategies does NVIDIA use to cater to multi-billion-dollar end markets with their technology?",\n  "In what ways does NVIDIA\'s technology benefit the multi-billion-dollar end markets they serve?"\n]'

In [11]:
def process_record(rec):
    """Worker entry point for the thread pool: generate questions for ``rec``.

    Args:
        rec: One document record, passed straight to ``generate_questions``.

    Returns:
        The raw text returned by ``generate_questions``.
    """
    return generate_questions(rec)

In [12]:
# The full batch makes one paid API call per document (about 25 minutes for the
# 7000 documents in the recorded run), so the raw replies are cached on disk.
# Delete the cache file to force regeneration, e.g. after editing the prompt.
GROUND_TRUTH_CACHE = 'ground-truth-raw.json'

ground_truth = None

if os.path.exists(GROUND_TRUTH_CACHE):
    with open(GROUND_TRUTH_CACHE, 'rt') as f_in:
        ground_truth = json.load(f_in)

# Replies are matched to documents by position later on, so a cache whose
# length differs from the current document list cannot be trusted.
if ground_truth is None or len(ground_truth) != len(documents):
    ground_truth = map_progress(pool, documents, process_record)

    with open(GROUND_TRUTH_CACHE, 'wt') as f_out:
        json.dump(ground_truth, f_out)

100%|██████████| 7000/7000 [24:56<00:00,  4.68it/s]


In [13]:
# Raw reply for document 20: still an unparsed string at this point.
ground_truth[20]

'[\n  "How does NVIDIA support multi-billion-dollar end markets with their technology?",\n  "What strategies has NVIDIA implemented to cater to multi-billion-dollar end markets?",\n  "In what ways does NVIDIA\'s technology contribute to the support of multi-billion-dollar end markets?"\n]'

In [14]:
def _extract_questions(parsed):
    """Flatten one decoded model reply into a list of question strings.

    Shapes handled:
      * list of strings                  -> every string
      * list containing dicts            -> the first value of each dict
      * dict (e.g. {"question1": "..."}) -> every value
      * a value that is itself a list    -> the strings inside it
      * a bare string                    -> that single string

    Anything that is not a string after this flattening is dropped, so
    malformed replies never put non-text values into the results.

    Args:
        parsed: The object returned by ``json.loads`` for one reply.

    Returns:
        List of question strings; empty when nothing usable was found.
    """
    if isinstance(parsed, dict):
        candidates = list(parsed.values())
    elif isinstance(parsed, list):
        candidates = [
            # An empty dict has no first value; None is filtered out below.
            next(iter(item.values()), None) if isinstance(item, dict) else item
            for item in parsed
        ]
    else:
        candidates = [parsed]

    extracted = []
    for candidate in candidates:
        if isinstance(candidate, str):
            extracted.append(candidate)
        elif isinstance(candidate, list):
            # e.g. {"questions": ["...", "...", "..."]}
            extracted.extend(q for q in candidate if isinstance(q, str))
    return extracted


final_results = []

# Replies and documents are aligned by position: reply i belongs to document i.
for raw_response, source_doc in zip(ground_truth, documents):
    company_id = source_doc['company_id']
    doc_id = source_doc['doc_id']

    try:
        parsed_response = json.loads(raw_response)
    except (json.JSONDecodeError, TypeError) as e:
        # TypeError covers a reply that is None rather than text.
        print(f"{type(e).__name__} for document {doc_id}: {e}")
        continue

    questions_for_doc = _extract_questions(parsed_response)
    if not questions_for_doc:
        print(f"No usable questions for document {doc_id}")
        continue

    final_results.extend((q, company_id, doc_id) for q in questions_for_doc)

JSONDecodeError for document 7120441d: Invalid control character at: line 2 column 87 (char 88)
JSONDecodeError for document e84c4698: Invalid control character at: line 2 column 62 (char 63)
JSONDecodeError for document 5c34ee36: Expecting ',' delimiter: line 3 column 1 (char 97)
JSONDecodeError for document 1fcddfae: Invalid control character at: line 2 column 142 (char 143)
JSONDecodeError for document e8349985: Expecting ',' delimiter: line 3 column 1 (char 95)
JSONDecodeError for document d61916b9: Expecting ',' delimiter: line 3 column 2 (char 109)
JSONDecodeError for document 898f47e3: Invalid control character at: line 2 column 103 (char 104)
JSONDecodeError for document 275f3aab: Invalid control character at: line 2 column 103 (char 104)
JSONDecodeError for document 58a62b30: Expecting ',' delimiter: line 3 column 1 (char 112)
JSONDecodeError for document 0f2eb85b: Invalid control character at: line 2 column 68 (char 69)
JSONDecodeError for document b03e0570: Expecting ',' del

In [19]:
# Number of (question, company_id, doc_id) tuples collected from the replies.
len(final_results)

20522

In [15]:
# Preview the first five (question, company_id, doc_id) tuples.
final_results[0:5]

[("What is NVIDIA's primary focus before expanding into other computationally intensive fields?",
  'NVDA',
  '30179b8e'),
 ("Can you explain NVIDIA's initial area of focus before diversifying into other computationally intensive sectors?",
  'NVDA',
  '30179b8e'),
 ("Before branching out into other computationally intensive areas, what was NVIDIA's main area of focus?",
  'NVDA',
  '30179b8e'),
 ('What impact has NVDA seen from the recent applications of GPU-powered deep learning?',
  'NVDA',
  'bbc176db'),
 ('Can NVDA provide more information on the specific industries or fields that have implemented GPU-powered deep learning?',
  'NVDA',
  'bbc176db')]

In [17]:
# Tabulate the collected tuples and export them without the index column.
# The id column is written as 'document_id' (the document dicts call it 'doc_id').
ground_truth_columns = ['question', 'company_id', 'document_id']
df = pd.DataFrame(data=final_results, columns=ground_truth_columns)

df.to_csv(path_or_buf='ground-truth-data.csv', index=False)

In [18]:
# Show the first lines of the exported CSV (IPython shell escape; needs `head` on PATH).
!head ground-truth-data.csv

question,company_id,document_id
What is NVIDIA's primary focus before expanding into other computationally intensive fields?,NVDA,30179b8e
Can you explain NVIDIA's initial area of focus before diversifying into other computationally intensive sectors?,NVDA,30179b8e
"Before branching out into other computationally intensive areas, what was NVIDIA's main area of focus?",NVDA,30179b8e
What impact has NVDA seen from the recent applications of GPU-powered deep learning?,NVDA,bbc176db
Can NVDA provide more information on the specific industries or fields that have implemented GPU-powered deep learning?,NVDA,bbc176db
How do the recent applications of GPU-powered deep learning by NVDA contribute to their overall financial performance?,NVDA,bbc176db
What is NVIDIA's current revenue and growth rate?,NVDA,c560a6ad
Can you explain NVIDIA's debt-to-equity ratio and how it impacts the company?,NVDA,c560a6ad
What are NVIDIA's plans for future capital expenditures and how will it affect their financia