In [1]:
import random
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
from config import OPENAI_API_KEY, LLAMAPARSE_API_KEY

# Enable nested async loops
import nest_asyncio
nest_asyncio.apply()


In [None]:
# Configure LlamaParse; result_type may be "markdown" or "text".
parser = LlamaParse(api_key=LLAMAPARSE_API_KEY, result_type="markdown")

# Hand every .pdf given to SimpleDirectoryReader over to the LlamaParse parser.
file_extractor = {".pdf": parser}
reader = SimpleDirectoryReader(
    input_files=['./data/3M_2018_10K.pdf'],
    file_extractor=file_extractor,
)
documents = reader.load_data()
print(len(documents))

Started parsing the file under job_id ebcb069c-85ab-432f-80d5-79b287b0d949
...........

In [None]:
#split into chunks (by tokens)
import os
import sys
import json
from helper import file_get_contents, file_put_contents, generate_contextual_keywords, get_llm_answer, generate_questions_bychunk
from llama_index.core.schema import Document
import tiktoken
# Module-level tokenizer: split_into_chunks uses it to encode text into tokens
# and to decode each token slice back into text.
enc = tiktoken.get_encoding("o200k_base")

def split_into_chunks(content, chunk_size):
    """Split text into consecutive pieces of at most ``chunk_size`` tokens.

    The text is tokenized with the module-level ``enc`` tokenizer, the token
    list is cut into back-to-back windows of ``chunk_size`` tokens (the last
    window may be shorter), and each window is decoded back into a string.

    Args:
        content: Text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.

    Returns:
        List of decoded chunk strings in their original order; empty when
        ``content`` yields no tokens.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # Validate before tokenizing: with a non-positive size the window start
    # never moves past the end of the token list, so the old loop never ended.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    tokens = enc.encode(content)
    return [
        enc.decode(tokens[start : start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]
    
def generate_chunked_content(chunks):
    """Render chunks as one string of numbered "### Chunk N ###" sections.

    Each chunk becomes a header line carrying its 1-based position, followed
    by the chunk text and a blank line. An empty input yields an empty string.
    """
    sections = [
        f"### Chunk {number} ###\n{body}\n\n"
        for number, body in enumerate(chunks, start=1)
    ]
    return "".join(sections)
    

# Generate contextual keywords, cached in ./temp/chunks2.json.
if os.path.exists("./temp/chunks2.json"):
    # Cache hit: entries carry "keywords" and "content".
    chunks2 = json.loads(file_get_contents("./temp/chunks2.json"))
else:
    document_content, chunks2 = "", []
    for i, doc in enumerate(documents):
        document_content += doc.text + "\n"
        is_last_page = i == len(documents) - 1
        if len(document_content) <= 15000 and not is_last_page:
            # Keep accumulating pages until the batch exceeds 15000 characters.
            continue
        chunks = split_into_chunks(document_content, 400)
        chunked_content = generate_chunked_content(chunks)
        keywords = generate_contextual_keywords(chunked_content)
        document_content = ""
        print("page_end:", i+1, keywords, len(keywords), len(chunks))
        # One keyword entry is expected per chunk.
        assert len(keywords) == len(chunks)
        for j, chunk_text in enumerate(chunks):
            chunks2.append({"keywords": keywords[j], "content": chunk_text})
        # NOTE(review): the loop stops after the first batch, so later pages
        # are never chunked -- confirm this limit is intentional.
        break
    file_put_contents("./temp/chunks2.json", json.dumps(chunks2))


# Generate questions per chunk, cached in ./temp/chunks3.json.
path = "./temp/chunks3.json"
if os.path.exists(path):
    # Cache hit: per the original note, entries have content, keywords, questions, idx.
    chunks3 = json.loads(file_get_contents(path))
else:
    chunks3 = generate_questions_bychunk(chunks2)
    file_put_contents(path, json.dumps(chunks3))


In [None]:
# Create the vector index, or reload it from the on-disk cache in INDEX_DIR.
from llama_index.core import GPTVectorStoreIndex, StorageContext, load_index_from_storage
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
INDEX_DIR = "./temp/local_index_cache"
if not os.path.exists(INDEX_DIR):
    print("Creating new index ...")
    # One Document per chunk: a "#<keywords>" header line followed by the
    # chunk text. (The Document(...) call was previously left unclosed, which
    # made this cell a syntax error.)
    documents2 = [
        Document(text="#" + x["keywords"] + "\n" + x["content"])
        for x in chunks2
    ]
    index = GPTVectorStoreIndex.from_documents(documents2)
    # Persist so later runs take the reload branch instead of rebuilding.
    index.storage_context.persist(persist_dir=INDEX_DIR)
else:
    storage_context = StorageContext.from_defaults(persist_dir=INDEX_DIR)
    index = load_index_from_storage(storage_context)
query_engine = index.as_query_engine()

# Run queries from the financebench_open_source.jsonl test set.
lines = file_get_contents('./data/financebench_open_source.jsonl').split('\n')
# Parse every non-empty line, then keep only the records for this document.
tests = list(map(json.loads, filter(None, lines)))
tests = [record for record in tests if record["doc_name"]=="3M_2018_10K"]
print(len(tests))
for test in tests[:1]:
    question = test["question"]
    answer = test["answer"]
    response = query_engine.query(question)
    print("\n\n--- Test:", question, "\nA:", answer)
    # Concatenate the text of up to five retrieved source nodes into the prompt.
    prompt = "".join(
        f"\n\n<Document>\n{result.node.text}"
        for result in response.source_nodes[:5]
    )
    gen_answer = get_llm_answer(prompt, question)
    print("Generated_answer:", gen_answer)