In [1]:
import random
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
from config import OPENAI_API_KEY, LLAMAPARSE_API_KEY

# Enable nested async loops
import nest_asyncio
nest_asyncio.apply()


In [2]:
# LlamaParse client; result_type may be "markdown" or "text".
parser = LlamaParse(api_key=LLAMAPARSE_API_KEY, result_type="markdown")

# Hand every .pdf to LlamaParse when SimpleDirectoryReader loads the file.
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(
    input_files=['./data/3M_2018_10K.pdf'],
    file_extractor=file_extractor,
).load_data()
print("pdf file pages:", len(documents))

Started parsing the file under job_id 56b65a50-1400-4e58-a76d-6780185a01fd
pdf file pages: 160


In [3]:
#split into chunks (by tokens)
import os
import sys
import json
from helper import file_get_contents, file_put_contents, generate_contextual_keywords, get_llm_answer, generate_questions_bychunk
from llama_index.core.schema import Document
import tiktoken
enc = tiktoken.get_encoding("o200k_base")

def split_into_chunks(content, chunk_size):
    """Split text into consecutive pieces of at most ``chunk_size`` tokens.

    The text is tokenized with the module-level ``enc`` encoder, the token
    list is cut into back-to-back windows of ``chunk_size`` tokens, and each
    window is decoded back into a string.

    Args:
        content: Text to split.
        chunk_size: Maximum number of tokens per chunk; must be at least 1.

    Returns:
        List of decoded chunk strings in their original order. The list is
        empty when ``content`` encodes to no tokens.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    # A zero or negative window never advances through the token list, which
    # previously meant an endless loop that kept appending chunks.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size!r}")
    tokens = enc.encode(content)
    return [
        enc.decode(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]
    
def generate_chunked_content(chunks):
    """Render chunks as one string of numbered ``### Chunk N ###`` sections.

    Every chunk contributes a header line (numbering starts at 1), the chunk
    text, and a trailing blank line. An empty input gives an empty string.
    """
    sections = (f"### Chunk {idx+1} ###\n{text}\n\n" for idx, text in enumerate(chunks))
    return "".join(sections)
    

#generate contextual keywords
# Builds chunks2: a list of {"keywords": ..., "content": ...} dicts, one per
# 400-token chunk of the parsed document. The result is cached as JSON at
# `path`, so the keyword generation only runs when that file is missing.
path = "./temp/chunks2.json"
if not os.path.exists(path):
    print("Generating keywords..")
    # Create the cache directory before the expensive loop so a missing
    # folder cannot lose the generated results at write time.
    # NOTE(review): file_put_contents may already do this - harmless if so.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    chunks, chunks2 = [], []
    document_content = "".join(doc.text + "\n" for doc in documents)
    chunks1 = split_into_chunks(document_content, 400)
    for i, chunk in enumerate(chunks1):
        chunks.append(chunk)
        is_last_chunk = i == len(chunks1) - 1
        # Flush once the batch holds more than 10 chunks, or at the end of the
        # document with whatever is left. The earlier condition
        # `len > 10 or (last and len > 2)` never flushed a final batch of one
        # or two chunks, so the tail of the document was silently dropped.
        if len(chunks) > 10 or is_last_chunk:
            chunked_content = generate_chunked_content(chunks)
            keywords = generate_contextual_keywords(chunked_content)
            print("page_end:", i+1, keywords, len(keywords), len(chunks))
            # One keywords entry is needed per chunk in the batch.
            assert len(keywords) >= len(chunks)
            chunks2.extend(
                {"keywords": keywords[j], "content": chunks[j]}
                for j in range(len(chunks))
            )
            chunks = []
    file_put_contents(path, json.dumps(chunks2))
else:
    chunks2 = json.loads(file_get_contents(path)) #it has content, keywords


#generate questions
# chunks3 is read from the JSON cache at `path` when it exists; otherwise it
# is produced from chunks2 by generate_questions_bychunk and then cached.
path = "./temp/chunks3.json"
if os.path.exists(path):
    # Cached entries are expected to hold content, keywords, questions, idx.
    chunks3 = json.loads(file_get_contents(path))
else:
    print("Generating questions..")
    chunks3 = generate_questions_bychunk(chunks2)
    file_put_contents(path, json.dumps(chunks3))


Generating questions..


In [10]:
#
for i, chunk in enumerate(chunks3):
    # Backfill a positional idx where it is missing; report entries whose
    # stored idx disagrees with their position in the list.
    if "idx" not in chunk: chunk["idx"] = i
    elif chunk["idx"]!=i: print(chunk["idx"], i, "incorrect indexing?")

#create Index
from llama_index.core import GPTVectorStoreIndex, StorageContext, load_index_from_storage
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
INDEX_DIR = "./temp/local_index_cache"
if not os.path.exists(INDEX_DIR):
    print("Creating new index ...")
    # Each chunk becomes one Document tagged with its idx (as a string) so a
    # retrieved node can be traced back to the chunk it came from.
    # Alternative text that prepends the keywords:
    #   "#"+", ".join(x["keywords"])+"\n"+x["content"]
    documents2 = [Document(text=x['content'], metadata={"id": str(x["idx"])}) for x in chunks3]
    index = GPTVectorStoreIndex.from_documents(documents2)
    index.storage_context.persist(persist_dir=INDEX_DIR)
else:
    # Reuse the index persisted by an earlier run.
    storage_context = StorageContext.from_defaults(persist_dir=INDEX_DIR)
    index = load_index_from_storage(storage_context)
# Still defined so anything that relies on query_engine keeps working.
query_engine = index.as_query_engine(similarity_top_k=5)
# The test below only inspects which nodes come back, so it uses a retriever
# with the same top-k instead of paying for an LLM-written answer per question.
retriever = index.as_retriever(similarity_top_k=5)

# run tests
# For every generated question, check whether the chunk it was generated from
# appears among the top-5 retrieved nodes.
count, correct = 0, 0
for test in chunks3:
    if "questions" not in test: continue
    idx = test["idx"]
    for question in test["questions"]:
        count += 1
        results = retriever.retrieve(question)
        print("\n\n--- Test:", question, "idx:", idx)
        for result in results:
            print(result.node.metadata)
        # Count at most one hit per question, even if several returned nodes
        # carry the same id, so `correct` can never exceed `count`.
        if any(result.node.metadata['id'] == str(idx) for result in results):
            correct += 1

print("Test correct, all:", correct, count)

Creating new index ...


--- Test: When was 3M Company incorporated? idx: 4
{'id': '305'}
{'id': '307'}
{'id': '0'}
{'id': '12'}
{'id': '237'}


--- Test: Where can the public obtain documents filed by 3M Company with the SEC? idx: 4
{'id': '4'}
{'id': '0'}
{'id': '1'}
{'id': '265'}
{'id': '273'}


--- Test: What is the ticker symbol of 3M Company? idx: 4
{'id': '0'}
{'id': '19'}
{'id': '305'}
{'id': '307'}
{'id': '4'}


--- Test: How many business segments does 3M manage its operations in? idx: 5
{'id': '5'}
{'id': '8'}
{'id': '37'}
{'id': '256'}
{'id': '260'}


--- Test: What is the approximate number of people employed by 3M as of December 31, 2018? idx: 5
{'id': '72'}
{'id': '54'}
{'id': '66'}
{'id': '68'}
{'id': '63'}


--- Test: What are some of the markets served by 3M's Industrial segment? idx: 5
{'id': '5'}
{'id': '6'}
{'id': '8'}
{'id': '7'}
{'id': '10'}


--- Test: What types of products does 3M's Health Care segment provide to medical clinics and hospitals? idx: 8
{'id': '8