In [None]:
#!pip install llama_parse
#!pip install llama_index

import random
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
from config import OPENAI_API_KEY, LLAMAPARSE_API_KEY

# Enable nested async loops
import nest_asyncio
nest_asyncio.apply()


In [2]:
# LlamaParse client; result_type may be "markdown" or "text".
parser = LlamaParse(api_key=LLAMAPARSE_API_KEY, result_type="markdown")

# Hand every .pdf to LlamaParse when SimpleDirectoryReader loads the input file.
file_extractor = {".pdf": parser}
reader = SimpleDirectoryReader(
    input_files=['./data/3M_2018_10K.pdf'],
    file_extractor=file_extractor,
)
documents = reader.load_data()
print("pdf file pages:", len(documents))

Started parsing the file under job_id 312718ff-1b5d-434d-aafe-1aacd9b0c553
pdf file pages: 160


In [3]:
#split into chunks (by tokens)
import os
import sys
import json
from helper import file_get_contents, file_put_contents, generate_contextual_keywords, get_llm_answer, generate_questions_bychunk
from llama_index.core.schema import Document
import tiktoken
# Module-level tokenizer used by split_into_chunks; loads tiktoken's "o200k_base" encoding.
enc = tiktoken.get_encoding("o200k_base")

def split_into_chunks(content, chunk_size):
    """Split ``content`` into consecutive pieces of at most ``chunk_size`` tokens.

    The text is tokenized with the module-level ``enc`` encoder, the token list
    is cut into back-to-back windows, and each window is decoded back to text.

    Args:
        content: Text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.

    Returns:
        List of decoded chunk strings in document order. Empty when ``content``
        encodes to no tokens.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative. Without this guard
            the window start never advances and the function loops forever.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    tokens = enc.encode(content)
    # Step through the token list in fixed-size windows; the last one may be shorter.
    return [
        enc.decode(tokens[start : start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]
    
def generate_chunked_content(chunks):
    """Render the chunks as one string of numbered sections.

    Each chunk becomes a ``### Chunk N ###`` header line (N counts from 1),
    followed by the chunk text and a blank line. An empty input gives ``""``.
    """
    sections = [
        f"### Chunk {number} ###\n{body}\n\n"
        for number, body in enumerate(chunks, start=1)
    ]
    return "".join(sections)
    

#generate contextual keywords
# Cache of keyword-annotated chunks. Delete this file to force regeneration;
# a cache written before "idx" became globally unique must be regenerated.
path = "./temp/chunks2.json"
CHUNK_TOKENS = 200        # tokens per chunk (400 -- default value)
KEYWORD_BATCH_SIZE = 11   # chunks sent to the keyword generator per call
MIN_KEYWORD_BATCH = 3     # a shorter trailing batch is merged into the previous one
if not os.path.exists(path):
    print("Generating keywords..")
    document_content = "".join(doc.text + "\n" for doc in documents)
    chunks1 = split_into_chunks(document_content, CHUNK_TOKENS)

    # Group the chunks into fixed-size batches. Previously a final batch of one
    # or two chunks was never flushed, silently dropping the end of the
    # document; it is now folded into the preceding batch instead.
    # NOTE(review): the minimum of 3 mirrors the old `len(chunks)>2` guard --
    # confirm generate_contextual_keywords really needs it.
    batches = [
        chunks1[start : start + KEYWORD_BATCH_SIZE]
        for start in range(0, len(chunks1), KEYWORD_BATCH_SIZE)
    ]
    if len(batches) > 1 and len(batches[-1]) < MIN_KEYWORD_BATCH:
        short_tail = batches.pop()
        batches[-1].extend(short_tail)

    chunks2 = []
    for chunks in batches:
        chunked_content = generate_chunked_content(chunks)
        keywords = generate_contextual_keywords(chunked_content)
        print("page_end:", len(chunks2) + len(chunks), keywords, len(keywords), len(chunks))
        assert len(keywords) >= len(chunks), "fewer keyword entries than chunks in this batch"
        for j, content in enumerate(chunks):
            # "idx" is the chunk's position in the whole document. It used to be
            # the position inside the batch, so ids repeated across batches.
            chunks2.append({"idx": len(chunks2), "keywords": keywords[j], "content": content})

    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    file_put_contents(path, json.dumps(chunks2))
else:
    chunks2 = json.loads(file_get_contents(path)) #it has content, keywords, idx


#generate questions
# Reuse the saved question-annotated chunks when the cache file exists;
# otherwise build them from chunks2 and save the result.
path = "./temp/chunks3.json"
if os.path.exists(path):
    chunks3 = json.loads(file_get_contents(path)) #it has content, keywords, questions, idx now
else:
    print("Generating questions..")
    chunks3 = generate_questions_bychunk(chunks2)
    file_put_contents(path, json.dumps(chunks3))


Generating keywords..
Keywords_st:
 Here are the keywords for each chunk:

### Chunk 1 ###
SEC Filings, 3M Company, Annual Report, Form 10-K, Securities Exchange Act of 1934

### Chunk 2 ###
3M Company, Securities Registered, New York Stock Exchange, SWX Swiss Exchange, Employer Identification Number

### Chunk 3 ###
SEC Filings, 3M Company, Reporting Requirements, Filing Status, Interactive Data Files

### Chunk 4 ###
3M Company, Filing Status, Large Accelerated Filer, Financial Accounting Standards, Shell Company

### Chunk 5 ###
3M Company, Market Value, Voting Stock, Shares Outstanding, Financial Data

### Chunk 6 ###
Form 10-K, Table of Contents, Item 1 Business, Risk Factors, Financial Condition

### Chunk 7 ###
3M Company, Financial Performance, Business Segments, Geographic Areas, Critical Accounting Estimates

### Chunk 8 ###
Financial Statements, Consolidated Statement of Income, Balance Sheet, Cash Flows

### Chunk 9 ###
Notes to Consolidated Financial Statements, Accounting

In [None]:
#create Index
from llama_index.core import GPTVectorStoreIndex, StorageContext, load_index_from_storage
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
# Persisted index location; delete this directory to force a rebuild from chunks3.
INDEX_DIR = "./temp/local_index_cache"
if not os.path.exists(INDEX_DIR):
    print("Creating new index ...")
    # One Document per chunk. The chunk's idx is stored as metadata "id" so a
    # retrieved node can be traced back to the chunk it came from.
    # Fixed: the text was read from a garbled key instead of "content",
    # which raised KeyError on every chunk.
    documents2 = [Document(text=x['content'], metadata={"id": str(x["idx"])}) for x in chunks3]
    index = GPTVectorStoreIndex.from_documents(documents2)
    index.storage_context.persist(persist_dir=INDEX_DIR)
else:
    storage_context = StorageContext.from_defaults(persist_dir=INDEX_DIR)
    index = load_index_from_storage(storage_context)
# Retrieve the 5 most similar nodes for each query.
query_engine = index.as_query_engine(similarity_top_k=5)

# run tests
# For every generated question, query the index and check whether the chunk the
# question belongs to is among the retrieved source nodes.
count, correct = 0, 0
for test in chunks3:
    if "questions" not in test: continue
    idx = test["idx"]
    for question in test["questions"]:
        count+=1
        response = query_engine.query(question)
        print("\n\n--- Test:", question, "idx:", idx)
        hit = False
        for result in response.source_nodes:
            print(result.node.metadata)
            if result.node.metadata['id'] == str(idx): hit = True
        # Count a question at most once, even when several retrieved nodes
        # carry the expected id, so `correct` can never exceed `count`.
        if hit: correct+=1

print("Test correct, all:", correct, count)