In [None]:

import os
import re

import llama_index
from bs4 import BeautifulSoup
from llama_index.core import Settings
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.response.pprint_utils import pprint_response
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI


 

In [None]:
# Filing selection: the paths used further down are built as
# <edgar_fold>/<ticker>/<form_type>/...
edgar_fold = "sec-edgar-filings"
form_type = "10-K"  # dl.supported_forms
tickers = ["AAPL"]  # 'MSFT','JPM','ADSK','IBM'] # dl.ticker_to_cik_mapping
after_date = "2020-01-01"

In [None]:
# Chunk size / overlap handed to the sentence splitter configured below.
NODE_PARSER_CHUNK_OVERLAP = 10
NODE_PARSER_CHUNK_SIZE = 512

# Values substituted into the {doc_titles} and {curr_date} placeholders of the prompt.
curr_date = '2024-05-01'
doc_titles = 'AAPL-10-K-2020'

# System prompt template. It is rendered with str.format, so it must contain
# no braces other than the two placeholders.
SYSTEM_MESSAGE = """
You are an expert financial analyst that always answers questions with the most relevant information using the tools at your disposal.
These tools have information regarding companies that the user has expressed interest in.
Here are some guidelines that you must follow:
* For financial questions, you must use the tools to find the answer and then write a response.
* Even if it seems like your tools won't be able to answer the question, you must still use them to find the most relevant information and insights. Not using them will appear as if you are not doing your job.
* You may assume that the users financial questions are related to the documents they've selected.
* For any user message that isn't related to financial analysis, respectfully decline to respond and suggest that the user ask a relevant question.
* If your tools are unable to find an answer, you should say that you haven't found an answer but still relay any useful information the tools found.

The tools at your disposal have access to the following SEC documents that the user has selected to discuss with you:
{doc_titles}

The current date is: {curr_date}
""".strip()

# Chat model, with the system prompt rendered from the template and the
# doc_titles / curr_date values defined above.
llm_base = OpenAI(
    model="gpt-3.5-turbo-0125",
    temperature=0,
    system_prompt=SYSTEM_MESSAGE.format(doc_titles=doc_titles, curr_date=curr_date),
)

# Embedding model.
# NOTE(review): `temperature` is passed through unchanged from the original call;
# confirm the embedding class actually uses it.
embed_llm = OpenAIEmbedding(model="text-embedding-3-small", temperature=0)

# Sentence splitter built from the chunking constants. It is not registered on
# Settings (that assignment is commented out below).
node_parser = SentenceSplitter.from_defaults(
    chunk_overlap=NODE_PARSER_CHUNK_OVERLAP,
    chunk_size=NODE_PARSER_CHUNK_SIZE,
    # callback_manager=callback_manager,
)

# Register the models as the library-wide defaults.
Settings.llm = llm_base
Settings.embed_model = embed_llm
# Settings.node_parser = node_parser


In [None]:
# List the files available for the first ticker / form type.
# os.listdir returns entries in arbitrary order, so sort them to get the same
# display on every run.
sorted(os.listdir(os.path.join(edgar_fold, tickers[0], form_type)))

In [None]:
from bs4 import BeautifulSoup


In [None]:
# Read the raw filing text; the file is only held open for the read itself.
with open(os.path.join(edgar_fold,tickers[0],form_type, 'AAPL-10-K-2020.txt'), 'r') as f:
    contents = f.read()

# Tags that appear as "<TAG>value" on a line of their own with no closing tag.
# NOTE(review): assumes the listed tags always start at the beginning of a line
# in the filing text -- confirm against the file.
tags_to_close = ['TYPE']

# Start from the raw text and apply each tag's fix on top of the previous one,
# so that every listed tag is handled (and contents_2 exists even for an empty list).
contents_2 = contents
for tag in tags_to_close:
    # Append "</TAG>" right after the value on every "<TAG>value" line.
    # Lines that already contain another "<" (e.g. an existing closing tag)
    # do not match and are left untouched.
    contents_2 = re.sub(
        rf'^<{re.escape(tag)}>([^<\n]*)$',
        rf'<{tag}>\1</{tag}>',
        contents_2,
        flags=re.MULTILINE,
    )

# Now load the pre-processed content into BeautifulSoup
soup_2 = BeautifulSoup(contents_2, 'lxml')

In [None]:
# Preview the first 1000 characters of the parsed markup.
# Subscripting a BeautifulSoup object looks up a tag attribute, so slicing it
# directly raises; slice the string form instead.
str(soup_2)[:1000]

In [None]:
# Print the <type> tag of every <document> in the pre-processed soup.
# The documents are looked up from soup_2 here, so this cell does not depend on
# a `documents` variable that is only created further down the notebook.
for document in soup_2.find_all('document'):
    type_tag = document.find('type')  # None when there is no <type> child
    print(type_tag)

In [None]:
# Load the unmodified filing text and parse it with the lxml parser.
with open(
    os.path.join(edgar_fold, tickers[0], form_type, 'AAPL-10-K-2020.txt'),
    'r',
) as f:
    contents = f.read()
soup = BeautifulSoup(contents, 'lxml')

In [None]:
# Collect every <document> element from the parsed filing.
documents = soup.find_all(name='document')

In [None]:
# Print the <type> tag found in each parsed <document> element.
# After the loop, `document` and `type_tag` keep the values from the last
# iteration; the next cell reads `type_tag`.
for document in documents:
    type_tag = document.find('type')  # None when the document has no <type> child
    print(type_tag)
    # Disabled draft: report the filename and text of documents whose type is 10-K.
    # if type_tag and type_tag.get_text(strip=True).lower() == '10-k':
    #     print("Found a 10-K document:")
    #     filename = document.find('filename').get_text(strip=True) if document.find('filename') else "No filename"
    #     print("Filename:", filename)
    #     text_content = document.find('text').get_text(strip=True) if document.find('text') else "No text content"
    #     print("Content:", text_content)

In [None]:
# Report every <document> whose form type is 10-K.
# The documents are iterated explicitly here; checking a leftover `type_tag`
# from an earlier loop would only ever look at the last document.
for document in documents:
    type_tag = document.find('type')
    # Compare only the first text fragment inside the tag.
    # NOTE(review): presumably, when <TYPE> is not closed in the raw filing, the
    # parser nests the following content inside it, so the tag's full text would
    # never equal '10-k' -- confirm against the parsed output.
    form_label = next(type_tag.stripped_strings, '') if type_tag else ''
    if form_label.lower() == '10-k':
        print("Found a 10-K document:")

In [None]:
# Load every file under <edgar_fold>/<first ticker>/<form_type> through the
# directory reader.
AAPL_docs = SimpleDirectoryReader(
    input_dir=os.path.join(edgar_fold, tickers[0], form_type),
).load_data()


In [None]:
# Report how many items the reader returned. The list holds loaded document
# objects, so the count is labelled as documents rather than pages.
print(f'Loaded AAPL 10-K with {len(AAPL_docs)} document(s)')


In [None]:
# Exploration: list the attribute names available on the first loaded document.
dir(AAPL_docs[0])

In [None]:
# Preview the first 1000 characters of the first loaded document's text.
AAPL_docs[0].text[:1000]

In [None]:
# Split the loaded documents into nodes with the sentence splitter configured earlier.
nodes = node_parser.get_nodes_from_documents(documents=AAPL_docs)


In [None]:
# Character length of the first node's text (raises IndexError if no nodes were produced).
len(nodes[0].text)

In [None]:
# Total number of nodes produced by the splitter.
len(nodes)

In [None]:
# aapl_index = VectorStoreIndex.from_documents(AAPL_docs)


In [None]:
# Show a compact (file name, text length) summary per loaded document instead of
# echoing the whole list.
# NOTE(review): presumably the list repr includes each document's full text, and
# the reader stores the source file name under metadata['file_name'] -- confirm
# both; .get() yields None if the key is absent.
[(doc.metadata.get('file_name'), len(doc.text)) for doc in AAPL_docs]

In [None]:
# Exploration: show the class of the objects returned by the reader.
type(AAPL_docs[0])

In [None]:
# Display the first loaded document object.
# NOTE(review): this may echo the document's full text into the cell output --
# consider showing a truncated preview instead.
AAPL_docs[0]