In [7]:
import requests
from bs4 import BeautifulSoup

def get_html_text(url, headers):
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()
    else:
        return None

def save_text_to_file(text, filename):
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text)

def process_links(links, headers):
    for idx, link in enumerate(links, start=1):
        html_text = get_html_text(link, headers)
        if html_text:
            filename = f"{idx}_{link.split('/')[-1]}"  # Generate a unique filename
            save_text_to_file(html_text, filename)
            print(f"Text extracted from link {idx} saved to '{filename}'")
        else:
            print(f"Failed to extract text from link {idx}")

# List of links to process
links = [
    'https://www.sec.gov/Archives/edgar/data/320193/000119312520225672/d937363dex41.htm',
    'https://www.sec.gov/Archives/edgar/data/320193/000119312520225672/d937363dex41.htm',
    'https://www.sec.gov/Archives/edgar/data/320193/000119312520225672/d937363dex41.htm'
    # Add more links as needed
]

# Custom user agent
headers = {'User-Agent': 'Julia Susse'}

# Process the list of links
process_links(links, headers)


Text extracted from link 1 saved to '1_d937363dex41.htm'
Text extracted from link 2 saved to '2_d937363dex41.htm'
Text extracted from link 3 saved to '3_d937363dex41.htm'


In [22]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import BSHTMLLoader
import chromadb
from dotenv import load_dotenv
from os import getenv
import os

load_dotenv("../../../.env")
my_key = getenv("OPENAI_API_KEY")






In [23]:
html_files = os.listdir("data")
html_files = ["data/"+ file for file in html_files]
print(len(html_files))

3


In [24]:
documents = []
for file in html_files:
    loader = BSHTMLLoader(html_files[0])
    data = loader.load()
    documents += data

In [25]:
chroma_name = "./../../output/chroma_db"
persistent_client = chromadb.PersistentClient(chroma_name)
persistent_client.delete_collection("chroma.sqlite3")
db = persistent_client.get_or_create_collection("chroma.sqlite3")
print(len(db.get()["ids"]))

0


In [30]:
text_splitter = CharacterTextSplitter(chunk_size=4000, chunk_overlap=2000)


# Split documents into batches
batch_size = 100  # Set the batch size
batches = [documents[i:i+batch_size] for i in range(0, len(documents), batch_size)]

# Process batches synchronously
for batch_num, batch in enumerate(batches):
    print(f"Processing batch {batch_num}...")
    text_split_batch = text_splitter.split_documents(batch)
    # Initialize Chroma database and add documents to it
    db = Chroma.from_documents(text_split_batch, OpenAIEmbeddings(api_key=my_key), persist_directory=chroma_name)
    print(f"Batch {batch_num} processed.")

# print("All batches processed.")



Created a chunk of size 4006, which is longer than the specified 4000
Created a chunk of size 4006, which is longer than the specified 4000
Created a chunk of size 4006, which is longer than the specified 4000
Created a chunk of size 4006, which is longer than the specified 4000
Created a chunk of size 4006, which is longer than the specified 4000
Created a chunk of size 4006, which is longer than the specified 4000


Processing batch 0...
Batch 0 processed.


In [31]:
query = "apple"
docs = db.similarity_search(query)
docs

[Document(page_content='APPLE INC. \n2.400% Note due 2050  \xa0\n\n\n No.\n\xa0\xa0\nCUSIP No.: 037833 DZ0\n\n\n\xa0\xa0\nISIN No.: US037833DZ01\n\n\xa0\xa0\n$[●]\n APPLE INC., a California corporation (the “Issuer”), for value received promises to pay\nto CEDE\xa0& CO. or registered assigns the principal sum listed on the Schedule of Exchanges of Notes on August\xa020, 2050. \nInterest Payment Dates: February\xa020 and August\xa020, beginning on February\xa020, 2021 and on the maturity date (each, an\n“Interest Payment Date”).  Interest Record Dates: February\xa06 and August\xa06 (each, an “Interest\nRecord Date”).  Reference is made to the further provisions of this Note contained herein, which will for all purposes have\nthe same effect as if set forth at this place. \n\n\nIN WITNESS WHEREOF, the Issuer has caused this Note to be signed manually or by\nfacsimile by its duly authorized officer.  \xa0\n\n\n APPLE INC.\n\n\n By:\n\xa0\n \xa0\n\n\n\xa0\n Name: Gary Wipfler\n\n\n\xa0\n T

In [32]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import hub
from langchain_core.prompts import ChatPromptTemplate

llm = ChatOpenAI(model_name="gpt-4", temperature=0.7, max_tokens=2000)
retriever = db.as_retriever()
# prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)




prompt = ChatPromptTemplate.from_template(
    "This is an example briefing: {context}.\n\n {question}."
)

print(prompt)

# Configure the Rag (Retrieval-Augmented Generation) Chain
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)



prompt = f""
print(rag_chain.invoke(prompt))




input_variables=['context', 'question'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='This is an example briefing: {context}.\n\n {question}.'))]
THIS INDENTURE, made this ______ day of ________, 20______, between APPLE INC., a corporation organized and existing under the laws of the State of California (hereinafter called the "Company"), and _________________, a ___________ corporation (hereinafter called the "Trustee"). 

WITNESSETH:

WHEREAS, the Company has duly authorized the creation of an issue of its 1.250% Note due 2030 (hereinafter called the "Notes");

AND WHEREAS, to provide for the authentication and delivery of the Notes, the Company has duly authorized the execution and delivery of this Indenture;

AND WHEREAS, all things necessary to make the Notes, when executed by the Company and authenticated and delivered hereunder, the valid obligations of the Company, and to make this Indenture a valid agreement of th

In [33]:

print(prompt)
response = llm.invoke(prompt)
print(response)

As a lawyer representing Apple Inc., I am working on a indenture. Make one for me. Use 1000 words.
content='**INDENTURE AGREEMENT**\n\nTHIS INDENTURE, made this (Insert Date), by and between APPLE INC., a California corporation, hereinafter referred to as "Company", with its principal place of business located at One Apple Park Way, Cupertino, CA, USA, and (Insert Second Party\'s Name), a (Insert Second Party\'s Location and Type of Entity), hereinafter referred to as "Trustee", with its principal place of business located at (Trustee\'s Address).\n\nWITNESSETH:\n\nWHEREAS, the Company desires to secure the payment of certain of its obligations by granting to the Trustee a security interest in certain of its assets;\n\nNOW, THEREFORE, in consideration of the premises and the mutual covenants herein contained, the parties hereto agree as follows:\n\n1. **Definitions and Interpretation:** Any capitalized term used in this Indenture shall, unless otherwise defined herein, have the meaning

In [36]:
print(response.content)

**INDENTURE AGREEMENT**

THIS INDENTURE, made this (Insert Date), by and between APPLE INC., a California corporation, hereinafter referred to as "Company", with its principal place of business located at One Apple Park Way, Cupertino, CA, USA, and (Insert Second Party's Name), a (Insert Second Party's Location and Type of Entity), hereinafter referred to as "Trustee", with its principal place of business located at (Trustee's Address).

WITNESSETH:

WHEREAS, the Company desires to secure the payment of certain of its obligations by granting to the Trustee a security interest in certain of its assets;

NOW, THEREFORE, in consideration of the premises and the mutual covenants herein contained, the parties hereto agree as follows:

1. **Definitions and Interpretation:** Any capitalized term used in this Indenture shall, unless otherwise defined herein, have the meaning assigned to it in the Agreement.

2. **Grant of Security:** The Company, in order to secure the punctual payment of the 

In [40]:
import requests
from bs4 import BeautifulSoup

# URL of the SEC filing page
url = 'https://www.sec.gov/Archives/edgar/data/320193/000119312519242975/0001193125-19-242975-index.html'

# Send a GET request to the URL
response = requests.get(url, headers=headers)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all <a> tags containing the text 'Exhibit 4.1'
    exhibit_links = soup.find_all('a', string='Exhibit 4.1')

    # Check if any links were found
    if exhibit_links:
        # Get the href attribute of the first link found
        exhibit_link = exhibit_links[0]['href']
        print("Link to Exhibit 4.1:", exhibit_link)
    else:
        print("Exhibit 4.1 link not found.")
else:
    print("Error fetching URL:", response.status_code)


Exhibit 4.1 link not found.


In [41]:
soup

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="Wed, 11 Sep 2019 20:21:21 GMT" http-equiv="Last-Modified"/>
<title>EDGAR Filing Documents for 0001193125-19-242975</title>
<link href="/include/interactive.css" rel="stylesheet" type="text/css"/>
</head>
<body style="margin: 0">
<!-- SEC Web Analytics - For information please visit: https://www.sec.gov/privacy.htm#collectedinfo -->
<noscript><iframe height="0" src="//www.googletagmanager.com/ns.html?id=GTM-TD3BKV" style="display:none;visibility:hidden" width="0"></iframe></noscript>
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'//www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.inse

In [42]:
pip install sec-edgar-downloader

Collecting sec-edgar-downloader
  Downloading sec_edgar_downloader-5.0.2-py3-none-any.whl.metadata (11 kB)
Collecting pyrate-limiter>=3.1.0 (from sec-edgar-downloader)
  Downloading pyrate_limiter-3.6.0-py3-none-any.whl.metadata (24 kB)
Downloading sec_edgar_downloader-5.0.2-py3-none-any.whl (14 kB)
Downloading pyrate_limiter-3.6.0-py3-none-any.whl (26 kB)
Installing collected packages: pyrate-limiter, sec-edgar-downloader
Successfully installed pyrate-limiter-3.6.0 sec-edgar-downloader-5.0.2
Note: you may need to restart the kernel to use updated packages.


In [44]:
from sec_edgar_downloader import Downloader

# Download filings to the current working directory
dl = Downloader("MyCompanyName", "my.email@domain.com", "/path/to/save/location")

# Get all 10-K filings for Microsoft without the filing details
dl.get("10-K", "MSFT", download_details=False)

# Get the latest supported filings, if available, for Apple
for filing_type in dl.supported_filings:
    dl.get(filing_type, "AAPL", limit=1)

# Get the latest supported filings, if available, for a
# specified list of tickers and CIKs
equity_ids = ["AAPL", "MSFT", "0000102909", "V", "FB"]
for equity_id in equity_ids:
    for filing_type in dl.supported_filings:
        dl.get(filing_type, equity_id, limit=1)

210

In [47]:
dl.supported_filings

AttributeError: 'Downloader' object has no attribute 'supported_filings'