In [1]:
import requests
import os
import pandas as pd
import yfinance as yf
from bs4 import BeautifulSoup
from io import BytesIO
from markitdown import MarkItDown
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import DocumentStream

In [2]:
html_content = b"<html><body><h1>Hello</h1><p>This is from SEC Form 4</p></body></html>"

### Microsoft MarkItDown

In [3]:
mid = MarkItDown()
result = mid.convert_stream(BytesIO(html_content))
if result:
    print(result.text_content)
else:
    print("Conversion failed.")


# Hello

This is from SEC Form 4


### IBM Docling Converter

In [4]:
html_stream = DocumentStream(name='test', stream=BytesIO(html_content))
converter = DocumentConverter()
result = converter.convert(html_stream)
print(result.document.export_to_markdown())

# Hello

This is from SEC Form 4


### SEC Data

In [5]:
def fetch_company_data_sec():
        USER_AGENT_SEC = os.getenv('USER_AGENT_SEC')
        headers = {'User-Agent': USER_AGENT_SEC}
        company_tickers = requests.get(
            "https://www.sec.gov/files/company_tickers.json",
            headers=headers
        )
        company_data = pd.DataFrame(company_tickers.json()).T
        return company_data


def fetch_company_details_and_filing_accessions(cik):
        USER_AGENT_SEC = os.getenv('USER_AGENT_SEC')
        headers = {'User-Agent': USER_AGENT_SEC}
        filingMetaData = requests.get(
            f'https://data.sec.gov/submissions/CIK{cik}.json',
            headers=headers
        )
        filing_dict = filingMetaData.json()

        important_keys = [
            "name", "tickers", "exchanges", "sicDescription",
            "description", "website", "fiscalYearEnd"
        ]

        secondary_keys = [
            "stateOfIncorporation", "stateOfIncorporationDescription",
            "insiderTransactionForOwnerExists", "insiderTransactionForIssuerExists",
            "category", "addresses"
        ]
        

        # Filter dictionaries
        first_meta_data_dict = {k: (v if v else "N/A") for k, v in filing_dict.items() if k in important_keys}
        secondary_meta_data_dict = {k: (v if v else "N/A") for k, v in filing_dict.items() if k in secondary_keys}
        filings = filing_dict['filings']
        
        return first_meta_data_dict, secondary_meta_data_dict, filings
        
                
def fetch_company_filings(cik, accession, filename):
    USER_AGENT_SEC = os.getenv('USER_AGENT_SEC')
    headers = {'User-Agent': USER_AGENT_SEC}
    url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession}/{filename}"
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        data = response.content
    return data


# st.write('https://www.sec.gov' + data['directory']['name'] + '/' + data['directory']['item'][2]['name'])
#             base_url = 'https://www.sec.gov' + data['directory']['name'] + '/' + data['directory']['item'][2]['name']
#             response_2 = requests.get(base_url, headers=headers)
#             if response_2.status_code == 200:
#                 text = response_2.text
                            

In [6]:
important_forms = ['10-K', '10-Q', '8-K', 'S-1', 'S-3', 'DEF 14A', '20-F', '6-K', '4', '13D', '13G']
cols = ['accessionNumber', 'reportDate', 'form', 'cik']

ciks = fetch_company_data_sec()
ciks['cik_str'] = ciks['cik_str'].astype(str).apply(lambda x: x.zfill(10))
ciks.head()

Unnamed: 0,cik_str,ticker,title
0,320193,AAPL,Apple Inc.
1,1045810,NVDA,NVIDIA CORP
2,789019,MSFT,MICROSOFT CORP
3,1018724,AMZN,AMAZON COM INC
4,1652044,GOOGL,Alphabet Inc.


In [7]:
cik = ciks['cik_str'][0]

  cik = ciks['cik_str'][0]


In [8]:
first_meta, second_meta, filings = fetch_company_details_and_filing_accessions(cik=cik) 

### Metadata first level Company

In [9]:
first_meta

{'sicDescription': 'Electronic Computers',
 'name': 'Apple Inc.',
 'tickers': ['AAPL'],
 'exchanges': ['Nasdaq'],
 'description': 'N/A',
 'website': 'N/A',
 'fiscalYearEnd': '0927'}

### Metadata second level Company

In [10]:
second_meta

{'insiderTransactionForOwnerExists': 'N/A',
 'insiderTransactionForIssuerExists': 1,
 'category': 'Large accelerated filer',
 'stateOfIncorporation': 'CA',
 'stateOfIncorporationDescription': 'CA',
 'addresses': {'mailing': {'street1': 'ONE APPLE PARK WAY',
   'street2': None,
   'city': 'CUPERTINO',
   'stateOrCountry': 'CA',
   'zipCode': '95014',
   'stateOrCountryDescription': 'CA'},
  'business': {'street1': 'ONE APPLE PARK WAY',
   'street2': None,
   'city': 'CUPERTINO',
   'stateOrCountry': 'CA',
   'zipCode': '95014',
   'stateOrCountryDescription': 'CA'}}}

### Company filings

In [11]:
filing_keys = list(filings['recent'].keys())
filing_keys

['accessionNumber',
 'filingDate',
 'reportDate',
 'acceptanceDateTime',
 'act',
 'form',
 'fileNumber',
 'filmNumber',
 'items',
 'core_type',
 'size',
 'isXBRL',
 'isInlineXBRL',
 'primaryDocument',
 'primaryDocDescription']

In [12]:
filings

{'recent': {'accessionNumber': ['0000320193-25-000036',
   '0000320193-25-000035',
   '0000320193-25-000034',
   '0000320193-25-000033',
   '0000320193-25-000032',
   '0000320193-25-000031',
   '0000320193-25-000030',
   '0001140361-25-005876',
   '0001096906-25-000152',
   '0000320193-25-000022',
   '0000320193-25-000021',
   '0000320193-25-000020',
   '0000320193-25-000019',
   '0000320193-25-000018',
   '0000320193-25-000017',
   '0000320193-25-000016',
   '0001921094-25-000087',
   '0000320193-25-000008',
   '0000320193-25-000007',
   '0001096906-25-000100',
   '0000320193-25-000002',
   '0001308179-25-000009',
   '0001308179-25-000008',
   '0001140361-25-000228',
   '0000320193-24-000132',
   '0001973141-24-000617',
   '0000320193-24-000130',
   '0000320193-24-000129',
   '0001921094-24-001434',
   '0001921094-24-001420',
   '0001921094-24-001407',
   '0000320193-24-000126',
   '0001140361-24-044880',
   '0000320193-24-000123',
   '0000320193-24-000120',
   '0000320193-24-000116',

### Mapping of the latest available forms as key(form), value(index)

In [13]:
mapping_latest_forms_doc_index = {}
for form in important_forms:
    for index, row in enumerate(filings['recent']['form']):
        if str(row) == form:
            mapping_latest_forms_doc_index[form] = index
            break
available_forms = list(mapping_latest_forms_doc_index.keys())

In [14]:
available_forms

['10-K', '10-Q', '8-K', 'DEF 14A', '4']

In [15]:
mapping_latest_forms_doc_index

{'10-K': 33, '10-Q': 17, '8-K': 7, 'DEF 14A': 22, '4': 0}

In [16]:
last_acc_numbers = []
report_dates = []
forms = []
primary_docs = []

for _, index in mapping_latest_forms_doc_index.items():
    last_acc_numbers.append(filings['recent']['accessionNumber'][index].replace('-', ''))
    report_dates.append(filings['recent']['reportDate'][index])
    forms.append(filings['recent']['form'][index])
    primary_docs.append(filings['recent']['primaryDocument'][index])

In [17]:
primary_docs

['aapl-20240928.htm',
 'aapl-20241228.htm',
 'ef20044022_8k.htm',
 'aapl4359751-def14a.htm',
 'xslF345X05/wk-form4_1740699336.xml']

In [18]:
df = pd.DataFrame({
    'accession_number': last_acc_numbers,
    'report_date': report_dates,
    'form': forms,
    'docs':primary_docs
})

In [19]:
df['cik'] = '0000320193'

In [20]:
df

Unnamed: 0,accession_number,report_date,form,docs,cik
0,32019324000123,2024-09-28,10-K,aapl-20240928.htm,320193
1,32019325000008,2024-12-28,10-Q,aapl-20241228.htm,320193
2,114036125005876,2025-02-25,8-K,ef20044022_8k.htm,320193
3,130817925000008,2025-02-25,DEF 14A,aapl4359751-def14a.htm,320193
4,32019325000036,2025-02-25,4,xslF345X05/wk-form4_1740699336.xml,320193


In [21]:
all_filings = []
for _, row in df.iterrows():
    filings_dict = fetch_company_filings(cik=row['cik'], accession=row['accession_number'], filename=row['docs'])
    all_filings.append(filings_dict)

In [22]:
all_filings[0]

b'<?xml version=\'1.0\' encoding=\'ASCII\'?>\n<!--XBRL Document Created with the Workiva Platform-->\n<!--Copyright 2024 Workiva-->\n<!--r:6516014a-223b-4792-964c-105c0fc62715,g:fb24cc6b-9929-486d-8f15-d4cad8060a59,d:7bfbfbe54b9647b1b4ba4ff4e0aba09d-->\n<html xmlns:link="http://www.xbrl.org/2003/linkbase" xmlns:iso4217="http://www.xbrl.org/2003/iso4217" xmlns:country="http://xbrl.sec.gov/country/2024" xmlns="http://www.w3.org/1999/xhtml" xmlns:ixt-sec="http://www.sec.gov/inlineXBRL/transformation/2015-08-31" xmlns:dei="http://xbrl.sec.gov/dei/2024" xmlns:xbrli="http://www.xbrl.org/2003/instance" xmlns:ixt="http://www.xbrl.org/inlineXBRL/transformation/2020-02-12" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:aapl="http://www.apple.com/20240928" xmlns:ecd="http://xbrl.sec.gov/ecd/2024" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xbrldi="http://xbrl.org/2006/xbrldi" xmlns:ix="http://www.xbrl.org/2013/inlineXBRL" xmlns:srt="http://fasb.org/srt/2024" xmlns:us-gaap="http:

In [23]:
for index, content in enumerate(all_filings):
    df.at[index, 'content'] = content

In [24]:
df

Unnamed: 0,accession_number,report_date,form,docs,cik,content
0,32019324000123,2024-09-28,10-K,aapl-20240928.htm,320193,b'<?xml version=\'1.0\' encoding=\'ASCII\'?>\n...
1,32019325000008,2024-12-28,10-Q,aapl-20241228.htm,320193,b'<?xml version=\'1.0\' encoding=\'ASCII\'?>\n...
2,114036125005876,2025-02-25,8-K,ef20044022_8k.htm,320193,b'<?xml version=\'1.0\' encoding=\'ASCII\'?>\n...
3,130817925000008,2025-02-25,DEF 14A,aapl4359751-def14a.htm,320193,b'<?xml version=\'1.0\' encoding=\'ASCII\'?>\n...
4,32019325000036,2025-02-25,4,xslF345X05/wk-form4_1740699336.xml,320193,"b'<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.01..."


### Clean content wit Microsoft MarkItDown

In [25]:
for index, row in df.iterrows():
    mid = MarkItDown()
    result_mid = mid.convert_stream(BytesIO(df.at[index, 'content']))
    content_md = result_mid.text_content
    df.loc[index, 'cleaned_markitdown_md'] = content_md

In [41]:
for index, row in df.iterrows():
    if index == 3:
        print(row['cleaned_markitdown_md'])

<?xml version='1.0' encoding='ASCII'?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:xs="http://www.w3.org/2001/XMLSchema-instance" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xbrli="http://www.xbrl.org/2003/instance" xmlns:xbrldi="http://xbrl.org/2006/xbrldi" xmlns:xbrldt="http://xbrl.org/2005/xbrldt" xmlns:iso4217="http://www.xbrl.org/2003/iso4217" xmlns:ix="http://www.xbrl.org/2013/inlineXBRL" xmlns:ixt="http://www.xbrl.org/inlineXBRL/transformation/2015-02-26" xmlns:ixt-sec="http://www.sec.gov/inlineXBRL/transformation/2015-08-31" xmlns:link="http://www.xbrl.org/2003/linkbase" xmlns:dei="http://xbrl.sec.gov/dei/2024" xmlns:us-gaap="http://fasb.org/us-gaap/2024" xmlns:us-roles="http://fasb.org/us-roles/2024" xmlns:dtr-types="http://www.xbrl.org/dtr/type/2022-03-31" xmlns:ecd="http://xbrl.sec.gov/ecd/2024" xmlns:country="http://xbrl.sec.gov/country/2024" xmlns:srt="http://fasb.org/srt/2024" xmlns:aapl="http://apple.com/20250225">
<head>
     <title>Apple Inc.- DEF14A</title>


### Clean Content with Docling

In [29]:
for index, row in df.iterrows():
    try:
        html_stream = DocumentStream(name=f'sec_{index}', stream=BytesIO(df.at[index, 'content']))
        converter = DocumentConverter()
        result = converter.convert(html_stream)
        df.loc[index, 'cleaned_docling_md'] = result.document.export_to_markdown()
    except Exception:
        continue

Input document sec_2 does not match any allowed format.


In [40]:
for index, row in df.iterrows():
    if index==3:
        print(row['cleaned_docling_md'])

Back to Contents

UNITED STATES

SECURITIES AND EXCHANGE COMMISSION

Washington, DC 20549

SCHEDULE 14A

PROXY STATEMENT PURSUANT TO SECTION 14(a)
OF THE SECURITIES EXCHANGE ACT OF 1934
(Amendment No. )

| Check the appropriate box:   | Check the appropriate box:                                                      |
|------------------------------|---------------------------------------------------------------------------------|
|                              | Preliminary Proxy Statement                                                     |
|                              | Confidential, for Use of the Commission Only (as permitted by Rule 14a-6(e)(2)) |
|                              | Definitive Proxy Statement                                                      |
|                              | Definitive Additional Materials                                                 |
|                              | Soliciting Material under §240.14a-12                                    

In [31]:
df

Unnamed: 0,accession_number,report_date,form,docs,cik,content,cleaned_markitdown_md,cleaned_docling_md
0,32019324000123,2024-09-28,10-K,aapl-20240928.htm,320193,b'<?xml version=\'1.0\' encoding=\'ASCII\'?>\n...,<?xml version='1.0' encoding='ASCII'?>\n<!--XB...,| | | |\n|----|----|----|\n| | ...
1,32019325000008,2024-12-28,10-Q,aapl-20241228.htm,320193,b'<?xml version=\'1.0\' encoding=\'ASCII\'?>\n...,<?xml version='1.0' encoding='ASCII'?>\n<!--XB...,| | | |\n|----|----|----|\n| | ...
2,114036125005876,2025-02-25,8-K,ef20044022_8k.htm,320193,b'<?xml version=\'1.0\' encoding=\'ASCII\'?>\n...,<?xml version='1.0' encoding='ASCII'?>\n<html ...,
3,130817925000008,2025-02-25,DEF 14A,aapl4359751-def14a.htm,320193,b'<?xml version=\'1.0\' encoding=\'ASCII\'?>\n...,<?xml version='1.0' encoding='ASCII'?>\n<html ...,Back to Contents\n\nUNITED STATES\n\nSECURITIE...
4,32019325000036,2025-02-25,4,xslF345X05/wk-form4_1740699336.xml,320193,"b'<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.01...",SEC Form 4\n\n| | | | | | | | | | | ...,| Table I - Non-Derivative Securities Acquired...


### Timeseries Stock Data yahoo finance

In [None]:
def fetch_stock_data_yf(ticker, period="10y", interval="1mo"):
        if not ticker:
            raise ValueError(f"Fund '{ticker}' not found.")
        
        yf_ticker = yf.Ticker(ticker)
        hist = yf_ticker.history(period=period, interval=interval)
        info = yf_ticker.info
        return hist, info
        

In [None]:
hist, info = fetch_stock_data_yf(cik['ticker'][0])

  hist, info = fetch_stock_data_yf(cik['ticker'][0])


### Timeseries Stock

In [None]:
hist

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-04-01 00:00:00-04:00,27.875761,30.046504,27.491637,27.949459,3984542000,0.00,0.0
2015-05-01 00:00:00-04:00,28.161616,29.695877,27.549698,29.095125,3816608400,0.13,0.0
2015-06-01 00:00:00-04:00,29.216657,29.465586,27.915947,28.128994,3514426800,0.00,0.0
2015-07-01 00:00:00-04:00,28.458661,29.819923,26.736340,27.202803,4233122400,0.00,0.0
2015-08-01 00:00:00-04:00,27.247653,27.487612,20.631968,25.287617,6427276400,0.13,0.0
...,...,...,...,...,...,...,...
2024-11-01 00:00:00-04:00,220.484749,237.287764,219.227521,236.808823,891640600,0.25,0.0
2024-12-01 00:00:00-05:00,237.009422,259.814351,236.899542,250.144974,977916100,0.00,0.0
2025-01-01 00:00:00-05:00,248.656607,248.826433,219.139072,235.740814,1200291700,0.00,0.0
2025-02-01 00:00:00-05:00,229.737410,249.725428,225.452114,241.574387,862272300,0.25,0.0


### Info Stock

In [None]:
info

{'address1': 'One Apple Park Way',
 'city': 'Cupertino',
 'state': 'CA',
 'zip': '95014',
 'country': 'United States',
 'phone': '(408) 996-1010',
 'website': 'https://www.apple.com',
 'industry': 'Consumer Electronics',
 'industryKey': 'consumer-electronics',
 'industryDisp': 'Consumer Electronics',
 'sector': 'Technology',
 'sectorKey': 'technology',
 'sectorDisp': 'Technology',
 'longBusinessSummary': 'Apple Inc. designs, manufactures, and markets smartphones, personal computers, tablets, wearables, and accessories worldwide. The company offers iPhone, a line of smartphones; Mac, a line of personal computers; iPad, a line of multi-purpose tablets; and wearables, home, and accessories comprising AirPods, Apple TV, Apple Watch, Beats products, and HomePod. It also provides AppleCare support and cloud services; and operates various platforms, including the App Store that allow customers to discover and download applications and digital content, such as books, music, video, games, and p