# Scrape Data from EDGAR

In [2]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sec_edgar_downloader import Downloader

# Create a directory for the filings if it doesn't exist
download_directory = os.path.join(os.getcwd(), "sec-edgar-filings")
os.makedirs(download_directory, exist_ok=True)

# Initialize the downloader
email_address = "2211975083mjz@gmail.com"
dl = Downloader(email_address, download_directory)


# Download 10-K filings for a specific company
dl.get("10-K", "AAPL")  # Replace "AAPL" with the desired ticker symbol

print(f"10-K filings for AAPL have been downloaded to {download_directory}")


# Parse the downloaded filings
filings_dir = os.path.join(download_directory, "AAPL", "10-K")
print(f"Checking directory: {filings_dir}")


# List the contents of the directory
try:
    contents = os.listdir(filings_dir)
    print(f"Contents of {filings_dir}: {contents}")
except Exception as e:
    print(f"Error accessing directory {filings_dir}: {e}")


# Collect all .txt files from subdirectories
txt_files = []
for subdir in contents:
    subdir_path = os.path.join(filings_dir, subdir)
    if os.path.isdir(subdir_path):
        subdir_contents = os.listdir(subdir_path)
        txt_files += [os.path.join(subdir_path, f) for f in subdir_contents if f.endswith('.txt')]


print(f"Collected .txt files: {txt_files}")

# Print the path of the first file if it exists
if txt_files:
    first_filing = txt_files[0]
    print(f"First filing path: {first_filing}")
    
    # Attempt to read and print the content of the first file
    print(f"Reading file: {first_filing}")
    try:
        with open(first_filing, 'r', encoding='utf-8') as file:
            content = file.read()
            if content:
                print(content[:1000])  # Print the first 1000 characters
            else:
                print("File is empty")
    except Exception as e:
        print(f"Error reading file: {e}")
else:
    print("No filings found")

10-K filings for AAPL have been downloaded to c:\Users\USER\Documents\The Ma's Library\MM_Projects\sec-edgar-filings
Checking directory: c:\Users\USER\Documents\The Ma's Library\MM_Projects\sec-edgar-filings\AAPL\10-K
Contents of c:\Users\USER\Documents\The Ma's Library\MM_Projects\sec-edgar-filings\AAPL\10-K: ['0000320193-17-000070', '0000320193-18-000145', '0000320193-19-000119', '0000320193-20-000096', '0000320193-21-000105', '0000320193-22-000108', '0000320193-23-000106', '0000320193-94-000016', '0000320193-95-000016', '0000320193-96-000023', '0000912057-00-053623', '0000912057-99-010244', '0001047469-02-007674', '0001047469-03-041604', '0001047469-04-035975', '0001047469-07-009340', '0001047469-97-006960', '0001104659-05-058421', '0001104659-06-084288', '0001193125-08-224958', '0001193125-09-214859', '0001193125-10-238044', '0001193125-11-282113', '0001193125-12-444068', '0001193125-13-416534', '0001193125-14-383437', '0001193125-15-356351', '0001628280-16-020309']
Collected .txt 

## Extract the text from the filing

1. Identify Key Data Points: Determine which financial data points you need to extract from the 10-K filings (e.g., revenue, net income, etc.).
2. Parse the Content: Use BeautifulSoup to parse the HTML content and extract the required data.
3. Store the Data: Store the extracted data in a structured format, such as a pandas DataFrame.
4. Analyze and Automate: Use the structured data for further analysis and automate the process for future use.

In [4]:
import os
from bs4 import BeautifulSoup
import pandas as pd
from sec_edgar_downloader import Downloader

# Function to inspect and extract financial data from a filing
def inspect_and_extract_filing(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

        # Try different parsers
        parsers = ['lxml', 'html5lib', 'html.parser']
        for parser in parsers:
            try:
                soup = BeautifulSoup(content, parser)

                # Find all instances of "Net sales" and extract the adjacent value
                net_sales = None
                net_sales_elements = soup.find_all('div', text='Net sales')
                for element in net_sales_elements:
                    # Navigate to the parent td, and find the next sibling td with the sales value
                    parent_td = element.find_parent('td')
                    if parent_td:
                        sales_td = parent_td.find_next_sibling('td').find_next_sibling('td')
                        if sales_td:
                            net_sales = sales_td.get_text(strip=True)
                            break
                
                if net_sales:
                    return net_sales
            except Exception as e:
                print(f"Error using parser {parser} on file {file_path}: {e}")
        
        # If no parser works, return None
        return None


# Test the extraction on a single file
if txt_files:
    test_file = txt_files[0]
    print(f"Testing file: {test_file}")
    
    # Attempt to extract revenue from the test file
    revenue =  inspect_and_extract_filing(test_file)
    if revenue:
        print(f"Extracted revenue: {revenue}")
    else:
        print("No revenue information found")
else:
    print("No filings found")

Testing file: c:\Users\USER\Documents\The Ma's Library\MM_Projects\sec-edgar-filings\AAPL\10-K\0000320193-17-000070\full-submission.txt


  net_sales_elements = soup.find_all('div', text='Net sales')


Extracted revenue: 229,234
