# Scrape Data from EDGAR

In [2]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sec_edgar_downloader import Downloader

# Download one company's 10-K filings from EDGAR, then locate the downloaded
# full-submission text files and preview the first one.

# Ticker and form type are named once so the download call, the directory
# lookup and the status message cannot drift apart.
ticker = "AAPL"  # Replace "AAPL" with the desired ticker symbol
form_type = "10-K"

# Create a directory for the filings if it doesn't exist
download_directory = os.path.join(os.getcwd(), "sec-edgar-filings")
os.makedirs(download_directory, exist_ok=True)

# Initialize the downloader
# The contact details can be overridden through environment variables so the
# notebook does not have to be edited (or shared) with a personal address.
company_name = os.environ.get("SEC_EDGAR_COMPANY", "Personal Research")
email_address = os.environ.get("SEC_EDGAR_EMAIL", "2211975083mjz@gmail.com")
# NOTE(review): sec-edgar-downloader 5.x documents the constructor as
# Downloader(company_name, email_address, download_folder) and saves under
# <download_folder>/sec-edgar-filings/<ticker>/<form>/. The previous call
# passed the e-mail as the company name and the directory as the e-mail.
# Passing the working directory as download_folder keeps the files exactly
# where `download_directory` expects them - confirm against the installed version.
dl = Downloader(company_name, email_address, os.getcwd())


# Download 10-K filings for a specific company
dl.get(form_type, ticker)

print(f"10-K filings for {ticker} have been downloaded to {download_directory}")


# Parse the downloaded filings
filings_dir = os.path.join(download_directory, ticker, form_type)
print(f"Checking directory: {filings_dir}")


# List the contents of the directory
try:
    # Sorted so the "first filing" below is the same on every run and platform.
    contents = sorted(os.listdir(filings_dir))
    print(f"Contents of {filings_dir}: {contents}")
except OSError as e:
    # Without this fallback `contents` would be undefined (NameError below) on a
    # fresh kernel, or silently reuse a stale value from an earlier run.
    contents = []
    print(f"Error accessing directory {filings_dir}: {e}")


# Collect all .txt files from subdirectories
txt_files = []
for subdir in contents:
    subdir_path = os.path.join(filings_dir, subdir)
    if os.path.isdir(subdir_path):
        txt_files += [
            os.path.join(subdir_path, f)
            for f in sorted(os.listdir(subdir_path))
            if f.endswith('.txt')
        ]


print(f"Collected .txt files: {txt_files}")

# Print the path of the first file if it exists
if txt_files:
    first_filing = txt_files[0]
    print(f"First filing path: {first_filing}")

    # Attempt to read and print the content of the first file
    print(f"Reading file: {first_filing}")
    try:
        with open(first_filing, 'r', encoding='utf-8') as file:
            content = file.read()
        if content:
            print(content[:1000])  # Print the first 1000 characters
        else:
            print("File is empty")
    except (OSError, UnicodeError) as e:
        # Covers both a missing/unreadable file and bytes that are not valid UTF-8.
        print(f"Error reading file: {e}")
else:
    print("No filings found")

10-K filings for AAPL have been downloaded to c:\Users\USER\Documents\The Ma's Library\MM_Projects\sec-edgar-filings
Checking directory: c:\Users\USER\Documents\The Ma's Library\MM_Projects\sec-edgar-filings\AAPL\10-K
Contents of c:\Users\USER\Documents\The Ma's Library\MM_Projects\sec-edgar-filings\AAPL\10-K: ['0000320193-17-000070', '0000320193-18-000145', '0000320193-19-000119', '0000320193-20-000096', '0000320193-21-000105', '0000320193-22-000108', '0000320193-23-000106', '0000320193-94-000016', '0000320193-95-000016', '0000320193-96-000023', '0000912057-00-053623', '0000912057-99-010244', '0001047469-02-007674', '0001047469-03-041604', '0001047469-04-035975', '0001047469-07-009340', '0001047469-97-006960', '0001104659-05-058421', '0001104659-06-084288', '0001193125-08-224958', '0001193125-09-214859', '0001193125-10-238044', '0001193125-11-282113', '0001193125-12-444068', '0001193125-13-416534', '0001193125-14-383437', '0001193125-15-356351', '0001628280-16-020309']
Collected .txt 

## Extract the text from the filing

1. Identify Key Data Points: Determine which financial data points you need to extract from the 10-K filings (e.g., revenue, net income).
2. Parse the Content: Use BeautifulSoup to parse the HTML content and extract the required data.
3. Store the Data: Store the extracted data in a structured format, such as a pandas DataFrame.
4. Analyze and Automate: Use the structured data for further analysis and automate the process for future use.

In [21]:
import os
import re
from bs4 import BeautifulSoup
import pandas as pd

# Function to convert a two-digit year to a four-digit year
def convert_two_digit_year(two_digit_year):
    """Expand a two-digit year into a four-digit year.

    The input is passed through ``int()``, so both ``"94"`` and ``94`` work.
    Values greater than 50 are offset by 1900 (51 -> 1951, 99 -> 1999);
    all other values are offset by 2000 (0 -> 2000, 50 -> 2050).
    """
    yy = int(two_digit_year)
    century = 1900 if yy > 50 else 2000
    return century + yy

# Helper: find the value reported for one metric label in a parsed filing
def _find_metric_value(soup, label):
    """Return the text of the cell two ``<td>`` siblings after ``label``, or None.

    Every text node containing ``label`` (case-insensitive) is tried in document
    order. A match is skipped when it is not inside a ``<td>`` or when its row
    does not have two further ``<td>`` siblings. Commas are stripped from the
    returned text.
    """
    pattern = re.compile(re.escape(label), re.IGNORECASE)
    for element in soup.find_all(string=pattern):
        parent_td = element.find_parent('td')
        if parent_td is None:
            continue
        # find_next_sibling returns None when there is no such sibling, so each
        # hop has to be checked before taking the next one.
        spacer_td = parent_td.find_next_sibling('td')
        if spacer_td is None:
            continue
        value_td = spacer_td.find_next_sibling('td')
        if value_td is not None:
            return value_td.get_text(strip=True).replace(',', '')
    return None


# Function to inspect and extract financial data from a filing
def inspect_and_extract_filing(file_path):
    """Extract the year and a few headline metrics from one filing text file.

    Parameters
    ----------
    file_path : str
        Path to the filing. The name of its parent folder is used to derive
        the year when it looks like ``<prefix>-<YY>-<suffix>``.

    Returns
    -------
    dict
        Always contains the keys ``'Year'``, ``'Net Sales'``,
        ``'Operating Income'``, ``'Net Income'``, ``'Total Assets'`` and
        ``'Total Liabilities'``. Each value is a string; ``"N/A"`` marks
        anything that could not be found.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Undecodable bytes are replaced instead of raising, so one stray byte
    # cannot abort the extraction for a whole filing.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        content = file.read()

    financial_data = {}

    # Extract the year from the file path (assuming the folder name contains the year)
    # NOTE(review): presumably the folder is the EDGAR accession number, whose
    # middle field would be the filing year rather than the fiscal year - confirm.
    parent_dir = os.path.basename(os.path.dirname(file_path))
    name_parts = parent_dir.split('-')
    year_from_path = name_parts[1] if len(name_parts) > 1 else ''
    if year_from_path.isdigit() and len(year_from_path) == 2:
        financial_data['Year'] = str(convert_two_digit_year(year_from_path))
    else:
        # Try to find the year in the file content
        year_match = re.search(r'(Fiscal Year|Year Ended|Report Date) (\d{4})', content, re.IGNORECASE)
        if year_match:
            financial_data['Year'] = year_match.group(2)
        else:
            financial_data['Year'] = "N/A"

    # Define the metrics to extract
    metrics = {
        'Net Sales': 'Net sales',
        'Operating Income': 'Operating income',
        'Net Income': 'Net income',
        'Total Assets': 'Total assets',
        'Total Liabilities': 'Total liabilities'
    }

    # Start from "N/A" so every metric key exists even if all parsers fail.
    for key in metrics:
        financial_data[key] = "N/A"

    # Try different parsers
    parsers = ['html.parser', 'lxml', 'html5lib']
    for parser in parsers:
        try:
            soup = BeautifulSoup(content, parser)
            extracted = {}
            for key, label in metrics.items():
                value = _find_metric_value(soup, label)
                extracted[key] = value if value is not None else "N/A"
        except Exception as e:
            print(f"Error using parser {parser} on file {file_path}: {e}")
            continue  # Try the next parser if the current one fails
        # Commit only a complete pass so a failed parser leaves nothing behind.
        financial_data.update(extracted)
        break  # Exit the loop if parsing is successful

    return financial_data

# Locate every downloaded filing, run the extractor on each one, and collect
# the results into a single DataFrame.

# Set the download directory
download_directory = os.path.join(os.getcwd(), "sec-edgar-filings")
filings_dir = os.path.join(download_directory, "AAPL", "10-K")

# List the contents of the directory
try:
    # Sorted so the row order of the resulting table is reproducible.
    contents = sorted(os.listdir(filings_dir))
    print(f"Contents of {filings_dir}: {contents}")
except OSError as e:
    # Without this fallback `contents` would be undefined (NameError below) on a
    # fresh kernel, or silently reuse the listing left over from an earlier cell.
    contents = []
    print(f"Error accessing directory {filings_dir}: {e}")

# Collect all .txt files from subdirectories
txt_files = []
for subdir in contents:
    subdir_path = os.path.join(filings_dir, subdir)
    if os.path.isdir(subdir_path):
        txt_files += [
            os.path.join(subdir_path, f)
            for f in sorted(os.listdir(subdir_path))
            if f.endswith('.txt')
        ]

print(f"Collected .txt files: {txt_files}")

# Collect financial data from all txt files
all_financial_data = []

for file in txt_files:
    try:
        financial_data = inspect_and_extract_filing(file)
    except (OSError, UnicodeError) as e:
        # One unreadable filing should not discard the results for the rest.
        print(f"Skipping {file}: {e}")
        continue
    if financial_data:
        all_financial_data.append(financial_data)

# Convert the collected data to a pandas DataFrame (one row per filing)
df_financial_data = pd.DataFrame(all_financial_data)
print(df_financial_data)

# Save the data to a CSV file
#df_financial_data.to_csv('financial_data.csv', index=False)
#print("Financial data has been saved to financial_data.csv")


Contents of c:\Users\USER\Documents\The Ma's Library\MM_Projects\sec-edgar-filings\AAPL\10-K: ['0000320193-17-000070', '0000320193-18-000145', '0000320193-19-000119', '0000320193-20-000096', '0000320193-21-000105', '0000320193-22-000108', '0000320193-23-000106', '0000320193-94-000016', '0000320193-95-000016', '0000320193-96-000023', '0000912057-00-053623', '0000912057-99-010244', '0001047469-02-007674', '0001047469-03-041604', '0001047469-04-035975', '0001047469-07-009340', '0001047469-97-006960', '0001104659-05-058421', '0001104659-06-084288', '0001193125-08-224958', '0001193125-09-214859', '0001193125-10-238044', '0001193125-11-282113', '0001193125-12-444068', '0001193125-13-416534', '0001193125-14-383437', '0001193125-15-356351', '0001628280-16-020309']
Collected .txt files: ["c:\\Users\\USER\\Documents\\The Ma's Library\\MM_Projects\\sec-edgar-filings\\AAPL\\10-K\\0000320193-17-000070\\full-submission.txt", "c:\\Users\\USER\\Documents\\The Ma's Library\\MM_Projects\\sec-edgar-filin