In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import os
import time

# Import log to merge metadata later
log = pd.read_csv('download_log.csv')
display(log.head())

Unnamed: 0,Ticker,Title,CIK,AccessionNumber,Form,Date,URL
0,XOM,EXXON MOBIL CORP,34088,95010324006564,8-K,2024-05-10,https://www.sec.gov/Archives/edgar/data/000003...
1,XOM,EXXON MOBIL CORP,34088,95010324006322,8-K,2024-05-03,https://www.sec.gov/Archives/edgar/data/000003...
2,XOM,EXXON MOBIL CORP,34088,3408824000029,10-Q,2024-04-29,https://www.sec.gov/Archives/edgar/data/000003...
3,XOM,EXXON MOBIL CORP,34088,3408824000025,8-K,2024-04-26,https://www.sec.gov/Archives/edgar/data/000003...
4,XOM,EXXON MOBIL CORP,34088,3408824000021,8-K,2024-04-03,https://www.sec.gov/Archives/edgar/data/000003...


In [2]:
# Initialize variables
start_time = time.time()
save_directory = os.path.join("..", "SEC_data")
data = []
file_count = 0

# Loop through the HTML files in the SEC_data directory and add text to a df
for file_name in os.listdir(save_directory):
    file_path = os.path.join(save_directory, file_name)

    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
        # Convert HTML to plain text, this needs some work
        soup = BeautifulSoup(html_content, 'html.parser')
        text = soup.get_text(separator='\n')
        
        # Extract information from the file name
        access_number, ticker, form, date = file_name.split('_')
        date = date.replace('.html', '')
        
        # Add the content to the data list
        data.append({
            'AccessionNumber': access_number,
            'Ticker': ticker,
            'Date': date,
            'Form': form,
            'Text': text
        })

    # Provide updates every 10 documents
    file_count += 1
    if file_count % 10 == 0:
        elapsed_time = time.time() - start_time
        print(f"Processed {file_count} documents. Most recent ticker: {ticker}. Elapsed time: {elapsed_time:.2f} seconds.")

# Create a DataFrame from the data list
df = pd.DataFrame(data)
display(df.head())

Processed 10 documents. Most recent ticker: XOM. Elapsed time: 7.53 seconds.
Processed 20 documents. Most recent ticker: XOM. Elapsed time: 19.39 seconds.
Processed 30 documents. Most recent ticker: XOM. Elapsed time: 24.46 seconds.
Processed 40 documents. Most recent ticker: XOM. Elapsed time: 29.19 seconds.
Processed 50 documents. Most recent ticker: XOM. Elapsed time: 31.27 seconds.
Processed 60 documents. Most recent ticker: XOM. Elapsed time: 36.23 seconds.
Processed 70 documents. Most recent ticker: XOM. Elapsed time: 38.18 seconds.
Processed 80 documents. Most recent ticker: XOM. Elapsed time: 43.30 seconds.
Processed 90 documents. Most recent ticker: XOM. Elapsed time: 45.90 seconds.
Processed 100 documents. Most recent ticker: XOM. Elapsed time: 52.69 seconds.
Processed 110 documents. Most recent ticker: CVX. Elapsed time: 68.88 seconds.
Processed 120 documents. Most recent ticker: CVX. Elapsed time: 79.44 seconds.
Processed 130 documents. Most recent ticker: CVX. Elapsed time

Unnamed: 0,AccessionNumber,Ticker,Date,Form,Text
0,3408819000002,XOM,2019-02-01,8-K,\n\n8-K\n\n1\n\nr8k020119.htm\n\nFORM 8-K\n\n\...
1,3408819000007,XOM,2019-02-26,8-K,\n\n8-K\n\n1\n\nr8k022619.htm\n\nFORM 8-K\n\n\...
2,3408819000010,XOM,2019-02-27,10-K,\n\n10-K\n\n1\n\nxom10k2018.htm\n\nFORM 10-K\n...
3,3408819000013,XOM,2019-04-10,8-K,\n\n8-K\n\n1\n\nr8k041019.htm\n\nFORM 8-K\n\n\...
4,3408819000015,XOM,2019-04-26,8-K,\n\n8-K\n\n1\n\nr8k042619.htm\n\nFORM 8-K\n\n\...


In [3]:
# Add metadata from log
merged_df = pd.merge(df, log[['AccessionNumber', 'URL']], how='left', on='AccessionNumber')
display(merged_df.head())

Unnamed: 0,AccessionNumber,Ticker,Date,Form,Text,URL
0,3408819000002,XOM,2019-02-01,8-K,\n\n8-K\n\n1\n\nr8k020119.htm\n\nFORM 8-K\n\n\...,https://www.sec.gov/Archives/edgar/data/000003...
1,3408819000007,XOM,2019-02-26,8-K,\n\n8-K\n\n1\n\nr8k022619.htm\n\nFORM 8-K\n\n\...,https://www.sec.gov/Archives/edgar/data/000003...
2,3408819000010,XOM,2019-02-27,10-K,\n\n10-K\n\n1\n\nxom10k2018.htm\n\nFORM 10-K\n...,https://www.sec.gov/Archives/edgar/data/000003...
3,3408819000013,XOM,2019-04-10,8-K,\n\n8-K\n\n1\n\nr8k041019.htm\n\nFORM 8-K\n\n\...,https://www.sec.gov/Archives/edgar/data/000003...
4,3408819000015,XOM,2019-04-26,8-K,\n\n8-K\n\n1\n\nr8k042619.htm\n\nFORM 8-K\n\n\...,https://www.sec.gov/Archives/edgar/data/000003...


In [5]:
# Sort
merged_df['Date'] = pd.to_datetime(merged_df['Date'])
merged_df = merged_df.sort_values(by=['Ticker', 'Date'])
display(merged_df.head())

Unnamed: 0,AccessionNumber,Ticker,Date,Form,Text,URL
522,114036119010687,BP,2019-06-07,11-K,\n\n11-K\n\n1\n\nform11k.htm\n\n11-K\n\n\n\n\n...,https://www.sec.gov/Archives/edgar/data/000031...
256,31380720000008,BP,2020-06-23,11-K,\n\n11-K\n\n1\n\na2019form11-k.htm\n\n11-K\n\n...,https://www.sec.gov/Archives/edgar/data/000031...
257,31380721000009,BP,2021-06-11,11-K,\n\n11-K\n\n1\n\na2020form11-k.htm\n\n11-K\n\n...,https://www.sec.gov/Archives/edgar/data/000031...
258,31380722000025,BP,2022-06-10,11-K,\n\n11-K\n\n1\n\na2021form11-k.htm\n\n11-K\n\n...,https://www.sec.gov/Archives/edgar/data/000031...
259,31380723000020,BP,2023-06-13,11-K,\n\n11-K\n\n1\n\na2022form11-k1.htm\n\n11-K\n\...,https://www.sec.gov/Archives/edgar/data/000031...


In [6]:
# Export to CSV
output_path = os.path.join("..", "SEC_filings.csv")
merged_df.to_csv(output_path, index=False)