### Import libraries and load the download log

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from bs4.element import Tag
import re
import os
import time

# Read the download log now; its metadata gets merged in at a later step
log = pd.read_csv(filepath_or_buffer='download_log.csv')
display(log.head(n=5))

Unnamed: 0,Ticker,Title,CIK,AccessionNumber,Form,Date,URL
0,XOM,EXXON MOBIL CORP,34088,95010324006564,8-K,2024-05-10,https://www.sec.gov/Archives/edgar/data/000003...
1,XOM,EXXON MOBIL CORP,34088,95010324006322,8-K,2024-05-03,https://www.sec.gov/Archives/edgar/data/000003...
2,XOM,EXXON MOBIL CORP,34088,3408824000029,10-Q,2024-04-29,https://www.sec.gov/Archives/edgar/data/000003...
3,XOM,EXXON MOBIL CORP,34088,3408824000025,8-K,2024-04-26,https://www.sec.gov/Archives/edgar/data/000003...
4,XOM,EXXON MOBIL CORP,34088,3408824000021,8-K,2024-04-03,https://www.sec.gov/Archives/edgar/data/000003...


### Test code for cleaning HTML and Tables

In [2]:
# Get a few sample file names to test
save_directory = os.path.join("..", "SEC_data")

# os.listdir returns entries in arbitrary order, so the listing is sorted to
# make the positional picks below resolve to the same files on every run.
# Non-HTML entries (hidden files, checkpoint folders) are left out so they
# cannot shift the positions.
file_list = sorted(
    name for name in os.listdir(save_directory) if name.endswith('.html')
)

# Positions picked by hand to land on an 8-K, a 10-K, and a 10-Q
test_file_1 = file_list[0]
test_file_2 = file_list[2]
test_file_3 = file_list[5]
print(test_file_1)
print(test_file_2)
print(test_file_3)

000003408819000002_XOM_8-K_2019-02-01.html
000003408819000010_XOM_10-K_2019-02-27.html
000003408819000017_XOM_10-Q_2019-05-02.html


In [3]:
# Gather the three sample files
test_files = [test_file_1, test_file_2, test_file_3]


def _indented_tag_names(root):
    """Collect the distinct tag names found under ``root``.

    Each name is prefixed with two spaces per nesting level (``root`` itself
    gets no prefix), so the same tag at different depths counts separately.
    """
    names = set()
    pending = [(root, 0)]
    while pending:
        node, pad = pending.pop()
        if node.name is not None:
            names.add(' ' * pad + node.name)
        # Only Tag children are followed; text nodes and the like are ignored
        pending.extend(
            (kid, pad + 2) for kid in node.children if isinstance(kid, Tag)
        )
    return names


# Show every sample at successive stages of cleaning
for file_name in test_files:
    file_path = os.path.join(save_directory, file_name)

    # Raw html
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Soup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Unique HTML tag structure
    unique_tags = _indented_tag_names(soup)

    # Report for this file
    print("="*117)
    print(f"Content from {file_name}:\n")

    # HTML Tags, in reverse lexicographic order
    print("=" * 50 + " Unique HTML Tag Structure: " + "=" * 46)
    ordered_tags = sorted(unique_tags, reverse=True)
    for tag_line in ordered_tags:
        print(tag_line)

    # First 1000 characters of the raw HTML
    print("="*50 + "Raw HTML content:" + "="*50)
    print(html_content[:1000])

    # First 1000 characters of the text extracted by the parser
    print("="*50 + "Soup content:" + "="*54)
    print(soup.get_text()[:1000])

Content from 000003408819000002_XOM_8-K_2019-02-01.html:

[document]
  document
    type
      sequence
        filename
          description
            text
              html
                head
                body
                  meta
                  hr
                  div
                  a
                    p
                    font
                    div
                      table
                      p
                      font
                        tr
                        font
                        b
                          td
                          font
                          b
                            p
                            font
                            br
                              font
                                u
                                font
                                b
                                a
                                  u
                                  font
                              

In [4]:
# Narrow each sample down to its <body> element (drops everything above it)
# and preview the text that remains
for file_name in test_files:
    file_path = os.path.join(save_directory, file_name)

    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')
    body = soup.body

    # Separator, file header, and section header emitted in one call
    print(
        "="*117,
        f"Content from {file_name}:\n",
        "="*50 + "Body content:" + "="*54,
        sep="\n",
    )
    # First 1000 characters of the body text
    print(body.get_text()[:1000])

Content from 000003408819000002_XOM_8-K_2019-02-01.html:



 

 
 
 
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION  
Washington, D.C.
 20549  
 
 
FORM 8-K
 
 
CURRENT REPORT
Pursuant to Section 13
OR 15(d) of The Securities Exchange Act of 1934
 
 
 
 
Date of Report (Date of earliest event reported):  February
1, 2019
 
Exxon Mobil Corporation
(Exact name of registrant as specified in its charter)
 
 
 




New Jersey


1-2256


13-5409005




(State or other jurisdiction


(Commission


(IRS Employer 




of incorporation)


File Number)


Identification No.)




 
 
 
5959 LAS COLINAS BOULEVARD, IRVING, TEXAS 75039-2298
          (Address of principal executive
offices)                                            (Zip Code)
 
 
 Registrant’s telephone number, including area code:  972-940-6000  
 
 
 




 


 


 




 


(Former name or former address, if changed since last
  report)


 




 
Check the appropriate box below if the Form 8-K filing is intended to
simultaneously 

In [5]:
# Measure how much body text is left once every <table> element is dropped
for file_name in test_files:
    file_path = os.path.join(save_directory, file_name)

    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')
    body = soup.body

    # Header plus the size of the untouched body text
    print("="*117)
    print(f"Content from {file_name}:\n")
    length_before = len(body.get_text())
    print(f"Length of the body content: {length_before}")

    # Strip each table out of the parsed tree in place
    tables = body.find_all('table')
    for table_tag in tables:
        table_tag.decompose()

    # Size of the body text with the tables gone
    length_after = len(body.get_text())
    print(f"Length of the body content after removing tables: {length_after}")

Content from 000003408819000002_XOM_8-K_2019-02-01.html:

Length of the body content: 3637
Length of the body content after removing tables: 2258
Content from 000003408819000010_XOM_10-K_2019-02-27.html:

Length of the body content: 476911
Length of the body content after removing tables: 269253
Content from 000003408819000017_XOM_10-Q_2019-05-02.html:

Length of the body content: 85840
Length of the body content after removing tables: 41086


In [6]:
# Full text of the 10-K sample (tables kept) with repeated newlines squeezed
file_path = os.path.join(save_directory, test_file_2)

with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, 'html.parser')
body = soup.body

# Collapse every run of newlines in the body text into a single newline
body_text = body.get_text()
cleaned_body_text = re.compile(r'\n+').sub('\n', body_text)

# Report the size, print the whole text, then close with a separator and a blank line
body_length = len(cleaned_body_text)
print(f"Length of the body content: {body_length}")
print(cleaned_body_text)
print("="*117, end="\n\n")

Length of the body content: 415662

 
 
2018  
  
UNITED STATES
SECURITIES AND
EXCHANGE COMMISSION 
WASHINGTON, D.C. 20549
 
 
 
FORM 10-K 
 ☑     ANNUAL REPORT PURSUANT TO SECTION
13 OR 15(d) OF 
THE SECURITIES EXCHANGE ACT OF 1934 
For
the fiscal year ended December 31, 2018 
or 
 ☐     TRANSITION REPORT PURSUANT TO
SECTION 13 OR 15(d) OF 
THE SECURITIES EXCHANGE ACT OF 1934 
For the transition period
from
              
to
              
Commission File Number 1-2256 
EXXON MOBIL CORPORATION 
(Exact name of
registrant as specified in its charter) 
 
 
 
NEW JERSEY
13-5409005
(State or other
  jurisdiction of
incorporation or
  organization)
(I.R.S. Employer
Identification Number)
5959 LAS COLINAS
BOULEVARD, IRVING, TEXAS 75039-2298 
(Address of principal
executive offices) (Zip Code) 
(972) 940-6000 
(Registrant’s telephone
number, including area code) 
 
 
 
Securities registered
pursuant to Section 12(b) of the Act: 
 
 
 
Title of Each
   Class
Name of Each Exchange
on Which Regi

In [7]:
# Same 10-K sample with tables removed, printed in its own output box so it
# can be compared against the unmodified text to spot leftover content
file_path = os.path.join(save_directory, test_file_2)

with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, 'html.parser')
body = soup.body

# Drop every <table> element from the parsed body
tables = body.find_all('table')
for table_tag in tables:
    table_tag.decompose()

# Collapse every run of newlines in the remaining text into a single newline
body_text = body.get_text()
cleaned_body_text = re.compile(r'\n+').sub('\n', body_text)

# Report the size of the reduced text and print it in full
body_length = len(cleaned_body_text)
print(f"Length of the body content after removing tables: {body_length}")
print(cleaned_body_text)

Length of the body content after removing tables: 268315

 
 
2018  
UNITED STATES
SECURITIES AND
EXCHANGE COMMISSION 
WASHINGTON, D.C. 20549
FORM 10-K 
 ☑     ANNUAL REPORT PURSUANT TO SECTION
13 OR 15(d) OF 
THE SECURITIES EXCHANGE ACT OF 1934 
For
the fiscal year ended December 31, 2018 
or 
 ☐     TRANSITION REPORT PURSUANT TO
SECTION 13 OR 15(d) OF 
THE SECURITIES EXCHANGE ACT OF 1934 
For the transition period
from
              
to
              
Commission File Number 1-2256 
EXXON MOBIL CORPORATION 
(Exact name of
registrant as specified in its charter) 
 
5959 LAS COLINAS
BOULEVARD, IRVING, TEXAS 75039-2298 
(Address of principal
executive offices) (Zip Code) 
(972) 940-6000 
(Registrant’s telephone
number, including area code) 
Securities registered
pursuant to Section 12(b) of the Act: 
 
Indicate by check mark if the registrant is
a well-known seasoned issuer, as defined in Rule 405 of the Securities
Act.    Yes    ☑     No   ☐ 
Indicate by check mark if the registrant is


### Clean HTML files and load them into a DataFrame

In [8]:
# Initialize variables
start_time = time.time()
save_directory = os.path.join("..", "SEC_data")
# os.listdir gives no ordering guarantee; sorting keeps the processing order
# (and therefore the row order of df) the same on every run
file_list = sorted(os.listdir(save_directory))
data = []
skipped_files = []  # (file name, reason) for every entry that produced no row
file_count = 0


def _clean_filing_html(html_content):
    """Return the plain text of the <body> of ``html_content`` with tables removed.

    Every <table> element is dropped, the remaining text is extracted, and runs
    of consecutive newlines are collapsed to a single newline. Returns None when
    the parsed document has no <body> tag.
    """
    body = BeautifulSoup(html_content, 'html.parser').body
    if body is None:
        return None
    for table in body.find_all('table'):
        table.decompose()
    return re.sub(r'\n+', '\n', body.get_text())


# Loop through the HTML files in the SEC_data directory and add text to a df
for file_name in file_list:
    file_path = os.path.join(save_directory, file_name)

    # File names are expected to look like
    # <AccessionNumber>_<Ticker>_<Form>_<Date>.html. Anything else
    # (sub-directories, hidden files, other extensions) is recorded and skipped
    # instead of crashing on open() or on the 4-way unpacking below.
    stem, extension = os.path.splitext(file_name)
    name_parts = stem.split('_')
    if extension != '.html' or len(name_parts) != 4 or not os.path.isfile(file_path):
        skipped_files.append((file_name, 'unexpected file name'))
        continue
    # Parsed before the file is read so that ticker is always bound, and
    # current, when the progress message below uses it
    access_number, ticker, form, date = name_parts

    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Convert HTML to plain text, remove tables and extra new lines
    cleaned_body_text = _clean_filing_html(html_content)
    if cleaned_body_text is None:
        skipped_files.append((file_name, 'no <body> tag'))
    else:
        # Add the content to the data list
        data.append({
            'AccessionNumber': access_number,
            'Ticker': ticker,
            'Date': date,
            'Form': form,
            'Text': cleaned_body_text
        })

    # Provide updates every 10 documents
    file_count += 1
    if file_count % 10 == 0:
        elapsed_time = time.time() - start_time
        print(f"Processed {file_count} documents. Most recent ticker: {ticker}. Elapsed time: {elapsed_time:.2f} seconds.")

# Surface anything that did not make it into the DataFrame
if skipped_files:
    print(f"Skipped {len(skipped_files)} file(s) that produced no row:")
    for skipped_name, reason in skipped_files:
        print(f"  {skipped_name}: {reason}")

# Create a DataFrame from the data list; naming the columns keeps the schema
# intact even if no file produced a row
df = pd.DataFrame(data, columns=['AccessionNumber', 'Ticker', 'Date', 'Form', 'Text'])
display(df.head())

Processed 10 documents. Most recent ticker: XOM. Elapsed time: 15.13 seconds.
Processed 20 documents. Most recent ticker: XOM. Elapsed time: 28.66 seconds.
Processed 30 documents. Most recent ticker: XOM. Elapsed time: 34.76 seconds.
Processed 40 documents. Most recent ticker: XOM. Elapsed time: 40.81 seconds.
Processed 50 documents. Most recent ticker: XOM. Elapsed time: 43.08 seconds.
Processed 60 documents. Most recent ticker: XOM. Elapsed time: 49.76 seconds.
Processed 70 documents. Most recent ticker: XOM. Elapsed time: 53.33 seconds.
Processed 80 documents. Most recent ticker: XOM. Elapsed time: 59.93 seconds.
Processed 90 documents. Most recent ticker: XOM. Elapsed time: 63.46 seconds.
Processed 100 documents. Most recent ticker: XOM. Elapsed time: 70.26 seconds.
Processed 110 documents. Most recent ticker: CVX. Elapsed time: 90.85 seconds.
Processed 120 documents. Most recent ticker: CVX. Elapsed time: 103.69 seconds.
Processed 130 documents. Most recent ticker: CVX. Elapsed ti

Unnamed: 0,AccessionNumber,Ticker,Date,Form,Text
0,3408819000002,XOM,2019-02-01,8-K,\n \n \n \n \nUNITED STATES\nSECURITIES AND EX...
1,3408819000007,XOM,2019-02-26,8-K,\n \n \n \n \nUNITED STATES\nSECURITIES AND EX...
2,3408819000010,XOM,2019-02-27,10-K,\n \n \n2018 \nUNITED STATES\nSECURITIES AND\...
3,3408819000013,XOM,2019-04-10,8-K,\n \n \n \n \nUNITED STATES\nSECURITIES AND EX...
4,3408819000015,XOM,2019-04-26,8-K,\n \n \n \n \nUNITED STATES\nSECURITIES AND EX...


### Merge cleaned DataFrame with metadata and export to CSV

In [9]:
# Add metadata from log.
#
# df['AccessionNumber'] comes from the file names and is a zero-padded string,
# whereas the dtype of log['AccessionNumber'] is whatever read_csv inferred and
# may be an integer without the leading zeros. Joining on a canonical key (the
# number as text, leading zeros stripped) keeps the merge from raising on a
# dtype mismatch or from silently matching nothing.
def _accession_key(series):
    """Return ``series`` as strings with leading zeros removed, for use as a join key."""
    return series.astype(str).str.lstrip('0')


log_urls = (
    log[['AccessionNumber', 'URL']]
    .assign(_accession_key=lambda frame: _accession_key(frame['AccessionNumber']))
    .drop(columns='AccessionNumber')
    # One URL per filing: a repeated log entry must not duplicate rows of df
    .drop_duplicates(subset='_accession_key')
)

merged_df = (
    df.assign(_accession_key=_accession_key(df['AccessionNumber']))
    .merge(log_urls, how='left', on='_accession_key')
    .drop(columns='_accession_key')  # AccessionNumber itself is kept exactly as in df
)
display(merged_df.head())

Unnamed: 0,AccessionNumber,Ticker,Date,Form,Text,URL
0,3408819000002,XOM,2019-02-01,8-K,\n \n \n \n \nUNITED STATES\nSECURITIES AND EX...,https://www.sec.gov/Archives/edgar/data/000003...
1,3408819000007,XOM,2019-02-26,8-K,\n \n \n \n \nUNITED STATES\nSECURITIES AND EX...,https://www.sec.gov/Archives/edgar/data/000003...
2,3408819000010,XOM,2019-02-27,10-K,\n \n \n2018 \nUNITED STATES\nSECURITIES AND\...,https://www.sec.gov/Archives/edgar/data/000003...
3,3408819000013,XOM,2019-04-10,8-K,\n \n \n \n \nUNITED STATES\nSECURITIES AND EX...,https://www.sec.gov/Archives/edgar/data/000003...
4,3408819000015,XOM,2019-04-26,8-K,\n \n \n \n \nUNITED STATES\nSECURITIES AND EX...,https://www.sec.gov/Archives/edgar/data/000003...


In [10]:
# Parse the filing dates, then order the rows by ticker and, within a ticker, by date
merged_df = (
    merged_df
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(by=['Ticker', 'Date'])
)
display(merged_df.head())

Unnamed: 0,AccessionNumber,Ticker,Date,Form,Text,URL
522,114036119010687,BP,2019-06-07,11-K,\nSECURITIES AND EXCHANGE COMMISSION\nWashingt...,https://www.sec.gov/Archives/edgar/data/000031...
256,31380720000008,BP,2020-06-23,11-K,UNITED STATESSECURITIES AND EXCHANGE COMMISSIO...,https://www.sec.gov/Archives/edgar/data/000031...
257,31380721000009,BP,2021-06-11,11-K,UNITED STATESSECURITIES AND EXCHANGE COMMISSIO...,https://www.sec.gov/Archives/edgar/data/000031...
258,31380722000025,BP,2022-06-10,11-K,UNITED STATESSECURITIES AND EXCHANGE COMMISSIO...,https://www.sec.gov/Archives/edgar/data/000031...
259,31380723000020,BP,2023-06-13,11-K,UNITED STATESSECURITIES AND EXCHANGE COMMISSIO...,https://www.sec.gov/Archives/edgar/data/000031...


In [11]:
# Write merged_df to a CSV file one directory up, leaving the row index out
output_path = os.path.join("..", "SEC_filings.csv")
merged_df.to_csv(path_or_buf=output_path, index=False)