In [1]:
import pandas as pd
from dotenv import load_dotenv
import os
import wrds

load_dotenv()  # take environment variables from .env.

True

In [2]:
# Switch for the WRDS index cell below: when True the filings index is read from the
# CSV saved under ../Data; set it to False to query WRDS again and rewrite that CSV.
previously_downloaded_index = True

## Download SEC filings list from WRDS

In [3]:
# Build the filings index: either query WRDS and cache the result as a CSV,
# or reload the CSV written by an earlier run.
if not previously_downloaded_index:
    # uuid is only needed when the index is rebuilt, so it is imported in this branch.
    import uuid

    # WRDS credentials are read from environment variables.
    wrds_username = os.getenv('WRDS_USERNAME')
    wrds_password = os.getenv('WRDS_PASSWORD')
    db_wrds = wrds.Connection(wrds_username=wrds_username, wrds_password=wrds_password)

    query = """
        SELECT *
        FROM wrdssec.forms
    """

    # Execute the SQL query and load the result into a pandas DataFrame
    wrds_filings = db_wrds.raw_sql(query)

    # The url is the EDGAR archive prefix plus everything after the first "/" of wrdsfname.
    wrds_filings["url"] = (
        "https://www.sec.gov/Archives/edgar/data/"
        + wrds_filings["wrdsfname"].str.split("/", n=1, expand=True)[1]
    )

    # One random identifier per filing; uuid4().hex is a 32-character hex string.
    id_list = [uuid.uuid4().hex for _ in range(len(wrds_filings))]

    # add the id column to the DataFrame
    wrds_filings['id'] = id_list

    # index=False keeps the row index out of the file. Writing it would add an
    # extra "Unnamed: 0" column every time the CSV is read back and saved again.
    wrds_filings.to_csv('../Data/WRDS_SEC_1994_2022.csv', index=False)

else:
    wrds_filings = pd.read_csv('../Data/WRDS_SEC_1994_2022.csv')

In [4]:
wrds_filings

Unnamed: 0.1,Unnamed: 0,fdate,cik,findexdate,lindexdate,form,coname,wrdsfname,fsize,doccount,...,regcity_hdq,regstate_hdq,regzip_hdq,regstate_inc,regphone,regfconame,regfchangedate,regirs,url,id
0,0,20000330,1952,20000330,20000331,10-K,IREX CORP,000000/1952/0000001952-00-000002.txt,148544.0,2.0,...,LANCASTER,PA,17603,PA,7173973633,AC&S CORP,19840409.0,,https://www.sec.gov/Archives/edgar/data/1952/0...,5c5b09b4b4a3402fa2c14558bd3485bb
1,1,19960401,1952,19960401,19960630,10-K,IREX CORP,000000/1952/0000001952-96-000002.txt,125164.0,2.0,...,LANCASTER,PA,17603,PA,7173973633,AC&S CORP,19840409.0,,https://www.sec.gov/Archives/edgar/data/1952/0...,c1dde5dedef040778afc00427100721f
2,2,19970331,1952,19970331,19970331,10-K,IREX CORP,000000/1952/0000001952-97-000003.txt,118062.0,2.0,...,LANCASTER,PA,17603,PA,7173973633,AC&S CORP,19840409.0,,https://www.sec.gov/Archives/edgar/data/1952/0...,b7e515c5bd65414fb00dca9a45fc8546
3,3,19980401,1952,19980401,19980630,10-K,IREX CORP,000000/1952/0000001952-98-000002.txt,134526.0,2.0,...,LANCASTER,PA,17603,PA,7173973633,AC&S CORP,19840409.0,,https://www.sec.gov/Archives/edgar/data/1952/0...,24c243e0c19f4cfc916ad469b1753ac6
4,4,19990401,1952,19990401,19990630,10-K,IREX CORP,000000/1952/0000001952-99-000006.txt,146123.0,2.0,...,LANCASTER,PA,17603,PA,7173973633,AC&S CORP,19840409.0,,https://www.sec.gov/Archives/edgar/data/1952/0...,24b033f8b53e4b8892ca0d83a56c601f
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279794,279794,20080303,923603,20080311,20080331,10-K,FELCOR LODGING TRUST INC,000092/923603/9999999997-08-010237.txt,2820.0,1.0,...,IRVING,TX,75062,MD,9724444900,FELCOR LODGING TRUST INC,19980810.0,,https://www.sec.gov/Archives/edgar/data/923603...,32fc49be5a42410a983df16d55d0fc9d
279795,279795,20080303,923603,20080311,20080331,10-K,FELCOR LODGING TRUST INC,000092/923603/9999999997-08-010237.txt,2820.0,1.0,...,IRVING,TX,75062,MD,9724444900,FELCOR LODGING TRUST INC,19980810.0,,https://www.sec.gov/Archives/edgar/data/923603...,f86709239d624fe7b2b3961ff1a82b1d
279796,279796,20120720,730000,20120720,20120930,10-K,SUPERTEX INC,000073/730000/9999999997-12-011895.txt,1259.0,1.0,...,SUNNYVALE,CA,94089,CA,4087440100,,,,https://www.sec.gov/Archives/edgar/data/730000...,c19dee39f3f54f28a7c5ed46c1a49633
279797,279797,20120720,885074,20120720,20120720,10-K,AUTHENTIDATE HOLDING CORP,000088/885074/9999999997-12-011898.txt,1493.0,1.0,...,BERKELEY HEIGHT,NJ,07922,DE,9087871700,BITWISE DESIGNS INC,19930328.0,,https://www.sec.gov/Archives/edgar/data/885074...,f219f66400064ce9b4422a947ccc186c


## Download Filings

In [5]:
from sec_api import ExtractorApi
# Read the sec-api key from the SEC_API environment variable (os.getenv returns None
# when it is not set) and create the extractor client used by get_item().
sec_api_key = os.getenv('SEC_API')
extractorApi = ExtractorApi(sec_api_key)

In [6]:
# extract one item section of a filing
def get_item(url, item):
    """Request section ``item`` of the filing at ``url`` from the extractor API.

    Args:
        url: Address of the filing, passed through to ``extractorApi.get_section``.
        item: Identifier of the section to extract (the caller in this notebook
            passes ``"1"``).

    Returns:
        Whatever ``extractorApi.get_section(url, item, "html")`` returns
        (presumably the section's HTML as a string -- confirm against the
        sec-api documentation), or a fixed "SEC_API ERROR" message string when
        the call raises.
    """
    try:
        return extractorApi.get_section(url, item, "html")
    except Exception:
        # Deliberately best-effort: a failed request must not abort a long batch.
        # Only Exception is caught so that KeyboardInterrupt / SystemExit still
        # stop the download loop when the user interrupts the kernel.
        return "SEC_API ERROR -- An error occurred while trying to get the requested section."


# Write an already-encoded section to disk
def save_file(file_path, section_html):
    """Write ``section_html`` to ``file_path`` in binary mode.

    Args:
        file_path: Destination path; an existing file is overwritten.
        section_html: Bytes-like content to write (the file is opened with "wb").

    Returns:
        None.
    """
    # The context manager closes the handle when the block exits.
    with open(file_path, mode="wb") as handle:
        handle.write(section_html)
        
# fetch Item 1 for a single row of the filings index and store it on disk
def save_item1(filings_index_row):
    """Download section "1" of one filing and save it under its id.

    Args:
        filings_index_row: One row of the filings index; its 'url' and 'id'
            entries are read (the notebook passes rows via DataFrame.apply).

    Returns:
        None. The section text returned by ``get_item`` is UTF-8 encoded and
        written to ``../Data/Item1_wrds_html/<id>``; the id is also printed.
        Note that ``get_item`` returns an error message string on failure, and
        that string is written to the file in the same way.
    """
    filing_url = filings_index_row['url']
    file_id = filings_index_row['id']

    item1_html = get_item(filing_url, "1")
    target_path = os.path.join("../Data/Item1_wrds_html", file_id)

    # progress trace: one id per processed filing
    print(filings_index_row.id)
    save_file(target_path, item1_html.encode('utf-8'))

In [7]:
%%capture
# Run save_item1 once per row of the filings index (axis=1 hands each row to the function).
# The %%capture magic at the top of this cell captures everything printed here.
wrds_filings.apply(save_item1, axis=1)
print('Finished downloading the filings')

# Parse HTML to Text

In [1]:
from bs4 import BeautifulSoup

In [2]:

# get a list of all files in the folder, leaving out the '.DS_Store' entry.
# The entry is filtered out instead of calling list.remove(), which raises
# ValueError when '.DS_Store' is not present in the folder.
files_list = [
    entry
    for entry in os.listdir("../Data/Item1_wrds_html")
    if entry != '.DS_Store'
]


In [5]:

# Strip table, img, header, footer and comment elements from parsed html
def clean_soup(soup):
    """Remove unwanted elements from ``soup`` and return the remaining text.

    Args:
        soup: Parsed document; the notebook passes a BeautifulSoup object. It is
            called with a tag name to find the matching elements, and each
            match is removed with ``decompose()``, so ``soup`` is modified.

    Returns:
        The result of ``soup.get_text()`` after the removals.
    """
    # Tag names are handled one after another, in this order.
    # NOTE(review): soup("comment") selects elements by tag name, i.e. <comment>
    # tags; it does not match <!-- ... --> comment nodes -- confirm the intent.
    unwanted_tags = ("table", "img", "header", "footer", "comment")

    for tag_name in unwanted_tags:
        for element in soup(tag_name):
            element.decompose()

    # return the text
    return soup.get_text()

def clean_text(text):
    """Flatten ``text`` onto a single line.

    Every newline character is replaced by a '.', then each run of whitespace is
    collapsed to one space and leading/trailing whitespace is dropped.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    dotted = text.replace('\n', '.')
    words = dotted.split()
    return ' '.join(words)


# Convert every downloaded Item 1 html file into cleaned plain text.
html_dir = "../Data/Item1_wrds_html"
text_dir = "../Data/Item1_wrds_html_text"

# Create the output folder up front so the first write cannot fail when it is missing.
os.makedirs(text_dir, exist_ok=True)

for file in files_list:

    # get the file name
    file_name = file
    file_path = os.path.join(html_dir, file_name)

    # read the html file; it was saved as UTF-8 encoded bytes, so decode it as
    # UTF-8 explicitly instead of relying on the platform's default encoding
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # create a BeautifulSoup object
    soup = BeautifulSoup(content, "html.parser")

    # clean the soup
    text = clean_soup(soup)

    # clean the text
    text = clean_text(text)

    # save the text as "<file name>.txt" in the text output folder, again as UTF-8
    with open(os.path.join(text_dir, file_name + ".txt"), "w", encoding="utf-8") as f:
        f.write(text)



  soup = BeautifulSoup(content, "html.parser")
