Downloading SEC data from EDGAR
The filing index is built with *python-edgar*; afterwards the required SEC filings are downloaded.

In [1]:
pip install python-edgar

Collecting python-edgar
  Downloading python_edgar-3.1.3-py3-none-any.whl (8.6 kB)
Installing collected packages: python-edgar
Successfully installed python-edgar-3.1.3


In [2]:
#pip install python-edgar

import csv
import json
import logging
import os.path
import pathlib
import time
import urllib.request

import edgar
import pandas

# Set the root logger to INFO so that INFO-level records are not filtered out.
logger = logging.getLogger()
logger.setLevel(level=logging.INFO)

In [3]:
# parameters

# --- EDGAR access ---
# Prefix that is put in front of the relative `url_txt` paths taken from the index files.
edgar_prefix_url = "https://www.sec.gov/Archives/"
# Passed to the index download and sent as the User-agent header of every file request.
user_agent = "mathias.marxer@ost.ch"
# Passed to the index download as its `since_year` argument.
since_year = 2007

# --- filing selection ---
# Form types that are kept when the index is filtered.
sec_types = ['10-Q', '10-K']

# CIKs to download. When left empty, the CIKs are derived from the historical data files instead.
sec_ciks = []
# Top 100 companies by marketcap
# based on https://companiesmarketcap.com/usa/largest-companies-in-the-usa-by-market-cap/ (as of 15.11.2022)
# Additional: Washington Prime Group Inc., Christopher & Banks Corp
# NOTE: 764180 appears twice in the list below.
# sec_ciks = [320193, 789019, 1652044, 1018724, 1067983, 1318605, 731766, 34088, 200406, 1403161,
#             1045810, 19617, 104169, 93410, 59478, 80424, 1141391, 354950, 70858, 1326801,
#             78003, 1551152, 317540, 310158, 77476, 909832, 1341439, 97745, 1730168, 63908,
#             313616, 858877, 1283699, 72971, 1800, 1744489, 1163165, 320187, 14272, 753308,
#             97476, 732712, 1108524, 796343, 1090727, 318154, 895421, 1166691, 764180, 773840,
#             87634, 101829, 732717, 804328, 100885, 1065280, 886982, 51143, 64803, 60667,
#             50863, 18230, 315189, 936468, 2488, 1156039, 4962, 64040, 896878, 829224,
#             1364742, 882095, 1045609, 8670, 12927, 1633917, 1053507, 831001, 1739940, 40545,
#             6951, 1035267, 1103982, 821189, 109198, 6281, 310764, 62709, 1373715, 872589,
#             27419, 764180, 1001250, 875320, 1075531, 87347, 1133421, 1326160, 1393818, 66740,
#             1594686, 883943]

# --- local storage ---
# Directory that holds the historical data, the ticker lookup and the downloaded filings.
data_directory = "/content/drive/MyDrive/Histdata/"
# Directory that receives the downloaded index files.
index_directory = "sec-index"
# Column names applied to the header-less index files when they are read.
index_columns = ['CIK', 'Company', 'type', 'date', 'url_txt', 'url_html']
# When True, filings that already exist locally are not downloaded again; the same flag is
# handed to the index download as `skip_all_present_except_last`.
skip_present_files = True

In [4]:
# Mount Google Drive into the Colab runtime; `data_directory` points below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
ls "/content/drive/MyDrive/Histdata/"

company_tickers.json  files/  hist_weekly_data1.csv  hist_weekly_data2.csv


# Helper functions

In [6]:
def load_ciks_from_historical_data(directory=None):
    """Resolve the company tickers of the historical data files to CIKs.

    The tickers are taken from the first row of ``hist_weekly_data1.csv`` and
    ``hist_weekly_data2.csv`` (semicolon separated, empty cells ignored) and are
    looked up in ``company_tickers.json``, whose values are objects with a
    ``ticker`` and a ``cik_str`` entry.

    Args:
        directory: Directory containing the three files. Defaults to the
            notebook-level ``data_directory``.

    Returns:
        A list with the ``cik_str`` value of every ticker that could be
        resolved. The order is unspecified because the tickers are collected
        in a set. Tickers without a match are left out and reported through a
        warning.
    """
    if directory is None:
        directory = data_directory

    # read first row (company tickers) of historical data
    company_tickers = set()
    for file_name in ("hist_weekly_data1.csv", "hist_weekly_data2.csv"):
        with open(file=os.path.join(directory, file_name), newline='') as data_file:
            reader = csv.reader(data_file, delimiter=';')
            # an empty file simply contributes no tickers instead of raising StopIteration
            company_tickers.update(filter(None, next(reader, [])))

    # import cik lookup
    with open(file=os.path.join(directory, "company_tickers.json")) as cik_file:
        cik_lookup = json.load(cik_file)

    # index the lookup by ticker once; setdefault keeps the first entry of a ticker,
    # which is the entry a linear scan with an early break would have picked
    cik_by_ticker = {}
    for entry in cik_lookup.values():
        cik_by_ticker.setdefault(entry["ticker"], entry["cik_str"])

    # lookup cik
    ciks = [cik_by_ticker[ticker] for ticker in company_tickers if ticker in cik_by_ticker]

    if len(company_tickers) != len(ciks):
        logging.warning(
            f'Not all tickers could be found as CIK. Company tickers={len(company_tickers)}, CIKs={len(ciks)}')
        missing_tickers = sorted(company_tickers - cik_by_ticker.keys())
        logging.warning('Tickers without CIK: %s', ', '.join(missing_tickers))

    return ciks

def get_millis():
    """Return the current time of ``time.time()`` as a whole number of milliseconds."""
    seconds_now = time.time()
    return round(seconds_now * 1000)

In [7]:
# Fetch the EDGAR index files into `index_directory`, starting with `since_year`.
edgar.download_index(
    dest=index_directory,
    since_year=since_year,
    user_agent=user_agent,
    skip_all_present_except_last=skip_present_files,
)

INFO:root:68 index files to retrieve
INFO:root:> downloaded https://www.sec.gov/Archives/edgar/full-index/2023/QTR4/master.zip to sec-index/2023-QTR4.tsv
INFO:root:> downloaded https://www.sec.gov/Archives/edgar/full-index/2023/QTR3/master.zip to sec-index/2023-QTR3.tsv
INFO:root:> downloaded https://www.sec.gov/Archives/edgar/full-index/2023/QTR2/master.zip to sec-index/2023-QTR2.tsv
INFO:root:> downloaded https://www.sec.gov/Archives/edgar/full-index/2023/QTR1/master.zip to sec-index/2023-QTR1.tsv
INFO:root:> downloaded https://www.sec.gov/Archives/edgar/full-index/2022/QTR4/master.zip to sec-index/2022-QTR4.tsv
INFO:root:> downloaded https://www.sec.gov/Archives/edgar/full-index/2022/QTR3/master.zip to sec-index/2022-QTR3.tsv
INFO:root:> downloaded https://www.sec.gov/Archives/edgar/full-index/2022/QTR2/master.zip to sec-index/2022-QTR2.tsv
INFO:root:> downloaded https://www.sec.gov/Archives/edgar/full-index/2022/QTR1/master.zip to sec-index/2022-QTR1.tsv
INFO:root:> downloaded http

# Build the filing list and download the filings

In [None]:
# if no ciks are specified, load them from the available historical data
if not sec_ciks:
    sec_ciks = load_ciks_from_historical_data()

# create list of sec files to download
data_frames = []
for file_name in sorted(os.listdir(index_directory)):
    # only parse the downloaded index files; other directory entries would break read_csv
    if not file_name.endswith('.tsv'):
        continue

    data_frame = pandas.read_csv(filepath_or_buffer=os.path.join(index_directory, file_name), sep='|', header=None,
                                 names=index_columns)
    logging.debug(f'DataFrame shape: {data_frame.shape}')

    # filter dataframes with specified CIKs and form types
    filtered_df = data_frame[data_frame['CIK'].isin(sec_ciks) & data_frame['type'].isin(sec_types)]

    data_frames.append(filtered_df)

if data_frames:
    sec_index_df = pandas.concat(data_frames)
else:
    # pandas.concat raises on an empty list; fall back to an empty index so later cells still run
    logging.warning('No index files found in %s', index_directory)
    sec_index_df = pandas.DataFrame(columns=index_columns)

logging.debug(f'Filtered DataFrame shape: {sec_index_df.shape}')
logging.info('%d filings selected for download', len(sec_index_df))
sec_index_df.head()

abc




In [None]:
# download sec files
files_directory = os.path.join(data_directory, "files")
min_request_interval_ms = 200  # minimum time between the start of two requests

# customize request headers with user-agent
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', user_agent)]
urllib.request.install_opener(opener)

for row in sec_index_df.itertuples():
    url = edgar_prefix_url + row.url_txt
    directory = os.path.join(files_directory, str(row.CIK))
    # a '/' in the form type would otherwise be read as a path separator
    form_type = str(row.type).replace('/', '-')
    file = os.path.join(directory, f'{row.date}_{form_type}.txt')
    if skip_present_files and os.path.exists(file):
        logging.debug("Skipping download, file already exists: %s", file)
        continue

    # create cik directory, if needed
    pathlib.Path(directory).mkdir(parents=True, exist_ok=True)

    # Download into a temporary file and move it into place only on success. Otherwise an
    # interrupted download leaves a truncated file that the "already exists" check above
    # would treat as complete on the next run.
    partial_file = file + ".part"
    start = get_millis()
    try:
        urllib.request.urlretrieve(url=url, filename=partial_file)
        os.replace(partial_file, file)
    finally:
        # only still present when the download or the move failed
        if os.path.exists(partial_file):
            os.remove(partial_file)
    elapsed = get_millis() - start
    logging.info("downloaded %s to %s in %dms", url, file, elapsed)

    if elapsed < min_request_interval_ms:
        sleep_for = min_request_interval_ms - elapsed
        logging.info("sleeping for %dms because we are going too fast (previous request took %dms)", sleep_for, elapsed)
        time.sleep(sleep_for / 1000)