In [1]:
# Preliminary imports and ENV variable definitions
import csv
import os
import yfinance as yf

from dotenv import load_dotenv
from pprint import pprint

# Load environment variables through python-dotenv before anything reads them.
load_dotenv()

# Folder holding every downloaded dataset file. The trailing slash matters:
# paths are built elsewhere in this notebook by plain string concatenation.
FILE_PATH = r"./dataset/"


# CIK values (10-character, zero-padded strings) that are matched against the
# "CIK" column of the SUBMISSION tables.
CIK_IDENTIFIERS = [
    "0001720792",
    "0001099281",
    "0001079114",
    "0001112520",
    "0001641864",
    "0000846222",
    "0001709323",
    "0000732905",
    "0000883965",
    "0001067983",
    "0001061768",
]

Downloading the necessary files; there is a lot of Python magic here.

In [2]:
import requests
import zipfile
from io import BytesIO
from datetime import datetime

# Empty out the dataset directory so every run starts from fresh downloads.
# README* files are kept; sub-directories are skipped because os.remove()
# raises on them.
os.makedirs(FILE_PATH, exist_ok=True)  # the folder may not exist on a fresh checkout
files = [
    filename
    for filename in os.listdir(FILE_PATH)
    if not filename.startswith("README") and os.path.isfile(FILE_PATH + filename)
]
for file in files:
    os.remove(FILE_PATH + file)

month = datetime.now().month
quarter = 4 if int(month/4) == 0 else int(month/4)
print(quarter)
year = datetime.now().year

fmonth = month-1
fyear = year if fmonth != 0 else year-1
if fmonth == 0:
    fmonth = '12'
elif fmonth < 10:
    fmonth = '0' + str(fmonth)



# Source URLs. The fails-to-deliver archive is the "a" file of the previous
# calendar month (fyear / fmonth).
fails_deliver_url = f'https://www.sec.gov/files/data/fails-deliver-data/cnsfails{fyear}{fmonth}a.zip'
url_high_div_etf = 'https://www.blackrock.com/us/individual/products/239563/ishares-high-dividend-etf/1464253357814.ajax?fileType=csv&fileName=HDV_holdings&dataType=fund'
url_core_div_etf = 'https://www.ishares.com/us/products/291387/fund/1467271812596.ajax?fileType=csv&fileName=DIVB_holdings&dataType=fund'
# Request headers sent with every www.sec.gov download in this notebook.
headers = {
    'Host': 'www.sec.gov', 'Connection': 'close',
    'Accept': 'application/json, text/javascript, */*; q=0.01', 'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
}


def _download_holdings(url, filename, skip_lines=9):
    """Download a holdings CSV into FILE_PATH and strip its leading lines.

    Args:
        url: Address of the CSV export.
        filename: Name of the file to create inside FILE_PATH.
        skip_lines: Number of lines dropped from the top of the file so that
            the first remaining line can serve as the CSV header.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never saved as if it were holdings data.
    """
    response = requests.get(url, allow_redirects=True)
    response.raise_for_status()
    target = FILE_PATH + filename

    # Save the raw bytes first, then rewrite the file as text without the preamble.
    with open(target, 'wb+') as f:
        f.write(response.content)

    with open(target, 'r', encoding='utf-8-sig') as fp:
        lines = fp.readlines()

    with open(target, 'w', encoding='utf-8-sig') as fp:
        fp.writelines(lines[skip_lines:])


_download_holdings(url_high_div_etf, 'HDV_holdings.csv')
_download_holdings(url_core_div_etf, 'DIVB_holdings.csv')

# Fails-to-deliver data: fail with an explicit message when the response is
# not an archive, instead of an unexplained BadZipFile.
r = requests.get(fails_deliver_url, headers=headers, allow_redirects=True)
r.raise_for_status()
try:
    with zipfile.ZipFile(BytesIO(r.content)) as z:
        z.extract(f'cnsfails{fyear}{fmonth}a', FILE_PATH)
except zipfile.BadZipFile as exc:
    raise zipfile.BadZipFile(
        f'{fails_deliver_url} did not return a zip archive '
        f'(HTTP {r.status_code}, {len(r.content)} bytes)'
    ) from exc


QUARTERS_WANTED = 3      # how many quarterly 13F data sets to collect
MAX_QUARTERS_TRIED = 12  # hard stop so a run where every download fails still terminates

if quarter == 4: # q4 comes out in the new year so most recent data will be in last year for the 13f data.
    year -= 1

downloaded_quarters = 0
for attempt in range(MAX_QUARTERS_TRIED):
    if downloaded_quarters == QUARTERS_WANTED:
        break

    # Every attempt after the first steps one quarter back in time.
    if attempt > 0:
        quarter -= 1
        if quarter <= 0:
            quarter = 4
            year -= 1

    url = f'https://www.sec.gov/files/structureddata/data/form-13f-data-sets/{year}q{quarter}_form13f.zip'
    r = requests.get(url, headers=headers, allow_redirects=True)
    try:
        with zipfile.ZipFile(BytesIO(r.content)) as z:
            for zipinfo in z.infolist():
                # Renaming the member before extraction makes the file land in
                # FILE_PATH under a name that carries its year and quarter.
                if "INFOTABLE" in zipinfo.filename:
                    zipinfo.filename = f'INFOTABLE_{year}_q{quarter}.tsv'
                    z.extract(zipinfo, FILE_PATH)
                elif "SUBMISSION" in zipinfo.filename:
                    zipinfo.filename = f'SUBMISSION_{year}_q{quarter}.tsv'
                    z.extract(zipinfo, FILE_PATH)
    except zipfile.BadZipFile:
        # The response was not an archive; fall through to the next older quarter.
        print("13f of specific quarter not present")
        continue
    downloaded_quarters += 1

if downloaded_quarters < QUARTERS_WANTED:
    print(
        f"Only {downloaded_quarters} of {QUARTERS_WANTED} 13F data sets "
        f"downloaded after trying {MAX_QUARTERS_TRIED} quarters"
    )
        

4


BadZipFile: File is not a zip file

From the SUBMISSION table fetch a list of ACCESSION_NUMBER(s) using the CIK identifiers in table A-1 (Appendix).




In [None]:
# Accession numbers of every submission filed under one of the tracked CIKs.
picked_submissions = []

submission_files = [name for name in os.listdir(FILE_PATH) if name.startswith("SUBMISSION")]
print(submission_files)

for submission_file in submission_files:
    with open(FILE_PATH + submission_file, 'r', encoding='utf-8') as handle:
        # Tab-separated table; keep the rows whose CIK is on the watch list.
        picked_submissions.extend(
            row["ACCESSION_NUMBER"]
            for row in csv.DictReader(handle, delimiter="\t")
            if row["CIK"] in CIK_IDENTIFIERS
        )

pprint(len(picked_submissions))


From the INFOTABLE, fetch the list of NAMEOFISSUER(s) using the ACCESSION_NUMBER(s) collected in (b). Use the CUSIP(s) to map between brokers, since they are unique even where names differ slightly.

In [None]:
# Despite its name, this set holds upper-cased CUSIPs: one per holding reported
# in any of the picked submissions.
names_of_issuers = set()

# Each INFOTABLE row is checked against the picked submissions, so use a set:
# membership stays constant-time instead of scanning the list for every row.
picked_accession_numbers = set(picked_submissions)

prefixed = [filename for filename in os.listdir(FILE_PATH) if filename.startswith("INFOTABLE")]
print(prefixed)

for file in prefixed:
    with open(FILE_PATH + file, 'r', encoding='utf-8') as q:
        for entry in csv.DictReader(q, delimiter="\t"):
            if entry["ACCESSION_NUMBER"] in picked_accession_numbers:
                names_of_issuers.add(entry["CUSIP"].upper())

Now we need to convert the CUSIPs to tickers. We will do this using the polygon API to fetch info about a holding by its CUSIP ID.

Simply download the two most recent files from https://www.sec.gov/data/foiadocsfailsdatahtm and store them in the dataset folder.

In this step we lose about 12% of the dataset. It is unclear whether there is a better way to resolve this.

In [None]:
# Ticker symbols resolved from the collected CUSIPs.
tickers = set()

cns_files = [name for name in os.listdir(FILE_PATH) if name.startswith("cnsfail")]
print(cns_files)

for cns_file in cns_files:
    with open(FILE_PATH + cns_file, 'r') as handle:
        # Pipe-delimited table with (at least) CUSIP and SYMBOL columns.
        for row in csv.DictReader(handle, delimiter="|"):
            cusip = row['CUSIP']
            if cusip not in names_of_issuers:
                continue
            tickers.add(row['SYMBOL'])
            # NOTE: this consumes names_of_issuers in place. Each CUSIP is
            # resolved once, and whatever remains afterwards is unresolved.
            names_of_issuers.remove(cusip)

pprint(tickers)

Adding the rest of the ticker symbols to the set from the other datasets.

Before anything else, the leading metadata rows of the CSV files {DIVB_holdings, HDV_holdings} have to be removed because they break parsing with DictReader; the download cell above already drops the first 9 lines of each file.

In [None]:
# Add the "Ticker" column of every downloaded *holdings* CSV to the ticker set.
prefixed = [filename for filename in os.listdir(FILE_PATH) if "holdings" in filename]
print(prefixed)

for file in prefixed:
    with open(FILE_PATH + file, 'r', encoding='utf-8-sig') as f:
        for entry in csv.DictReader(f, delimiter=","):
            # Short rows give None and padding rows give whitespace; neither
            # is a usable symbol, so only non-empty values are kept.
            symbol = (entry["Ticker"] or "").strip()
            if symbol:
                tickers.add(symbol)

pprint(tickers)

From set A, remove all tickers that do not offer dividends

In [None]:
from requests import HTTPError


# Subset B: tickers from set A that report a dividend.
arr_A = list(tickers)

ticker_objs = list(yf.Tickers(arr_A).tickers.values())
arr_B = []
for ticker in ticker_objs:
    try:
        info = ticker.info  # read once and reuse for both lookups
        # Require an actual non-zero rate: the key being present with a
        # None / 0 value does not mean the company pays a dividend.
        if info.get('dividendRate'):
            arr_B.append(info["symbol"])
    except HTTPError:
        print("Ticker not found, removed from subset.")
        continue

pprint(arr_B)

From subset (b), remove all names that carry a high business risk, i.e. a debt-to-equity ratio greater than 1.5, to obtain sub-subset (c).

In [None]:
# Subset C: tickers from subset B whose debt-to-equity ratio is at most 1.5.
ticker_objs = list(yf.Tickers(arr_B).tickers.values())


arr_C = []

for ticker in ticker_objs:
    try:
        balance_sheet = list(ticker.balancesheet.to_dict().values())[0] # get most recent data
        liabilities = balance_sheet['Total Liabilities Net Minority Interest']
        assets = balance_sheet['Total Assets']
        # Equity is assets minus liabilities. abs() means a negative-equity
        # company is judged by the magnitude of its ratio.
        debtToEquity = abs( liabilities / (assets - liabilities) )
    except ZeroDivisionError:
        print(ticker.info["symbol"]) # if this is close to 0 then equity to debt ratio is near inf 
        continue                     # So we skip it.
    except (KeyError, IndexError):
        # KeyError: a required line item is missing. IndexError: the balance
        # sheet has no columns at all. Either way there is no ratio to test,
        # so skip rather than reuse the previous ticker's value.
        print('Missing Balance Sheet Info')
        print(ticker.info["symbol"])
        continue
    if debtToEquity <= 1.5:
        arr_C.append(ticker.info["symbol"])

print(len(arr_C))
pprint(arr_C)
    



Store subset into a file for the cfs module to reference.

In [None]:
# Persist subset C, one symbol per line with no trailing newline.
subset_text = '\n'.join(arr_C)
with open(r'../cfs_module/subset_c.txt', 'w') as subset_file:
    subset_file.write(subset_text)