In [1]:
# Preliminary imports and ENV variable definitions
import csv
import os
import yfinance as yf

from dotenv import load_dotenv
from pprint import pprint

# Load variables from a local .env file into the process environment (python-dotenv).
load_dotenv()
# Directory that holds the downloaded dataset files. The trailing slash matters:
# later cells build paths by plain string concatenation (FILE_PATH + filename).
FILE_PATH = r"./dataset/" 


# CIK identifiers, kept as zero-padded strings, that are compared against the
# "CIK" column of the SUBMISSION files to pick the filings of interest.
# Per the notebook text these come from table A-1 (Appendix).
CIK_IDENTIFIERS = [
    '0001720792',
    '0001099281',
    '0001079114',
    '0001112520',
    '0001641864',
    '0000846222',
    '0001709323',
    '0000732905',
    '0000883965',
    '0001067983',
    '0001061768',
]

From the SUBMISSION table fetch a list of ACCESSION_NUMBER(s) using the CIK identifiers in table A-1 (Appendix).




In [12]:
# Collect the ACCESSION_NUMBER of every SUBMISSION row whose CIK is one of
# the identifiers listed in CIK_IDENTIFIERS.
picked_submissions = []

submission_files = [name for name in os.listdir(FILE_PATH) if name.startswith("SUBMISSION")]
print(submission_files)

for name in submission_files:
    with open(FILE_PATH + name, 'r', encoding='utf-8') as handle:
        # The SUBMISSION files are tab-separated with a header row.
        rows = csv.DictReader(handle, delimiter="\t")
        picked_submissions.extend(
            row["ACCESSION_NUMBER"]
            for row in rows
            if row["CIK"] in CIK_IDENTIFIERS
        )

# Number of matching filings across all SUBMISSION files.
pprint(len(picked_submissions))


['SUBMISSION_2023_q1.tsv', 'SUBMISSION_2023_q2.tsv', 'SUBMISSION_2023_q3.tsv']
35


From the INFOTABLE fetch a list of NAMEOFISSUER(s) using the ACCESSION_NUMBER(s) created in (b). Use CUSIP(s) to map between brokers, since they are unique where names differ slightly.

In [13]:
# Collect the CUSIP of every INFOTABLE holding that belongs to one of the
# picked submissions. Despite its name, `names_of_issuers` holds upper-cased
# CUSIPs, not issuer names; the name is kept because later cells refer to it.
names_of_issuers = set()

# Build the lookup set once: INFOTABLE files are scanned row by row, and a set
# gives constant-time membership tests instead of scanning a list per row.
accession_lookup = set(picked_submissions)

prefixed = [filename for filename in os.listdir(FILE_PATH) if filename.startswith("INFOTABLE")]
print(prefixed)

for file in prefixed:
    with open(FILE_PATH + file, 'r', encoding='utf-8') as q:
        # The INFOTABLE files are tab-separated with a header row.
        for entry in csv.DictReader(q, delimiter="\t"):
            if entry["ACCESSION_NUMBER"] in accession_lookup:
                # Normalise case so the same CUSIP is stored only once.
                names_of_issuers.add(entry["CUSIP"].upper())

['INFOTABLE_2023_q1.tsv', 'INFOTABLE_2023_q2.tsv', 'INFOTABLE_2023_q3.tsv']


KeyboardInterrupt: 

Now we need to convert the CUSIPs to tickers. We will do this using the Polygon API to fetch info about a holding by its CUSIP ID.

Simply download the two most recent files from https://www.sec.gov/data/foiadocsfailsdatahtm and store them in the dataset folder.

In this step we lose about 12% of the dataset. It is unclear whether there is a better way to resolve this.

In [9]:
# Translate the collected CUSIPs into ticker symbols using the "cnsfail*"
# files in the dataset folder (pipe-separated, with CUSIP and SYMBOL columns).
tickers = set()

prefixed = [filename for filename in os.listdir(FILE_PATH) if filename.startswith("cnsfail")]
print(prefixed)

# CUSIPs that already produced a symbol. Tracking them here, instead of removing
# them from `names_of_issuers`, keeps this cell idempotent: re-running it no
# longer finds an emptied input set and yields an empty `tickers`.
resolved_cusips = set()

for file in prefixed:
    with open(FILE_PATH + file,'r') as f:
        for entry in csv.DictReader(f, delimiter="|"):
            cusip = entry['CUSIP']
            if cusip is None:
                # Rows with fewer fields than the header have no CUSIP value.
                continue
            # `names_of_issuers` stores upper-cased CUSIPs, so compare like with like.
            cusip = cusip.upper()
            # Only the first symbol seen for a CUSIP is kept.
            if cusip in names_of_issuers and cusip not in resolved_cusips:
                tickers.add(entry['SYMBOL'])
                resolved_cusips.add(cusip)

# CUSIPs for which no symbol was found in any of the files.
unresolved_cusips = names_of_issuers - resolved_cusips

pprint(tickers)

['cnsfails202310b', 'cnsfails202311a']
set()


Adding the rest of the ticker symbols to the set from the other datasets.

Before anything else, delete the first 10 rows of the CSV files {DIVB_holdings, HDV_holdings}, as they break the parsing done by DictReader.

In [5]:
# Add the tickers listed in every "*holdings*" CSV export to the `tickers`
# set built by the earlier cell.
prefixed = [filename for filename in os.listdir(FILE_PATH) if "holdings" in filename]
print(prefixed)

for file in prefixed:
    # Open the file for this iteration. Hard-coding 'DIVB_holdings.csv' here
    # would read that one file once per match and never read the others.
    # 'utf-8-sig' drops a leading byte-order mark so the first header parses cleanly.
    with open(FILE_PATH + file, 'r', encoding='utf-8-sig') as f:
        for entry in csv.DictReader(f, delimiter=","):
            tickers.add(entry["Ticker"])

pprint(tickers)

['DIVB_holdings.csv', 'HDV_holdings.csv']
{'A',
 'AAP',
 'AAPL',
 'ABBV',
 'ACIW',
 'ACN',
 'ACT',
 'ADBE',
 'ADI',
 'ADM',
 'ADP',
 'ADT',
 'ADV',
 'AEE',
 'AEO',
 'AEP',
 'AES',
 'AFL',
 'AGNC',
 'AIG',
 'AIRC',
 'AIZ',
 'ALG',
 'ALL',
 'ALLY',
 'ALRM',
 'ALSN',
 'ALV',
 'AM',
 'AMCR',
 'AME',
 'AMGN',
 'AMH',
 'AMP',
 'AMT',
 'AMZN',
 'ANET',
 'ANGI',
 'AON',
 'AOS',
 'APA',
 'APAM',
 'APD',
 'APLE',
 'APO',
 'ARCH',
 'ARES',
 'ARW',
 'ASAI',
 'ASB',
 'ASH',
 'ATAI',
 'ATHM',
 'ATRA',
 'ATUS',
 'ATVI',
 'AUB',
 'AVB',
 'AVGO',
 'AVNT',
 'AVT',
 'AXP',
 'AXS',
 'AXTA',
 'AZO',
 'BABA',
 'BAC',
 'BAM',
 'BAX',
 'BC',
 'BEN',
 'BG',
 'BHF',
 'BIDU',
 'BIL',
 'BK',
 'BKH',
 'BKR',
 'BKSYWS',
 'BLDR',
 'BLK',
 'BLMN',
 'BMY',
 'BN',
 'BODYWS',
 'BOKF',
 'BOOT',
 'BPOP',
 'BRKB',
 'BRX',
 'BX',
 'BXMT',
 'BXP',
 'C',
 'CABO',
 'CACC',
 'CADE',
 'CAG',
 'CAH',
 'CALM',
 'CAT',
 'CATY',
 'CB',
 'CBRE',
 'CBRL',
 'CBSH',
 'CC',
 'CCF',
 'CCI',
 'CCK',
 'CCOI',
 'CCRD',
 'CDP',
 'CE',
 'CEIX'

From set A, remove all tickers that do not offer dividends

In [6]:
from requests import HTTPError

# Subset (b): keep only the tickers whose info record has a 'dividendRate' entry.
arr_A = list(tickers)

# `yf.Tickers(...).tickers` maps each requested symbol to its Ticker object.
ticker_map = yf.Tickers(arr_A).tickers
ticker_objs = list(ticker_map.values())

arr_B = []
for requested_symbol, ticker in ticker_map.items():
    try:
        # Read `.info` once and reuse it for both the check and the symbol lookup.
        info = ticker.info
        if 'dividendRate' in info:
            arr_B.append(info["symbol"])
    except HTTPError:
        # Name the symbol so the log shows which tickers were dropped.
        print(f"Ticker {requested_symbol} not found, removed from subset.")

pprint(arr_B)

Ticker not found, removed from subset.
Ticker not found, removed from subset.
Ticker not found, removed from subset.
Ticker not found, removed from subset.
Ticker not found, removed from subset.
Ticker not found, removed from subset.
Ticker not found, removed from subset.
Ticker not found, removed from subset.
Ticker not found, removed from subset.
Ticker not found, removed from subset.
['IDA',
 'HWC',
 'MUR',
 'IBM',
 'CAT',
 'AMCR',
 'ADI',
 'SPGI',
 'GPK',
 'USB',
 'SITC',
 'GEN',
 'APO',
 'DG',
 'CNS',
 'MDLZ',
 'PXD',
 'ORCL',
 'AIRC',
 'MMM',
 'PG',
 'GLW',
 'CBRL',
 'OGS',
 'CUZ',
 'UNP',
 'SWKS',
 'HI',
 'EWBC',
 'CINF',
 'HIG',
 'DPZ',
 'PRU',
 'THO',
 'KW',
 'MAS',
 'IBOC',
 'KIM',
 'DTE',
 'KNX',
 'PNM',
 'PSX',
 'CNP',
 'BN',
 'GILD',
 'HII',
 'OHI',
 'RMR',
 'MSFT',
 'PEP',
 'DUK',
 'FNB',
 'X',
 'T',
 'OUT',
 'A',
 'HBB',
 'PPL',
 'PCH',
 'LIN',
 'CWEN',
 'BC',
 'VTS',
 'CAH',
 'FBIN',
 'J',
 'ARCH',
 'WD',
 'DOC',
 'FTI',
 'ABBV',
 'RYN',
 'SCCO',
 'OZK',
 'GD',
 'VRSK',

From subset (b), remove all names that have a high business risk, i.e. a debt-to-equity ratio greater than 1.5. The result is sub-subset (c).

In [7]:
# Subset (c): keep only the tickers whose debt-to-equity ratio is at most
# MAX_DEBT_TO_EQUITY. Here "debt" is total liabilities and "equity" is
# total assets minus total liabilities.
MAX_DEBT_TO_EQUITY = 1.5

ticker_objs = list(yf.Tickers(arr_B).tickers.values())


arr_C = []

for ticker in ticker_objs:
    statements = list(ticker.balancesheet.to_dict().values())
    if not statements:
        # No balance-sheet data came back: the ratio cannot be computed, so skip.
        print(ticker.info["symbol"])
        continue
    balance_sheet = statements[0] # get most recent data (assumes newest column is first)
    liabilities = balance_sheet['Total Liabilities Net Minority Interest']
    assets = balance_sheet['Total Assets']
    equity = assets - liabilities
    # Skip zero, negative and NaN equity. Taking abs() of the ratio would let a
    # company with negative equity pass whenever liabilities >= 3 * assets,
    # which is the opposite of low risk. Zero equity would divide by zero.
    if not equity > 0:
        print(ticker.info["symbol"])
        continue
    debtToEquity = liabilities / equity
    if debtToEquity <= MAX_DEBT_TO_EQUITY:
        arr_C.append(ticker.info["symbol"])

print(len(arr_C))
pprint(arr_C)
    



AEP
CVX
PFG
FAF
MCO
208
['MUR',
 'ADI',
 'SPGI',
 'SITC',
 'CNS',
 'PXD',
 'GLW',
 'CUZ',
 'SWKS',
 'DPZ',
 'THO',
 'KIM',
 'KNX',
 'PSX',
 'OHI',
 'RMR',
 'MSFT',
 'X',
 'A',
 'PCH',
 'LIN',
 'VTS',
 'J',
 'ARCH',
 'WD',
 'DOC',
 'RYN',
 'SCCO',
 'KHC',
 'MRK',
 'GOLF',
 'DVN',
 'PKG',
 'APD',
 'EOG',
 'NHI',
 'JHG',
 'OSK',
 'VMC',
 'ATHM',
 'MRO',
 'WOR',
 'HLNE',
 'STC',
 'LHX',
 'ESS',
 'TMO',
 'PEAK',
 'IVZ',
 'VLO',
 'KOF',
 'MSM',
 'QCOM',
 'FAST',
 'DGX',
 'INTC',
 'ASH',
 'INGR',
 'TAP',
 'LH',
 'PK',
 'ESNT',
 'HST',
 'CDP',
 'XOM',
 'CIVI',
 'REG',
 'ACT',
 'INVH',
 'WSO',
 'WY',
 'JNPR',
 'PARA',
 'DHR',
 'FIS',
 'PHM',
 'TXN',
 'BG',
 'ADM',
 'FOXA',
 'NVS',
 'HIW',
 'MGY',
 'ROIC',
 'LPX',
 'SUI',
 'FTV',
 'SBRA',
 'PINC',
 'COP',
 'FHI',
 'SHOO',
 'RHI',
 'MDC',
 'CTO',
 'CEIX',
 'TGNA',
 'VTR',
 'DBRG',
 'JCI',
 'HLN',
 'PSA',
 'CRK',
 'APLE',
 'KDP',
 'PARAA',
 'SNA',
 'CF',
 'EQR',
 'KRC',
 'OWL',
 'EPD',
 'CSL',
 'AMH',
 'MOS',
 'HRL',
 'SJM',
 'CNXC',
 'WPC',
 'PRG

Store subset into a file for the cfs module to reference.

In [8]:
# Persist subset (c) for the cfs module: one ticker per line, with no
# trailing newline after the last entry.
subset_text = '\n'.join(arr_C)
with open(r'../cfs_module/subset_c.txt', 'w') as out_file:
    out_file.write(subset_text)