In [1]:
# Using the company facts API, we can get a ton of data about many companies
# But tags aren't always the same across companies, and can vary a lot
# Let's analyze this

In [None]:
import scraper
import pandas as pd

In [3]:
# Load the collection of companies from the scraper module.
# Entries are indexed by position further down and each exposes get_facts().
companies = scraper.get_companies()

In [4]:
# Sample payload used to explore the structure; the us-gaap tags are read from facts["facts"]["us-gaap"].
facts = companies[0].get_facts() # example set of facts for the first company

In [None]:
# There is a dataframe for the tags and their descriptions (tags_df)
# Then, for each tag, there is one dataframe per unit containing the data
# (usually a tag has only 1 unit, but there can be more)

def _keys_in_every_record(records):
    """Return the keys shared by all records, in the order of the first record.

    records: a list of dict-like records for one unit of one tag.
    Returns a list of keys; empty when there are no records or no shared keys.
    """
    if not records:
        return []
    first = records[0]
    # Intersect explicitly against every later record. Using "the set is empty"
    # as a first-record sentinel would re-seed the set whenever the intersection
    # legitimately became empty, yielding keys that are missing from some records.
    shared = set(first.keys()).intersection(*(rec.keys() for rec in records[1:]))
    # Follow the first record's key order so the column order is deterministic.
    return [key for key in first.keys() if key in shared]


gaap_facts = facts["facts"]["us-gaap"]

tag_info = []
tags_data = {} # tags_data[tag][unit] gives a df for that unit

for gaap_tag, tag_entry in gaap_facts.items():
    tag_info.append([gaap_tag, tag_entry["description"]])

    tags_data[gaap_tag] = {}
    for unit, unit_records in tag_entry["units"].items():
        records = list(unit_records)

        # Only keep keys that are present across all records of this gaap tag
        # (for some reason, the keys aren't always consistent)
        consistent_keys = _keys_in_every_record(records)

        data = [[rec[key] for key in consistent_keys] for rec in records]
        tags_data[gaap_tag][unit] = pd.DataFrame(data, columns=consistent_keys)

tags_df = pd.DataFrame(tag_info, columns=["name", "description"])

In [None]:
# Notes
    # For some CIKs, you will get request errors
        # This is expected: e.g. an S&P 500 trust filer doesn't have a company facts API
    # The consistency of tags varies considerably!
        # Things like "Assets" are super popular, but consistency drops off steadily
        # For about 90% consistency, you get 42 tags
        # I will need to decide on a "level of consistency": which tags do I want?
        # Do I want only companies that have all of those tags? (there will be far fewer)
        # Alt: get tag data individually (though this complicates things considerably)

In [None]:
# Maps each us-gaap tag name to the number of companies whose facts include that tag.
# It lives in its own cell, so re-running the counting loop without re-running this
# cell adds on top of the existing counts.
tags_counter = {}

In [None]:
done_companies = 600

# NOTE(review): the range starts at 0, so this visits companies 0 .. done_companies + 499
# and adds to whatever is already in tags_counter. If the intent was to continue after
# the first `done_companies` companies, this should be
# range(done_companies, done_companies + 500) -- confirm before relying on the counts.
# NOTE(review): `facts` is reassigned here, so after this cell it holds the last
# successfully fetched company's facts rather than those of companies[0].

failed_companies = [] # (index, exception) for every company that could not be counted

for i in range(done_companies + 500):
    try:
        facts = companies[i].get_facts()
        for tag in facts["facts"]["us-gaap"]:
            tags_counter[tag] = tags_counter.get(tag, 0) + 1
    except Exception as exc:
        # Best-effort on purpose: some companies are expected to fail (e.g. filers
        # without company facts), so record the failure and move on. Catching
        # Exception rather than using a bare `except:` still lets KeyboardInterrupt
        # stop a long run, and len(failed_companies) gives the "no facts" count.
        failed_companies.append((i, exc))


In [None]:
# Tags ranked from most to least common (count = number of companies reporting the tag)
# 600 companies, 32 no facts (so only 568)
sorted(tags_counter.items(), key=lambda tag_count: tag_count[1], reverse=True)


[('Assets', 566),
 ('LiabilitiesAndStockholdersEquity', 565),
 ('NetCashProvidedByUsedInOperatingActivities', 565),
 ('NetCashProvidedByUsedInFinancingActivities', 563),
 ('IncomeTaxExpenseBenefit', 562),
 ('NetCashProvidedByUsedInInvestingActivities', 562),
 ('NetIncomeLoss', 559),
 ('EarningsPerShareBasic', 554),
 ('WeightedAverageNumberOfSharesOutstandingBasic', 553),
 ('AccumulatedOtherComprehensiveIncomeLossNetOfTax', 551),
 ('CashAndCashEquivalentsAtCarryingValue', 551),
 ('ComprehensiveIncomeNetOfTax', 551),
 ('EarningsPerShareDiluted', 548),
 ('StockholdersEquity', 547),
 ('WeightedAverageNumberOfDilutedSharesOutstanding', 546),
 ('RetainedEarningsAccumulatedDeficit', 545),
 ('CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents', 535),
 ('OperatingLeaseLiability', 534),
 ('OperatingLeaseRightOfUseAsset', 533),
 ('Goodwill', 530),
 ('PropertyPlantAndEquipmentNet', 530),
 ('CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExcha