In [1]:
# Using the company facts API, we can get a ton of data about many companies
# But tags aren't always the same across companies, and can vary a lot
# Let's analyze this

In [2]:
import pandas as pd
import os
import scraper
import json

In [3]:
# Pick the scrape directory to analyze.
# os.listdir() returns entries in arbitrary order, so sort the names explicitly
# instead of relying on whatever happens to come last.
# NOTE(review): assumes scrape folder names sort chronologically (e.g. date-stamped) — confirm.
scrape_dirs = sorted(
    name for name in os.listdir("data")
    if os.path.isdir(os.path.join("data", name))  # ignore stray files such as .DS_Store
)
if not scrape_dirs:
    raise FileNotFoundError("No scrape directories found under 'data'")
data_dir = f"data/{scrape_dirs[-1]}"  # most recent scrape

In [4]:
# Load the company lookup from the local scraper module
# (presumably keyed by CIK — TODO confirm against scraper.get_companies_dict).
comp_dict = scraper.get_companies_dict()

In [5]:
# ------------ Analyzing GAAP tags ----------------

In [6]:
# First, let's check what types of taxonomies / fact categories there are.
# We will put them in a dictionary to count their occurrences: for each taxonomy,
# the number of companyfacts.json files in which it appears.

taxonomies_dict = {}

for cik in os.listdir(data_dir):
    facts_path = f"{data_dir}/{cik}/companyfacts.json"

    # Skip entries that have no facts file.
    if not os.path.exists(facts_path):
        continue

    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding; the file handle is released before parsing.
    with open(facts_path, "r", encoding="utf-8") as file:
        content = file.read()

    # Skip empty or whitespace-only files.
    if not content.strip():
        continue

    facts = json.loads(content)
    for taxon in facts["facts"]:
        taxonomies_dict[taxon] = taxonomies_dict.get(taxon, 0) + 1

In [7]:
# Show, for each taxonomy, the number of company files that contain it.
taxonomies_dict

{'dei': 454, 'us-gaap': 415, 'invest': 130, 'srt': 162, 'ifrs-full': 90}

In [8]:
# dei taxonomy: "Document and Entity Information". Usually only contains 1-2 things like number of shares.
# us-gaap taxonomy: the standard, probably contains the most data
# invest taxonomy: Seems to barely have anything.
# srt taxonomy: SEC Reporting Taxonomy. Seems to barely have anything.
# ifrs-full taxonomy: Another kind of taxonomy. Seems big, but not very popular.

In [9]:
# Now let's look at tag frequency.
# Each tag is prefixed with its taxonomy (e.g. dei:<tag>, us-gaap:<tag>) so that
# identically named tags from different taxonomies are counted separately.

tags_dict = {}   # "<taxonomy>:<tag>" -> number of company files containing it
num_comps = 0    # number of non-empty companyfacts.json files processed

for cik in os.listdir(data_dir):
    facts_path = f"{data_dir}/{cik}/companyfacts.json"

    # Skip entries that have no facts file.
    if not os.path.exists(facts_path):
        continue

    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding; the file handle is released before parsing.
    with open(facts_path, "r", encoding="utf-8") as file:
        content = file.read()

    # Skip empty or whitespace-only files.
    if not content.strip():
        continue

    facts = json.loads(content)

    num_comps += 1
    for taxon, tags in facts["facts"].items():
        for tag in tags:
            tag_name = taxon + ":" + tag
            tags_dict[tag_name] = tags_dict.get(tag_name, 0) + 1

In [10]:
# Report how many company files were scanned, then list every tag with its
# count, ordered from most to least frequent (ties keep insertion order).
print(f"Number of companies checked is {num_comps}")
sorted(tags_dict.items(), key=lambda item: -item[1])

Number of companies checked is 470


[('dei:EntityCommonStockSharesOutstanding', 427),
 ('us-gaap:Assets', 397),
 ('us-gaap:NetCashProvidedByUsedInOperatingActivities', 397),
 ('us-gaap:LiabilitiesAndStockholdersEquity', 396),
 ('us-gaap:NetCashProvidedByUsedInFinancingActivities', 394),
 ('us-gaap:NetCashProvidedByUsedInInvestingActivities', 393),
 ('us-gaap:IncomeTaxExpenseBenefit', 392),
 ('us-gaap:NetIncomeLoss', 391),
 ('us-gaap:EarningsPerShareBasic', 387),
 ('us-gaap:WeightedAverageNumberOfSharesOutstandingBasic', 387),
 ('us-gaap:CashAndCashEquivalentsAtCarryingValue', 386),
 ('us-gaap:StockholdersEquity', 386),
 ('us-gaap:AccumulatedOtherComprehensiveIncomeLossNetOfTax', 385),
 ('us-gaap:ComprehensiveIncomeNetOfTax', 384),
 ('us-gaap:EarningsPerShareDiluted', 382),
 ('us-gaap:RetainedEarningsAccumulatedDeficit', 381),
 ('us-gaap:WeightedAverageNumberOfDilutedSharesOutstanding', 381),
 ('us-gaap:Goodwill', 373),
 ('dei:EntityPublicFloat', 371),
 ('us-gaap:PropertyPlantAndEquipmentNet', 371),
 ('us-gaap:OperatingLe

In [11]:
# Observations:
    # The main issue is how spread out and messed up the tags are
    # e.g. some companies, instead of using us-gaap:<tag>, may use a combination of 
    # us-gaap:<subtag> which add up to <tag>
    # Similarly, some companies don't have the data in us-gaap but do have it in ifrs-full

    # So I need to find ways to clean up the data and make the data more consistent

In [None]:
# TODO:
    # GROUP TAGS! Then find relationships between them:
        # Check if some tags are aliases (e.g. us-gaap:Assets and ifrs-full:Assets)
        # Check if some tags can be calculated from each other / can be reconstructed

In [None]:
# Ideas:
    # Aliases: ifrs-full:Assets and us-gaap:Assets
    # Aliases: dei:EntityCommonStockSharesOutstanding and us-gaap:CommonStockSharesOutstanding
    # Group: us-gaap:NetCashProvidedByUsedIn...
        # Is this related to ifrs-full:CashFlowsFromUsedIn?
    # Group: all the liabilities stuff.
        # us-gaap:LiabilitiesAndStockholdersEquity
        # us-gaap:LiabilitiesCurrent
        # us-gaap:Liabilities
        # Many more us-gaap ones
        # ifrs-full:DeferredTaxLiabilities
        # And some more ifrs ones
    # There's also a ton of tags found under IncomeLoss
    # Just keep grouping until you run out of tags