In [1]:
# Using the company facts API, we can get a ton of data about many companies
# But tags aren't always the same across companies, and can vary a lot
# Let's analyze this

In [2]:
import pandas as pd
import os
import scraper
import json

In [3]:
# Pick the most recent scrape directory under data/.
# os.listdir() returns names in arbitrary order, so sort them before taking the last one,
# and ignore stray non-directory entries (e.g. OS metadata files).
# NOTE(review): assumes scrape directory names sort chronologically (e.g. ISO dates) — confirm.
scrape_dirs = sorted(
    name for name in os.listdir("data")
    if os.path.isdir(os.path.join("data", name))
)
data_dir = f"data/{scrape_dirs[-1]}"

In [4]:
# Load the companies dictionary from the project-local scraper module.
# NOTE(review): presumably keyed by CIK string — confirm against scraper.get_companies_dict.
comp_dict = scraper.get_companies_dict()

In [5]:
# ------------ Analyzing GAAP tags ----------------

In [6]:
# First, let's check what types of taxonomies / fact categories there are.
# For each taxonomy we count the number of companies whose companyfacts.json contains it.

taxonomies_dict = {}

for cik in os.listdir(data_dir):
    facts_path = os.path.join(data_dir, cik, "companyfacts.json")

    # Skip entries for which no company facts file was scraped.
    if not os.path.exists(facts_path):
        continue

    with open(facts_path, "r") as file:
        content = file.read()

    # Skip empty (or whitespace-only) files, which are not valid JSON.
    if not content.strip():
        continue

    facts = json.loads(content)
    # The keys of facts["facts"] are the taxonomy names used by this company.
    for taxon in facts["facts"]:
        taxonomies_dict[taxon] = taxonomies_dict.get(taxon, 0) + 1

In [7]:
# Display the taxonomy -> number-of-companies mapping.
taxonomies_dict

{'dei': 454, 'us-gaap': 415, 'invest': 130, 'srt': 162, 'ifrs-full': 90}

In [8]:
# dei taxonomy: "Document and Entity Information". Usually only contains 1-2 things like number of shares.
# us-gaap taxonomy: the standard, probably contains the most data
# invest taxonomy: Seems to barely have anything.
# srt taxonomy: SEC Reporting Taxonomy. Seems to barely have anything.
# ifrs-full taxonomy: Another kind of taxonomy. Seems big, but not very popular.

In [None]:
# I will use us-gaap and dei, and discard the others.
# Now let's look at tag frequency: for each tag, how many companies report it.
# Tags are prefixed with their taxonomy ("dei:" or "us-gaap:") to keep them distinct.

tags_dict = {}
num_comps = 0

for cik in os.listdir(data_dir):
    facts_path = os.path.join(data_dir, cik, "companyfacts.json")

    # Skip entries for which no company facts file was scraped.
    if not os.path.exists(facts_path):
        continue

    with open(facts_path, "r") as file:
        content = file.read()

    # Skip empty (or whitespace-only) files, which are not valid JSON.
    if not content.strip():
        continue

    taxonomies = json.loads(content)["facts"]

    # Only companies that report BOTH taxonomies are counted.
    if "us-gaap" not in taxonomies or "dei" not in taxonomies:
        continue

    num_comps += 1
    for taxon in ("us-gaap", "dei"):
        for tag in taxonomies[taxon]:
            tag_name = taxon + ":" + tag
            tags_dict[tag_name] = tags_dict.get(tag_name, 0) + 1

0000009631
0000009631
0000311337
0000311337
0000811809
0000811809
0000835403
0000835403
0000839923
0000839923
0000842180
0000842180
0000879764
0000879764
0000891478
0000891478
0001002242
0001002242
0001039765
0001039765
0001116578
0001116578
0001140625
0001140625
0001160106
0001160106
0001303523
0001303523
0001475260
0001475260
0001697862
0001697862
0001776985
0001776985


In [13]:
# Report how many companies were counted, then list the (tag, count) pairs
# ordered by decreasing number of occurrences.
print(f"Number of companies checked is {num_comps}")
sorted(tags_dict.items(), key=lambda pair: -pair[1])

Number of companies checked is 409


[('us-gaap:Assets', 392),
 ('us-gaap:NetCashProvidedByUsedInOperatingActivities', 392),
 ('us-gaap:LiabilitiesAndStockholdersEquity', 391),
 ('us-gaap:NetCashProvidedByUsedInFinancingActivities', 389),
 ('us-gaap:NetCashProvidedByUsedInInvestingActivities', 388),
 ('us-gaap:IncomeTaxExpenseBenefit', 387),
 ('us-gaap:NetIncomeLoss', 386),
 ('dei:EntityCommonStockSharesOutstanding', 383),
 ('us-gaap:EarningsPerShareBasic', 382),
 ('us-gaap:WeightedAverageNumberOfSharesOutstandingBasic', 382),
 ('us-gaap:CashAndCashEquivalentsAtCarryingValue', 381),
 ('us-gaap:StockholdersEquity', 381),
 ('us-gaap:AccumulatedOtherComprehensiveIncomeLossNetOfTax', 380),
 ('us-gaap:ComprehensiveIncomeNetOfTax', 379),
 ('us-gaap:EarningsPerShareDiluted', 377),
 ('us-gaap:RetainedEarningsAccumulatedDeficit', 376),
 ('us-gaap:WeightedAverageNumberOfDilutedSharesOutstanding', 376),
 ('dei:EntityPublicFloat', 371),
 ('us-gaap:Goodwill', 369),
 ('us-gaap:PropertyPlantAndEquipmentNet', 366),
 ('us-gaap:OperatingLe

In [None]:
# Observations:
    # The main issue is how inconsistently tags are used across companies
    # e.g. some companies, instead of reporting us-gaap:<tag> directly, may report several
    # us-gaap:<subtag> values which add up to <tag>
    # Similarly, some companies don't have the data in us-gaap but do have it in ifrs-full

In [None]:
# Perhaps decide on a subset of things you want to calculate
# e.g. revenue, p/e, p/e/g, profit, etc.

In [None]:
# Now the real question is: how should I proceed?
# TODO: First things first: I have to clean the data
    # Check if there are some tags that should or can be classed as other tags
    # Could you check if some tags tend to have similar values? Or have some similar relationship?