# SEC Data Collection

In [1]:
from pathlib import Path

# Zero-padded, 10-character CIK string used to build the data file name below.
KMB_CIK = "0000055785"

# Relative location of the JSON file for this CIK: data/CIK<cik>.json
kmb_fp = Path("data").joinpath(f"CIK{KMB_CIK}.json")
kmb_fp

WindowsPath('data/CIK0000055785.json')

In [2]:
import json

# Load the JSON document for the configured CIK into a plain Python dict.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) because
# open() otherwise falls back to the platform locale encoding, which on
# Windows is typically a legacy code page and can mis-decode or reject
# non-ASCII characters in the file.
with open(kmb_fp, encoding="utf-8") as f:
    raw_data = json.load(f)

# Displayed to inspect the nesting of the loaded document.
raw_data

{'cik': 55785,
 'entityName': 'KIMBERLY-CLARK CORPORATION',
 'facts': {'dei': {'EntityCommonStockSharesOutstanding': {'label': 'Entity Common Stock, Shares Outstanding',
    'description': "Indicate number of shares or other units outstanding of each of registrant's classes of capital or common stock or other ownership interests, if and as stated on cover of related periodic report. Where multiple classes or units exist define each class/interest by adding class of stock items such as Common Class A [Member], Common Class B [Member] or Partnership Interest [Member] onto the Instrument [Domain] of the Entity Listings, Instrument.",
    'units': {'shares': [{'end': '2009-07-31',
       'val': 414492692,
       'accn': '0000055785-09-000026',
       'fy': 2009,
       'fp': 'Q2',
       'form': '10-Q',
       'filed': '2009-08-07',
       'frame': 'CY2009Q2I'},
      {'end': '2009-10-30',
       'val': 415379458,
       'accn': '0000055785-09-000039',
       'fy': 2009,
       'fp': 'Q3',

In [3]:
from collections import defaultdict
# Drill down to the list of share-count events stored under the "dei" facts.
dei_facts = raw_data["facts"]["dei"]
outstanding_shares = dei_facts["EntityCommonStockSharesOutstanding"]["units"]["shares"]

# Column-oriented view of the events: each output column is filled from one
# key of the event dict, in the same order for every event.
parsed = defaultdict(list)
for event in outstanding_shares:
    for column, source_key in (
        ("date", "end"),
        ("outstanding_shares", "val"),
        ("form", "form"),
    ):
        parsed[column].append(event[source_key])

In [4]:
def _unit_table(feat, events):
    """Turn one unit's list of event dicts into a column-oriented table.

    Args:
        feat: Feature name; used as the name of the value column.
        events: Iterable of dicts, each providing "end", "val", "form"
            and "filed".

    Returns:
        dict mapping "period_end", ``feat``, "form" and "filing_date" to
        equally long lists. All four keys are present even when ``events``
        is empty, so the table always has its columns.

    Raises:
        KeyError: An event lacks one of the required keys.
        TypeError: ``events`` or one of its items does not have the
            expected container type.
    """
    events = list(events)
    return {
        "period_end": [event["end"] for event in events],
        feat: [event["val"] for event in events],
        "form": [event["form"] for event in events],
        "filing_date": [event["filed"] for event in events],
    }


def _collect_taxonomy(facts, taxonomy):
    """Build one table per (feature, unit) pair found under ``facts[taxonomy]``.

    A feature is handled atomically: if any of its units cannot be parsed,
    none of its tables are kept and the feature name is reported in the
    second return value instead.

    Args:
        facts: Mapping of taxonomy name -> feature name -> feature record.
        taxonomy: Key of ``facts`` to process (e.g. "us-gaap" or "dei").

    Returns:
        (tables, skipped): list of tables as produced by ``_unit_table`` and
        list of feature names whose record did not have the expected layout.
    """
    tables, skipped = [], []
    for feat, feat_info in facts[taxonomy].items():
        try:
            feat_tables = [
                _unit_table(feat, events)
                for events in feat_info["units"].values()
            ]
        except (KeyError, TypeError, AttributeError) as exc:
            # Only layout problems are tolerated; anything else should surface.
            print(f"{taxonomy}/{feat}: unexpected format ({exc!r})")
            skipped.append(feat)
        else:
            tables.extend(feat_tables)
    return tables, skipped


# simple_data: one table per (feature, unit); diff_format: features that
# could not be parsed. Both taxonomies go through the same code path.
simple_data = []
diff_format = []
for taxonomy in ("us-gaap", "dei"):
    tables, skipped = _collect_taxonomy(raw_data["facts"], taxonomy)
    simple_data.extend(tables)
    diff_format.extend(skipped)

len(simple_data), len(diff_format)

(613, 0)

In [5]:
import pandas as pd
# Columns that identify an observation; every other column holds a feature value.
ID_COLUMNS = ["period_end", "filing_date", "form"]

# 1. One DataFrame per collected table. The name is rebound (as before) so
#    that simple_data keeps holding DataFrames after this cell has run.
simple_data = [pd.DataFrame(d) for d in simple_data]

# Reshape every non-empty table to long form (id columns + 'feat' + 'value')
# so that all tables share one schema and can be stacked. Empty tables carry
# no observations and are skipped rather than passed on to melt/concat.
processed_dfs = [
    df.melt(id_vars=ID_COLUMNS, var_name="feat", value_name="value")
    for df in simple_data
    if not df.empty
]

# 2. Concatenate vertically into a single long table.
long_df = pd.concat(processed_dfs, ignore_index=True)

# 3. Pivot to wide format: one row per id-column combination, one column per feature.
# NOTE(review): rows sharing the same (period_end, filing_date, form, feat)
# are collapsed with the mean, which is pivot_table's default and is spelled
# out here. If several distinct values can exist for one key (for example one
# table per unit of the same feature), they are averaged - confirm this is
# the intended treatment.
final_df = long_df.pivot_table(
    index=ID_COLUMNS,
    columns="feat",
    values="value",
    aggfunc="mean",
).reset_index()
print(final_df.shape)
final_df.head(3)

(448, 605)


feat,period_end,filing_date,form,AccountsPayableCurrent,AccountsPayableTradeCurrent,AccountsReceivableNetCurrent,AccruedAdvertisingCurrent,AccruedIncomeTaxesCurrent,AccruedIncomeTaxesNoncurrent,AccruedLiabilitiesCurrent,...,UnrecordedUnconditionalPurchaseObligationDueAfterFiveYears,ValuationAllowanceDeferredTaxAssetChangeInAmount,ValuationAllowancesAndReservesBalance,ValuationAllowancesAndReservesChargedToCostAndExpense,ValuationAllowancesAndReservesDeductions,ValuationAllowancesAndReservesPeriodIncreaseDecrease,VariableLeaseCost,WeightedAverageNumberOfDilutedSharesOutstanding,WeightedAverageNumberOfSharesIssuedBasic,WeightedAverageNumberOfSharesOutstandingBasic
0,2006-12-31,2010-02-24,10-K,,,,,,,,...,,,,,,,,,,
1,2007-12-31,2009-08-07,10-Q,,,,,,,,...,,,,,,,,,,
2,2007-12-31,2009-11-06,10-Q/A,,,,,,,,...,,,,,,,,,,


In [6]:
# Persist the wide table. DataFrame.to_csv does not create missing parent
# directories, so make sure the output folder exists before writing.
out_fp = Path("cleaned") / "KMB.csv"
out_fp.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(out_fp, index=False)

In [7]:
# Coverage of the wide table: share and absolute number of non-NaN cells.
total_cells = final_df.size
missing_cells = final_df.isna().sum().sum()

print(f"{(1 - (missing_cells / total_cells)) * 100:.2f}% useable datapoints")
print(total_cells - missing_cells, "datapoints available")

8.17% useable datapoints
22136 datapoints available
