In [9]:
import pandas as pd
import numpy as np
from pathlib import Path
import gcsfs

In [10]:
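# Read data from a .txt file into a DataFrame.
# NOTE: the lines below are disabled. pathlib.Path collapses the "//" in
# "gs://...", so the path built this way would not be a valid GCS URL.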
# base_dir = Path("gs://sec-financials-edgar/sec_edgar_financials")
# year = 2009
# quarter = 1
# filepath = base_dir / f"{year}q{quarter}/num.txt"

In [None]:
# Location of the 2009q1 `sub` table and a GCS filesystem handle to reach it.
path = "gs://sec-financials-edgar/sec_edgar_financials/2009q1/sub.txt"
fs = gcsfs.GCSFileSystem()

# Open the object in text mode and parse it as tab-separated values;
# dtype=str keeps every column as text instead of letting pandas infer types.
with fs.open(path, mode="r") as f:
    df = pd.read_csv(f, sep="\t", header=0, dtype=str)

In [17]:
# Preview the first five rows (5 is the default for head()).
df.head()

Unnamed: 0,adsh,cik,name,sic,countryba,stprba,cityba,zipba,bas1,bas2,...,period,fy,fp,filed,accepted,prevrpt,detail,instance,nciks,aciks


In [19]:
import pandas as pd

# Common prefix for the 2010q1 tables.
BASE = "gs://sec-financials-edgar/sec_edgar_financials/2010q1"

# Map each table name to its tab-separated text file under BASE.
paths = {table: f"{BASE}/{table}.txt" for table in ("sub", "num", "tag", "pre")}

In [20]:
# Load the four tables in one pass (order: sub, num, tag, pre). Each file is
# tab-separated; low_memory=False makes pandas read the file in a single pass
# before inferring column dtypes.
sub, num, tag, pre = (
    pd.read_csv(paths[table], sep="\t", low_memory=False)
    for table in ("sub", "num", "tag", "pre")
)

In [21]:
# Report (rows, columns) for each loaded table, one line per table.
print(
    *(
        f"{label}: {frame.shape}"
        for label, frame in (("SUB", sub), ("NUM", num), ("TAG", tag), ("PRE", pre))
    ),
    sep="\n",
)

SUB: (495, 36)
NUM: (194741, 10)
TAG: (9699, 9)
PRE: (64151, 10)


In [23]:
# Column labels of the `sub` table as a plain Python list.
list(sub.columns)

['adsh',
 'cik',
 'name',
 'sic',
 'countryba',
 'stprba',
 'cityba',
 'zipba',
 'bas1',
 'bas2',
 'baph',
 'countryma',
 'stprma',
 'cityma',
 'zipma',
 'mas1',
 'mas2',
 'countryinc',
 'stprinc',
 'ein',
 'former',
 'changed',
 'afs',
 'wksi',
 'fye',
 'form',
 'period',
 'fy',
 'fp',
 'filed',
 'accepted',
 'prevrpt',
 'detail',
 'instance',
 'nciks',
 'aciks']

In [24]:
# Column labels of the `num` table as a plain Python list.
list(num.columns)

['adsh',
 'tag',
 'version',
 'ddate',
 'qtrs',
 'uom',
 'segments',
 'coreg',
 'value',
 'footnote']

In [25]:
# Column labels of the `tag` table as a plain Python list.
list(tag.columns)


['tag',
 'version',
 'custom',
 'abstract',
 'datatype',
 'iord',
 'crdr',
 'tlabel',
 'doc']

In [26]:
# Column labels of the `pre` table as a plain Python list.
list(pre.columns)

['adsh',
 'report',
 'line',
 'stmt',
 'inpth',
 'rfile',
 'tag',
 'version',
 'plabel',
 'negating']

In [27]:
# Use the `adsh` value from the first row of `sub` as a sample key, then show it.
sample_adsh = sub["adsh"].iat[0]
sample_adsh

'0001104659-10-006102'

In [28]:
# Show up to 20 `num` rows whose `adsh` equals the sample key.
num.loc[num["adsh"].eq(sample_adsh)].head(20)

Unnamed: 0,adsh,tag,version,ddate,qtrs,uom,segments,coreg,value,footnote
308,0001104659-10-006102,IncomeTaxExpenseBenefit,us-gaap/2009,20081231,4,USD,,,236700000.0,
591,0001104659-10-006102,EarningsPerShareBasic,us-gaap/2009,20071231,4,USD,,,8.46,
1359,0001104659-10-006102,OtherAssetsNoncurrent,us-gaap/2009,20081231,0,USD,,,444500000.0,
2907,0001104659-10-006102,MachineryAndEquipmentGross,us-gaap/2009,20091231,0,USD,,,4771600000.0,
2920,0001104659-10-006102,MinorityInterestDecreaseFromDistributionsToNon...,us-gaap/2009,20081231,4,USD,,,49600000.0,
3736,0001104659-10-006102,IncreaseDecreaseInNoncurrentOperatingAssets,0001104659-10-006102,20081231,4,USD,,,-1400000.0,
3982,0001104659-10-006102,IncomeLossFromContinuingOperationsPerDilutedShare,us-gaap/2009,20071231,4,USD,,,1.78,
4847,0001104659-10-006102,CashAndCashEquivalentsPeriodIncreaseDecrease,us-gaap/2009,20081231,4,USD,,,-8200000.0,
5079,0001104659-10-006102,OtherPostretirementDefinedBenefitPlanLiabiliti...,us-gaap/2009,20081231,0,USD,,,239700000.0,
5750,0001104659-10-006102,TransportationOfficeAndMiscellaneousEquipmentG...,0001104659-10-006102,20091231,0,USD,,,131100000.0,


In [31]:
def txt_to_parquet(txt_path: str, parquet_path: str, dtype=None):
    """Convert one tab-separated text file into a Parquet file.

    Parameters
    ----------
    txt_path : str
        Location of the tab-delimited source file (anything ``pd.read_csv``
        accepts).
    parquet_path : str
        Destination of the Parquet file. It is written with the pyarrow
        engine and without the DataFrame index.
    dtype : optional
        Forwarded unchanged to ``pd.read_csv``. The default ``None`` keeps
        pandas' dtype inference, i.e. the behaviour this function always had.
        Pass ``str`` to read every column as text, which stops values such as
        "02110" from being parsed as the number 2110.

    Returns
    -------
    None
        A confirmation line is printed once the Parquet file is written.

    Warns
    -----
    UserWarning
        If the source file has a header but no data rows. The (empty) Parquet
        file is still written, so existing callers keep working.
    """
    import warnings  # local import so this cell does not rely on another cell

    df = pd.read_csv(
        txt_path,
        sep="\t",
        low_memory=False,
        dtype=dtype,
    )

    # A header-only file converts "successfully" into an empty Parquet file;
    # surface that instead of letting it pass unnoticed.
    if df.empty:
        warnings.warn(
            f"{txt_path} contains no data rows; writing an empty Parquet file",
            UserWarning,
            stacklevel=2,
        )

    df.to_parquet(
        parquet_path,
        engine="pyarrow",
        index=False
    )

    print(f"✔ Converted {txt_path} → {parquet_path}")

In [32]:
# Source (tab-separated text) and destination (Parquet) prefixes for 2009q1.
BASE_TXT = "gs://sec-financials-edgar/sec_edgar_financials/2009q1"
BASE_PARQ = "gs://sec-financials-edgar/sec_edgar_financials/sec_financials/parquet/2009q1"

# Table names; each maps to <name>.txt under BASE_TXT and <name>.parquet
# under BASE_PARQ.
files = ["sub", "num", "tag", "pre"]

# Convert the tables one at a time, in the order listed.
for f in files:
    txt_to_parquet(txt_path=f"{BASE_TXT}/{f}.txt", parquet_path=f"{BASE_PARQ}/{f}.parquet")

✔ Converted gs://sec-financials-edgar/sec_edgar_financials/2009q1/sub.txt → gs://sec-financials-edgar/sec_edgar_financials/sec_financials/parquet/2009q1/sub.parquet
✔ Converted gs://sec-financials-edgar/sec_edgar_financials/2009q1/num.txt → gs://sec-financials-edgar/sec_edgar_financials/sec_financials/parquet/2009q1/num.parquet
✔ Converted gs://sec-financials-edgar/sec_edgar_financials/2009q1/tag.txt → gs://sec-financials-edgar/sec_edgar_financials/sec_financials/parquet/2009q1/tag.parquet
✔ Converted gs://sec-financials-edgar/sec_edgar_financials/2009q1/pre.txt → gs://sec-financials-edgar/sec_edgar_financials/sec_financials/parquet/2009q1/pre.parquet


In [33]:
# Re-read every source file and its Parquet counterpart, then print the table
# name, both shapes, and whether the column labels are identical.
# NOTE(review): only shapes and column labels are compared, not cell values.
for f in files:
    txt_df = pd.read_csv(f"{BASE_TXT}/{f}.txt", low_memory=False, sep="\t")
    parq_df = pd.read_parquet(f"{BASE_PARQ}/{f}.parquet")
    columns_match = txt_df.columns.equals(parq_df.columns)
    print(f, txt_df.shape, parq_df.shape, columns_match)

sub (0, 36) (0, 36) True
num (0, 10) (0, 10) True
tag (0, 9) (0, 9) True
pre (0, 10) (0, 10) True
