In [1]:
import pandas as pd

# 1. Load the TSV file into a DataFrame
df = pd.read_csv("/Users/zealot/Downloads/2020q1_form13f/INFOTABLE.tsv", sep="\t")

# 2. Count the total number of unique companies
total_companies = df["NAMEOFISSUER"].nunique()

# 3. Count how many records (rows) each company has
company_record_counts = df["NAMEOFISSUER"].value_counts()

# 4. Get the top 10 companies with the most records
top_10_companies = company_record_counts.head(20)

# 5. Count the total number of unique investment institutions
total_institutions = df["ACCESSION_NUMBER"].nunique()

# 6. For each institution, count how many unique companies they invested in
institution_company_counts = df.groupby("ACCESSION_NUMBER")["NAMEOFISSUER"].nunique()

# 7. Get the top 10 institutions that invested in the most companies
top_10_institutions = institution_company_counts.sort_values(ascending=False).head(10)

# 8. Print the results
print("✅ Total number of companies:", total_companies)
print("✅ Total number of investment institutions:", total_institutions)

print("\n📊 Top 10 companies by number of records:")
print(top_10_companies)

print("\n🏦 Top 10 institutions by number of companies invested in:")
print(top_10_institutions)


✅ Total number of companies: 152346
✅ Total number of investment institutions: 5712

📊 Top 10 companies by number of records:
NAMEOFISSUER
ISHARES TR                      59963
SPDR SERIES TRUST               13362
VANGUARD INDEX FDS               9777
SELECT SECTOR SPDR TR            8353
SCHWAB STRATEGIC TR              7550
ISHARES INC                      6803
INVESCO EXCHANGE TRADED FD T     6441
ALPHABET INC                     6278
INVESCO EXCHNG TRADED FD TR      6060
APPLE INC                        5097
FIRST TR EXCHANGE TRADED FD      4987
WISDOMTREE TR                    4906
MICROSOFT CORP                   4582
AMAZON COM INC                   4157
VANGUARD INTL EQUITY INDEX F     4136
SPDR S&P 500 ETF TR              3634
JOHNSON & JOHNSON                3545
VANECK VECTORS ETF TRUST         3506
FACEBOOK INC                     3420
VANGUARD SCOTTSDALE FDS          3419
Name: count, dtype: int64

🏦 Top 10 institutions by number of companies invested in:
ACCESSION_NUMBER

In [2]:
df.columns

Index(['ACCESSION_NUMBER', 'INFOTABLE_SK', 'NAMEOFISSUER', 'TITLEOFCLASS',
       'CUSIP', 'FIGI', 'VALUE', 'SSHPRNAMT', 'SSHPRNAMTTYPE', 'PUTCALL',
       'INVESTMENTDISCRETION', 'OTHERMANAGER', 'VOTING_AUTH_SOLE',
       'VOTING_AUTH_SHARED', 'VOTING_AUTH_NONE'],
      dtype='object')

In [3]:
from datamule import Portfolio

portfolio_path_13F_HR_path = "/Users/zealot/Documents/SEC_13F-HR/portfolio_output_dir"

portfolio = Portfolio(portfolio_path_13F_HR_path)
text_query="APPLE INC"
filing_date=('2024-01-01', '2024-03-31')
submission_type='13F-HR'
ret = portfolio.filter_text(text_query=text_query,submission_type=submission_type,filing_date=filing_date)
# for document in portfolio.contains_string(r'APPLE INC'):
#     doc_type = document.type
#     content = document.content
#     print(doc_type)
#     print(content)
#     print("***************"*10)


  from .autonotebook import tqdm as notebook_tqdm


Loading 7591 submissions


Loading submissions: 100%|██████████| 7591/7591 [00:00<00:00, 23144.35it/s]


Successfully loaded 7591 submissions

--- Starting query planning phase ---
Analyzing request and splitting into manageable chunks...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=0&size=1...
Found 5,034 total documents to retrieve.
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=0&size=1...
Planning: Analyzing query: forms=13F-HR,-13F-HR/A, dates=2024-01-01 to 2024-03-31 [5,034 hits]
No additional forms to process with negation

--- Starting query phase ---


Querying documents [Rate: 0/s | 0 MB/s]:   0%|          | 0/5034 [00:00<?, ?it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=0&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=100&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=200&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=300&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=400&size=100...


Querying documents [Rate: 3.0/s | 0.17 MB/s]:   2%|▏         | 100/5034 [00:00<00:20, 242.27it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=500&size=100...


Querying documents [Rate: 2.0/s | 0.11 MB/s]:   4%|▍         | 200/5034 [00:01<00:28, 168.21it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=600&size=100...


Querying documents [Rate: 3.0/s | 0.17 MB/s]:   6%|▌         | 300/5034 [00:01<00:28, 166.60it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=700&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=800&size=100...


Querying documents [Rate: 4.0/s | 0.23 MB/s]:  10%|▉         | 500/5034 [00:01<00:15, 299.99it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=900&size=100...


Querying documents [Rate: 5.0/s | 0.29 MB/s]:  14%|█▍        | 700/5034 [00:02<00:11, 375.73it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=1000&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=1100&size=100...


Querying documents [Rate: 7.0/s | 0.4 MB/s]:  18%|█▊        | 900/5034 [00:02<00:07, 582.30it/s] 

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=1200&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=1300&size=100...


Querying documents [Rate: 7.0/s | 0.4 MB/s]:  22%|██▏       | 1100/5034 [00:02<00:07, 554.01it/s] 

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=1400&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=1500&size=100...


Querying documents [Rate: 7.0/s | 0.4 MB/s]:  24%|██▍       | 1200/5034 [00:03<00:08, 444.93it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=1600&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=1700&size=100...


Querying documents [Rate: 6.0/s | 0.34 MB/s]:  30%|██▉       | 1500/5034 [00:03<00:06, 546.75it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=1800&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=1900&size=100...


Querying documents [Rate: 6.0/s | 0.34 MB/s]:  34%|███▍      | 1700/5034 [00:04<00:06, 547.05it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=2000&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=2100&size=100...


Querying documents [Rate: 7.0/s | 0.4 MB/s]:  36%|███▌      | 1800/5034 [00:04<00:06, 529.48it/s] 

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=2200&size=100...


Querying documents [Rate: 6.0/s | 0.34 MB/s]:  38%|███▊      | 1900/5034 [00:04<00:07, 416.41it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=2300&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=2400&size=100...


Querying documents [Rate: 6.0/s | 0.34 MB/s]:  44%|████▎     | 2200/5034 [00:05<00:05, 541.52it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=2500&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=2600&size=100...


Querying documents [Rate: 6.0/s | 0.34 MB/s]:  46%|████▌     | 2300/5034 [00:05<00:05, 523.41it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=2700&size=100...


Querying documents [Rate: 5.0/s | 0.29 MB/s]:  50%|████▉     | 2500/5034 [00:05<00:05, 490.96it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=2800&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=2900&size=100...


Querying documents [Rate: 5.0/s | 0.29 MB/s]:  52%|█████▏    | 2600/5034 [00:06<00:04, 513.76it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=3000&size=100...


Querying documents [Rate: 5.0/s | 0.29 MB/s]:  54%|█████▎    | 2700/5034 [00:06<00:04, 507.74it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=3100&size=100...


Querying documents [Rate: 6.0/s | 0.34 MB/s]:  56%|█████▌    | 2800/5034 [00:06<00:04, 500.53it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=3200&size=100...


Querying documents [Rate: 6.0/s | 0.34 MB/s]:  60%|█████▉    | 3000/5034 [00:06<00:03, 521.64it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=3300&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=3400&size=100...


Querying documents [Rate: 5.0/s | 0.29 MB/s]:  62%|██████▏   | 3100/5034 [00:06<00:03, 500.02it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=3500&size=100...


Querying documents [Rate: 6.0/s | 0.34 MB/s]:  66%|██████▌   | 3300/5034 [00:07<00:03, 503.05it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=3600&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=3700&size=100...


Querying documents [Rate: 6.0/s | 0.34 MB/s]:  68%|██████▊   | 3400/5034 [00:07<00:03, 503.28it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=3800&size=100...


Querying documents [Rate: 5.0/s | 0.29 MB/s]:  70%|██████▉   | 3500/5034 [00:07<00:03, 479.11it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=3900&size=100...


Querying documents [Rate: 5.0/s | 0.29 MB/s]:  74%|███████▎  | 3700/5034 [00:08<00:02, 500.26it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=4000&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=4100&size=100...


Querying documents [Rate: 5.0/s | 0.29 MB/s]:  77%|███████▋  | 3900/5034 [00:08<00:02, 498.88it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=4200&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=4300&size=100...


Querying documents [Rate: 6.0/s | 0.34 MB/s]:  81%|████████▏ | 4100/5034 [00:08<00:01, 497.46it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=4400&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=4500&size=100...


Querying documents [Rate: 5.0/s | 0.29 MB/s]:  83%|████████▎ | 4200/5034 [00:09<00:01, 500.65it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=4600&size=100...


Querying documents [Rate: 6.0/s | 0.34 MB/s]:  87%|████████▋ | 4400/5034 [00:09<00:01, 510.44it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=4700&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=4800&size=100...


Querying documents [Rate: 5.0/s | 0.29 MB/s]:  89%|████████▉ | 4500/5034 [00:09<00:01, 405.35it/s]

Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=4900&size=100...
Fetching https://efts.sec.gov/LATEST/search-index?forms=13F-HR%2C-13F-HR%2FA&startdt=2024-01-01&enddt=2024-03-31&q=APPLE+INC&from=5000&size=100...


Querying documents [Rate: 5.0/s | 0.25 MB/s]: 100%|██████████| 5034/5034 [00:11<00:00, 457.43it/s]


--- Query complete: 5,034 submissions retrieved ---





In [6]:
print(type(ret))

<class 'NoneType'>


In [7]:
print(ret)

None


In [8]:
print(len(portfolio.accession_numbers))

5034


In [24]:
def callback_function(document):
    try:
        if document.contains_string("APPLE INC"):
            return document.content
    except Exception as e:
        print(f"Error processing document: {e}")

# Process submissions - note that filters are applied here
ret = portfolio.process_documents(callback=callback_function)

Error processing document: cannot use a string pattern on a bytes-like object
Error processing document: cannot use a string pattern on a bytes-like object
Error processing document: cannot use a string pattern on a bytes-like object
Error processing document: cannot use a string pattern on a bytes-like object
Error processing document: cannot use a string pattern on a bytes-like object
Error processing document: cannot use a string pattern on a bytes-like object
Error processing document: cannot use a string pattern on a bytes-like object
Error processing document: cannot use a string pattern on a bytes-like object
Error processing document: cannot use a string pattern on a bytes-like object
Error processing document: cannot use a string pattern on a bytes-like object
Error processing document: cannot use a string pattern on a bytes-like object
Error processing document: cannot use a string pattern on a bytes-like object
Error processing document: cannot use a string pattern on a byte

Processing documents: 100%|██████████| 15182/15182 [00:00<00:00, 244886.83it/s]


In [25]:
ret[9]

'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n<ns1:informationTable xmlns:ns1="http://www.sec.gov/edgar/document/thirteenf/informationtable">\n\t<ns1:infoTable>\n\t\t<ns1:nameOfIssuer>ABB LTD SP ADR</ns1:nameOfIssuer>\n\t\t<ns1:titleOfClass>ADR</ns1:titleOfClass>\n\t\t<ns1:cusip>000375204</ns1:cusip>\n\t\t<ns1:value>33477466</ns1:value>\n\t\t<ns1:shrsOrPrnAmt>\n\t\t\t<ns1:sshPrnamt>755699</ns1:sshPrnamt>\n\t\t\t<ns1:sshPrnamtType>SH</ns1:sshPrnamtType>\n\t\t</ns1:shrsOrPrnAmt>\n\t\t<ns1:investmentDiscretion>SOLE</ns1:investmentDiscretion>\n\t\t<ns1:votingAuthority>\n\t\t\t<ns1:Sole>740409</ns1:Sole>\n\t\t\t<ns1:Shared>0</ns1:Shared>\n\t\t\t<ns1:None>15290</ns1:None>\n\t\t</ns1:votingAuthority>\n\t</ns1:infoTable>\n\t<ns1:infoTable>\n\t\t<ns1:nameOfIssuer>ASML HOLDING NV</ns1:nameOfIssuer>\n\t\t<ns1:titleOfClass>ADR</ns1:titleOfClass>\n\t\t<ns1:cusip>N07059210</ns1:cusip>\n\t\t<ns1:value>26320379</ns1:value>\n\t\t<ns1:shrsOrPrnAmt>\n\t\t\t<ns1:sshPrnamt>34773</ns1:sshPrnamt>

In [27]:
import xml.etree.ElementTree as ET

# 注册命名空间（SEC 13F XML 文件有默认命名空间）
ns = {'ns': 'http://www.sec.gov/edgar/document/thirteenf/informationtable'}

# 读取并解析 XML 文件
root = ET.fromstring(ret[9])


# 遍历每个 infoTable 节点
for info in root.findall("ns:infoTable", ns):
    name = info.find("ns:nameOfIssuer", ns).text.strip()
    if name == "APPLE INC":
        value = info.find("ns:value", ns).text
        print(f"Found APPLE INC. Value: {value}")

Found APPLE INC. Value: 133867817


In [15]:
def callback_function(document):
    # if document.contains_string("APPLE INC"):
    #     text = document.content
    #     print(text)
        # if isinstance(text, bytes):
        #     text = text.decode("utf-8", errors="ignore")  # 防止解码失败
        #
        #     return text
        # print()
        # print("================" * 5)

    # return document.path

# Process submissions - note that filters are applied here
ret = portfolio.process_documents(callback=callback_function)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

Processing documents:   1%|          | 151/15182 [00:00<00:01, 7603.67it/s]


TypeError: cannot use a string pattern on a bytes-like object

In [12]:
# print(ret[0])