In [1]:
import pandas as pd

# 1. Load the TSV file into a DataFrame
df = pd.read_csv("/Users/zealot/Downloads/2020q1_form13f/INFOTABLE.tsv", sep="\t")

# 2. Count the total number of unique companies
total_companies = df["NAMEOFISSUER"].nunique()

# 3. Count how many records (rows) each company has
company_record_counts = df["NAMEOFISSUER"].value_counts()

# 4. Get the top 10 companies with the most records
top_10_companies = company_record_counts.head(20)

# 5. Count the total number of unique investment institutions
total_institutions = df["ACCESSION_NUMBER"].nunique()

# 6. For each institution, count how many unique companies they invested in
institution_company_counts = df.groupby("ACCESSION_NUMBER")["NAMEOFISSUER"].nunique()

# 7. Get the top 10 institutions that invested in the most companies
top_10_institutions = institution_company_counts.sort_values(ascending=False).head(10)

# 8. Print the results
print("✅ Total number of companies:", total_companies)
print("✅ Total number of investment institutions:", total_institutions)

print("\n📊 Top 10 companies by number of records:")
print(top_10_companies)

print("\n🏦 Top 10 institutions by number of companies invested in:")
print(top_10_institutions)


✅ Total number of companies: 152346
✅ Total number of investment institutions: 5712

📊 Top 10 companies by number of records:
NAMEOFISSUER
ISHARES TR                      59963
SPDR SERIES TRUST               13362
VANGUARD INDEX FDS               9777
SELECT SECTOR SPDR TR            8353
SCHWAB STRATEGIC TR              7550
ISHARES INC                      6803
INVESCO EXCHANGE TRADED FD T     6441
ALPHABET INC                     6278
INVESCO EXCHNG TRADED FD TR      6060
APPLE INC                        5097
FIRST TR EXCHANGE TRADED FD      4987
WISDOMTREE TR                    4906
MICROSOFT CORP                   4582
AMAZON COM INC                   4157
VANGUARD INTL EQUITY INDEX F     4136
SPDR S&P 500 ETF TR              3634
JOHNSON & JOHNSON                3545
VANECK VECTORS ETF TRUST         3506
FACEBOOK INC                     3420
VANGUARD SCOTTSDALE FDS          3419
Name: count, dtype: int64

🏦 Top 10 institutions by number of companies invested in:
ACCESSION_NUMBER

In [2]:
df.columns

Index(['ACCESSION_NUMBER', 'INFOTABLE_SK', 'NAMEOFISSUER', 'TITLEOFCLASS',
       'CUSIP', 'FIGI', 'VALUE', 'SSHPRNAMT', 'SSHPRNAMTTYPE', 'PUTCALL',
       'INVESTMENTDISCRETION', 'OTHERMANAGER', 'VOTING_AUTH_SOLE',
       'VOTING_AUTH_SHARED', 'VOTING_AUTH_NONE'],
      dtype='object')

In [10]:
from datamule import Portfolio

portfolio_path_13F_HR_path = "/Users/zealot/Documents/SEC_13F-HR/portfolio_output_dir"

portfolio = Portfolio(portfolio_path_13F_HR_path)
text_query="APPLE INC"
text_query="Broadstone Net Lease, Inc"
text_query = text_query.upper()
filing_date=('2024-01-01', '2024-03-31')
submission_type='13F-HR'
# ret = portfolio.filter_text(text_query=text_query,submission_type=submission_type,filing_date=filing_date)
# for document in portfolio.contains_string(r'APPLE INC'):
#     doc_type = document.type
#     content = document.content
#     print(doc_type)
#     print(content)
#     print("***************"*10)


Loading 7591 submissions


Loading submissions: 100%|██████████| 7591/7591 [00:00<00:00, 18062.08it/s]

Successfully loaded 7591 submissions





In [17]:
print(text_query)
text_query="BROADSTONE NET LEASE INC"

BROADSTONE NET LEASE, INC


In [7]:
print(ret)

None


In [4]:
print(len(portfolio.accession_numbers))

AttributeError: 'Portfolio' object has no attribute 'accession_numbers'

In [20]:
def callback_function(document):
    try:
        if document.contains_string(text_query):
            return document.content
    except Exception as e:
        pass
        # print(f"Error processing document: {e}")

# Process submissions - note that filters are applied here
ret = portfolio.process_documents(callback=callback_function)

Processing documents: 100%|██████████| 15182/15182 [00:00<00:00, 243743.25it/s]


In [21]:
print(len(ret))
ret[9]

15182


In [7]:
import xml.etree.ElementTree as ET

# 注册命名空间（SEC 13F XML 文件有默认命名空间）
ns = {'ns': 'http://www.sec.gov/edgar/document/thirteenf/informationtable'}

# 读取并解析 XML 文件
root = ET.fromstring(ret[9])


# 遍历每个 infoTable 节点
for info in root.findall("ns:infoTable", ns):
    name = info.find("ns:nameOfIssuer", ns).text.strip()
    if name == "APPLE INC":
        value = info.find("ns:value", ns).text
        print(f"Found APPLE INC. Value: {value}")

Found APPLE INC. Value: 133867817


In [32]:
def get_value_for_issuer(xml_string: str, issuer_name: str = "APPLE INC") -> str:
    """
    Parse the XML string and return the <value> for the given <nameOfIssuer>.

    Parameters:
        xml_string (str): The XML content as a string.
        issuer_name (str): The name of the issuer to search for (default is "APPLE INC").

    Returns:
        str: The <value> of the first matching issuer, or None if not found.
    """
    ns = {'ns': 'http://www.sec.gov/edgar/document/thirteenf/informationtable'}
    root = ET.fromstring(xml_string)

    for info in root.findall("ns:infoTable", ns):
        name = info.find("ns:nameOfIssuer", ns).text.strip()
        if name.upper() == issuer_name.upper():
            value = info.find("ns:value", ns).text
            return value  # Return the first match

    return None


[1] Value for APPLE INC: 5410389
[3] Value for APPLE INC: 7664942
[5] Value for APPLE INC: 252
[9] Value for APPLE INC: 133867817
[25] Value for APPLE INC: 12456357
[27] Value for APPLE INC: 12832112
[35] Value for APPLE INC: 22661229
[37] Value for APPLE INC: 23843836
[41] Value for APPLE INC: 13406693
[43] Value for APPLE INC: 6826443
[47] Value for APPLE INC: 39148920
[51] Value for APPLE INC: 7437083688
[53] Value for APPLE INC: 2115311
[57] Value for APPLE INC: 3970664
[59] Value for APPLE INC: 145909607
[67] Value for APPLE INC: 829678
[79] Value for APPLE INC: 7272028
[81] Value for APPLE INC: 8448283
[85] Value for APPLE INC: 30069143
[89] Value for APPLE INC: 36778313
[91] Value for APPLE INC: 28303605
[93] Value for APPLE INC: 3833463
[95] Value for APPLE INC: 231036
[99] Value for APPLE INC: 1237698
[101] Value for APPLE INC: 2132126
[109] Value for APPLE INC: 117638766
[111] Value for APPLE INC: 38488480
[113] Value for APPLE INC: 29690030
[115] Value for APPLE INC: 1049447

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x109e57670>>
Traceback (most recent call last):
  File "/Users/zealot/.conda/envs/datamule38/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


[13069] Value for APPLE INC: 21586464
[13071] Value for APPLE INC: 1252558
[13073] Value for APPLE INC: 2942276
[13077] Value for APPLE INC: 4936672
[13079] Value for APPLE INC: 2446961
[13083] Value for APPLE INC: 6403388
[13085] Value for APPLE INC: 53543
[13087] Value for APPLE INC: 9011930
[13091] Value for APPLE INC: 9900507
[13095] Value for APPLE INC: 2436851
[13097] Value for APPLE INC: 109733629
[13099] Value for APPLE INC: 3990618
[13101] Value for APPLE INC: 308433
[13103] Value for APPLE INC: 894750
[13105] Value for APPLE INC: 3778123
[13111] Value for APPLE INC: 2714342
[13117] Value for APPLE INC: 26216425
[13119] Value for APPLE INC: 1588758
[13129] Value for APPLE INC: 13234012
[13133] Value for APPLE INC: 1264720
[13135] Value for APPLE INC: 6633429
[13139] Value for APPLE INC: 6495778
[13141] Value for APPLE INC: 42285941
[13145] Value for APPLE INC: 17271674
[13147] Value for APPLE INC: 64479383
[13153] Value for APPLE INC: 13881645
[13155] Value for APPLE INC: 1670

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x109e57670>>
Traceback (most recent call last):
  File "/Users/zealot/.conda/envs/datamule38/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x109e57670>>
Traceback (most recent call last):
  File "/Users/zealot/.conda/envs/datamule38/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x109e57670>>
Traceback (most recent call last):
  File "/Users/zealot/.conda/envs/datamule38/lib/python3.10/site-packages/ipykernel/ip

[13261] Value for APPLE INC: 4461498
[13263] Value for APPLE INC: 8676615
[13265] Value for APPLE INC: 31347838
[13275] Value for APPLE INC: 1318
[13277] Value for APPLE INC: 777822
[13279] Value for APPLE INC: 19192745
[13281] Value for APPLE INC: 11742271
[13285] Value for APPLE INC: 16230171
[13291] Value for APPLE INC: 10652492
[13293] Value for APPLE INC: 920920
[13295] Value for APPLE INC: 3596293
[13297] Value for APPLE INC: 926249
[13301] Value for APPLE INC: 736427
[13303] Value for APPLE INC: 964726
[13307] Value for APPLE INC: 91642


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x109e57670>>
Traceback (most recent call last):
  File "/Users/zealot/.conda/envs/datamule38/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x109e57670>>
Traceback (most recent call last):
  File "/Users/zealot/.conda/envs/datamule38/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


[13319] Value for APPLE INC: 1348096
[13321] Value for APPLE INC: 18955673
[13323] Value for APPLE INC: 19578585
[13333] Value for APPLE INC: 3627928
[13337] Value for APPLE INC: 99589222
[13339] Value for APPLE INC: 3116
[13345] Value for APPLE INC: 50433813
[13347] Value for APPLE INC: 7701


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x109e57670>>
Traceback (most recent call last):
  File "/Users/zealot/.conda/envs/datamule38/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


[13349] Value for APPLE INC: 103261540
[13351] Value for APPLE INC: 2044326
[13355] Value for APPLE INC: 5596486
[13363] Value for APPLE INC: 189632616
[13365] Value for APPLE INC: 4124495
[13367] Value for APPLE INC: 15595
[13371] Value for APPLE INC: 64539353
[13375] Value for APPLE INC: 5663840


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x109e57670>>
Traceback (most recent call last):
  File "/Users/zealot/.conda/envs/datamule38/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


KeyboardInterrupt: 

In [22]:
filtered_ret = [item for item in ret if item is not None]

print(len(filtered_ret))

252


In [36]:
sum=0
for i, item in enumerate(filtered_ret):
    if item is not None:
        try:
            value = get_value_for_issuer(item, issuer_name="APPLE INC")
            if value is not None:
                sum+= int(value)
                print(f"[{i}] Value for APPLE INC:", value)
        except Exception as e:
            print(f"[{i}] Error processing item: {e}")



[0] Value for APPLE INC: 5410389
[1] Value for APPLE INC: 7664942
[2] Value for APPLE INC: 252
[3] Value for APPLE INC: 133867817
[5] Value for APPLE INC: 12456357
[6] Value for APPLE INC: 12832112
[8] Value for APPLE INC: 22661229
[9] Value for APPLE INC: 23843836
[10] Value for APPLE INC: 13406693
[11] Value for APPLE INC: 6826443
[13] Value for APPLE INC: 39148920
[15] Value for APPLE INC: 7437083688
[16] Value for APPLE INC: 2115311
[17] Value for APPLE INC: 3970664
[18] Value for APPLE INC: 145909607
[19] Value for APPLE INC: 829678
[22] Value for APPLE INC: 7272028
[23] Value for APPLE INC: 8448283
[24] Value for APPLE INC: 30069143
[25] Value for APPLE INC: 36778313
[26] Value for APPLE INC: 28303605
[27] Value for APPLE INC: 3833463
[28] Value for APPLE INC: 231036
[29] Value for APPLE INC: 1237698
[30] Value for APPLE INC: 2132126
[32] Value for APPLE INC: 117638766
[33] Value for APPLE INC: 38488480
[34] Value for APPLE INC: 29690030
[35] Value for APPLE INC: 10494479
[36] Va

In [37]:
print(sum)

651402179283


In [15]:
def callback_function(document):
    # if document.contains_string("APPLE INC"):
    #     text = document.content
    #     print(text)
        # if isinstance(text, bytes):
        #     text = text.decode("utf-8", errors="ignore")  # 防止解码失败
        #
        #     return text
        # print()
        # print("================" * 5)

    # return document.path

# Process submissions - note that filters are applied here
ret = portfolio.process_documents(callback=callback_function)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

Processing documents:   1%|          | 151/15182 [00:00<00:01, 7603.67it/s]


TypeError: cannot use a string pattern on a bytes-like object

In [12]:
# print(ret[0])