In [4]:
# Auto-reload modules before executing code
# This allows you to edit source files without restarting the kernel
# (`%load_ext autoreload` enables IPython's autoreload extension.)
%load_ext autoreload
# Mode 2: reload all imported modules before each cell's code is executed.
%autoreload 2

## Setup

Import the necessary modules and create a fetcher instance.

In [5]:
import sys
from pathlib import Path

import pandas as pd

sys.path.insert(0, "../src")

from reit_risk_summarizer.services.sec import SECFetcher

In [6]:
# Create a fetcher instance and show the User-Agent header its HTTP session sends.
fetcher = SECFetcher()
user_agent = fetcher.session.headers["User-Agent"]
print("SEC Fetcher initialized successfully!")
print(f"User-Agent: {user_agent}")

# SEC EDGAR asks automated clients to identify themselves in the User-Agent
# header. Flag the template placeholder so it is replaced with a real name and
# contact email before any requests are made.
if "example.com" in user_agent.lower():
    print("WARNING: User-Agent looks like a placeholder; set a real name and contact email.")

SEC Fetcher initialized successfully!
User-Agent: YourName your.email@example.com


## Test CIK Lookup

Test looking up a company's CIK (Central Index Key) from its ticker symbol.

In [None]:
# Resolve one ticker symbol to its SEC Central Index Key (CIK).
# `_get_cik` has a leading underscore (private by Python convention); it is
# called directly here only to explore the lookup step on its own.
ticker = "PLD"  # Prologis
cik = fetcher._get_cik(ticker)
print(f"CIK for {ticker}: {cik}")

CIK for PLD: 1045609


: 

In [4]:
# Look up CIKs for a few more tickers. A failure for one ticker is reported
# on its own line and does not stop the remaining lookups.
for ticker in ("AMT", "EQIX", "PSA"):
    try:
        cik = fetcher._get_cik(ticker)
    except Exception as exc:
        print(f"{ticker}: Error - {exc}")
    else:
        print(f"{ticker}: CIK {cik}")

AMT: CIK 1053507
EQIX: CIK 1101239
PSA: CIK 1393311
PSA: CIK 1393311


## Fetch Complete 10-K Filing

Fetch the complete HTML of a 10-K filing.

In [5]:
# Fetch the latest 10-K filing for one ticker and preview the start of the document.
ticker = "PLD"
print(f"Fetching 10-K for {ticker}...")
html = fetcher.fetch_latest_10k(ticker)

# `html` is text (str), so len() counts characters rather than bytes; the two
# only coincide for pure-ASCII documents. Label the number accordingly.
print(f"\nSuccess! Downloaded {len(html):,} characters")
print(f"First 500 characters:\n{html[:500]}...")

Fetching 10-K for PLD...


Transient error on attempt 1/3: HTTPSConnectionPool(host='www.sec.gov', port=443): Read timed out. (read timeout=30). Retrying in 1.0s...



Success! Downloaded 11,157,030 bytes
First 500 characters:
<?xml version='1.0' encoding='ASCII'?>
<!-- DFIN New ActiveDisclosure (SM) Inline XBRL Document - http://www.dfinsolutions.com/ -->
<!-- Creation Date :2025-02-14T16:22:50.3037+00:00 -->
<!-- Copyright (c) 2025 Donnelley Financial Solutions, Inc. All Rights Reserved. -->
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:xbrldi="http://xbrl.org/2006/xbrldi" xmlns:xbrldt="http://xbrl.org/2005/xbrldt" xmlns:xbrli="http://www.xbrl.org/2003/instance" xmlns:link="http://www.xbrl.org/2003/linkbase" xmln...


## Check for Risk Factors Section

Verify that the fetched HTML contains the risk factors section.

In [6]:
# Check for the risk factors section using case-insensitive substring searches.
html_lower = html.lower()

# A single find() gives both the presence flag and the position (-1 when absent).
pos = html_lower.find("risk factors")
has_risk_factors = pos != -1
has_item_1a = "item 1a" in html_lower

print(f"Contains 'risk factors': {has_risk_factors}")
print(f"Contains 'item 1a': {has_item_1a}")

# Show the text surrounding the first match.
if has_risk_factors:
    # Clamp the window start at 0: a negative start index would be interpreted
    # relative to the END of the string and yield the wrong (usually empty) context
    # whenever the match lies within the first 100 characters.
    context_start = max(0, pos - 100)
    print(f"\n'Risk Factors' found at position {pos:,}")
    print(f"Context around 'Risk Factors':\n{html[context_start : pos + 200]}")

Contains 'risk factors': True
Contains 'item 1a': True

'Risk Factors' found at position 621,352
Context around 'Risk Factors':
re-wrap;text-decoration:underline solid;font-family:Arial;font-kerning:none;min-width:fit-content;">Risk Factors</span></a></p></td>
    <td style="padding-top:0in;vertical-align:top;padding-bottom:0in;text-align:center;"><p style="font-size:9pt;margin-top:0;font-family:Times New Roman;margin-bottom


## Test Multiple REITs

Fetch 10-K filings for multiple REIT tickers from the golden dataset.

In [7]:
# Golden dataset tickers
tickers = ["PLD", "AMT", "EQIX", "PSA", "O"]

# Fetch each filing and record either its size and risk-factor flag, or the error.
# The loop uses its own variable names so it does not overwrite the notebook-level
# `ticker` / `html` that other cells read (avoids hidden state when cells are re-run).
results = {}
for reit_ticker in tickers:
    print(f"\nFetching {reit_ticker}...")
    try:
        filing_html = fetcher.fetch_latest_10k(reit_ticker)
    except Exception as exc:
        # Best-effort sweep: record the failure and continue with the next ticker.
        results[reit_ticker] = {"success": False, "error": str(exc)}
        print(f"  ✗ Error: {exc}")
        continue

    results[reit_ticker] = {
        "size": len(filing_html),  # character count (the filing is a str)
        "has_risk_factors": "risk factors" in filing_html.lower(),
        "success": True,
    }
    print(f"  ✓ Success: {len(filing_html):,} characters")


Fetching PLD...
  ✓ Success: 11,157,030 bytes

Fetching AMT...
  ✓ Success: 11,157,030 bytes

Fetching AMT...
  ✓ Success: 4,628,843 bytes

Fetching EQIX...
  ✓ Success: 4,628,843 bytes

Fetching EQIX...


Transient error on attempt 1/3: HTTPSConnectionPool(host='www.sec.gov', port=443): Read timed out. (read timeout=30). Retrying in 1.0s...
Transient error on attempt 1/3: HTTPSConnectionPool(host='www.sec.gov', port=443): Read timed out. (read timeout=30). Retrying in 1.0s...
Transient error on attempt 1/3: HTTPSConnectionPool(host='www.sec.gov', port=443): Read timed out. (read timeout=30). Retrying in 1.0s...


  ✓ Success: 5,205,947 bytes

Fetching PSA...
  ✓ Success: 3,418,639 bytes

Fetching O...
  ✓ Success: 3,418,639 bytes

Fetching O...
  ✓ Success: 4,000,662 bytes
  ✓ Success: 4,000,662 bytes


In [8]:
# Summary of results: one table row per ticker, built from the `results` dict.
# (`pd` is imported in the notebook's import cell.)


def _summary_row(ticker_symbol, result):
    """Return the display row for one ticker's fetch result.

    Args:
        ticker_symbol: Ticker the result belongs to.
        result: Dict with a "success" flag plus either "size" and
            "has_risk_factors" (on success) or "error" (on failure).

    Returns:
        Dict mapping column label to the display string for that column.
    """
    if result["success"]:
        return {
            "Ticker": ticker_symbol,
            # "size" is len() of the filing text, i.e. a character count.
            "Size (chars)": f"{result['size']:,}",
            "Has Risk Factors": "✓" if result["has_risk_factors"] else "✗",
            "Status": "✓ Success",
        }
    return {
        "Ticker": ticker_symbol,
        "Size (chars)": "N/A",
        "Has Risk Factors": "N/A",
        "Status": f"✗ {result['error']}",
    }


# The comprehension keeps its loop variables out of the notebook namespace.
summary_data = [_summary_row(symbol, outcome) for symbol, outcome in results.items()]

df = pd.DataFrame(summary_data)
df

Unnamed: 0,Ticker,Size (bytes),Has Risk Factors,Status
0,PLD,11157030,✓,✓ Success
1,AMT,4628843,✓,✓ Success
2,EQIX,5205947,✓,✓ Success
3,PSA,3418639,✓,✓ Success
4,O,4000662,✓,✓ Success


## Error Handling Test

Test how the fetcher handles invalid tickers.

In [9]:
# Test invalid ticker: the fetch is expected to raise.
try:
    # The return value is deliberately not stored, so a surprise success cannot
    # overwrite the `html` variable used by other cells.
    fetcher.fetch_latest_10k("INVALID_XYZ_123")
except Exception as e:
    print(f"Expected error occurred: {type(e).__name__}")
    print(f"Message: {e}")
else:
    # Without this branch the cell would print nothing and look like a pass.
    print("Unexpected: no error was raised for an invalid ticker")

Expected error occurred: InvalidTickerError
Message: Ticker 'INVALID_XYZ_123' not found in SEC database


## Save Sample HTML

Save a sample 10-K HTML file for inspection.

In [11]:
# Save sample HTML: fetch one filing and write it to disk as UTF-8 text.
ticker = "PLD"
html = fetcher.fetch_latest_10k(ticker)

output_file = f"../sample_{ticker}_10k.html"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(html)

print(f"Saved {ticker} 10-K to: {output_file}")
# Report the real size on disk: len(html) counts characters, which differs from
# the byte count whenever the text contains non-ASCII characters.
# (`Path` is imported in the notebook's import cell.)
print(f"File size: {Path(output_file).stat().st_size:,} bytes")

Saved PLD 10-K to: ../sample_PLD_10k.html
File size: 11,157,030 bytes


## Retry Logic

The fetcher automatically retries transient errors:
- **Timeouts**: Retries up to 3 times with exponential backoff
- **503 Service Unavailable**: Retries up to 3 times with exponential backoff
- **Other errors** (404, 400, etc.): No retry; the request fails immediately

This helps handle the SEC API's unreliability during high-load periods.