In [1]:
# Auto-reload modules before executing code
# This allows you to edit source files without restarting the kernel
%load_ext autoreload
%autoreload 2

## Setup

Import the necessary modules.

In [2]:
import sys
sys.path.insert(0, '../src')

from reit_risk_summarizer.services.sec import SECFetcher, RiskFactorExtractor

## Fetch a 10-K

Fetch a real 10-K filing to test the extractor.

In [3]:
# Fetch a 10-K
fetcher = SECFetcher()
ticker = "PLD"

print(f"Fetching 10-K for {ticker}...")
html = fetcher.fetch_latest_10k(ticker)
print(f"Downloaded {len(html):,} bytes")

Fetching 10-K for PLD...
Downloaded 11,157,030 bytes


## Extract Risk Factors

Use the extractor to extract Item 1A Risk Factors section.

In [4]:
# Extract risk factors
extractor = RiskFactorExtractor()

print("Extracting risk factors...")
risk_factors = extractor.extract_risk_factors(html)

print(f"\n✓ Extracted {len(risk_factors):,} characters")
print(f"\nFirst 1000 characters:\n{risk_factors[:1000]}...")

Extracting risk factors...

✓ Extracted 90,956 characters

First 1000 characters:
PART I 
ITEM 1. Business
Prologis, Inc. is a self-administered and self-managed REIT and is the sole general partner of Prologis, L.P. through which it holds substantially all of its assets. We operate Prologis, Inc. and Prologis, L.P. as one enterprise and, therefore, our discussion and analysis refers to Prologis, Inc. and its consolidated subsidiaries, including Prologis, L.P. We invest in real estate through wholly owned subsidiaries and other entities through which we co-invest with partners and investors ("co-investment ventures"). We have a significant ownership interest in the co-investment ventures, which are either consolidated or unconsolidated based on our level of control of the entity.
Prologis, Inc. began operating as a fully integrated real estate company in 1997 and elected to be taxed as a REIT under the Internal Revenue Code of 1986, as amended (“Internal Revenue Code” or “IRC”). We bel

## Verify Content

Check that the extracted content looks reasonable.

In [5]:
# Check for key risk-related terms
terms = ['risk', 'may', 'could', 'market', 'operations', 'business']

print("Risk factor content analysis:")
for term in terms:
    count = risk_factors.lower().count(term)
    print(f"  '{term}': appears {count} times")

# Check length is reasonable
print(f"\nTotal length: {len(risk_factors):,} characters")
print(f"Word count (approx): {len(risk_factors.split()):,} words")

Risk factor content analysis:
  'risk': appears 65 times
  'may': appears 143 times
  'could': appears 51 times
  'market': appears 59 times
  'operations': appears 39 times
  'business': appears 84 times

Total length: 90,956 characters
Word count (approx): 13,697 words


## Test with Multiple Tickers

Test extraction with different REIT companies.

In [10]:
# Test with multiple tickers
tickers = ["PLD", "AMT", "EQIX"]
results = {}

for ticker in tickers:
    try:
        print(f"\nProcessing {ticker}...")
        html = fetcher.fetch_latest_10k(ticker)
        risk_factors = extractor.extract_risk_factors(html)
        
        results[ticker] = {
            'success': True,
            'length': len(risk_factors),
            'word_count': len(risk_factors.split()),
            'preview': risk_factors[:200]
        }
        print(f"  ✓ Extracted {len(risk_factors):,} chars ({len(risk_factors.split()):,} words)")
    except Exception as e:
        results[ticker] = {'success': False, 'error': str(e)}
        print(f"  ✗ Error: {e}")


Processing PLD...
  ✓ Extracted 90,956 chars (13,697 words)

Processing AMT...
  ✓ Extracted 119,552 chars (17,619 words)

Processing EQIX...
  ✓ Extracted 128,406 chars (19,739 words)


In [11]:
# Summary table
import pandas as pd

summary = []
for ticker, result in results.items():
    if result['success']:
        summary.append({
            'Ticker': ticker,
            'Characters': f"{result['length']:,}",
            'Words': f"{result['word_count']:,}",
            'Preview': result['preview'][:50] + '...'
        })
    else:
        summary.append({
            'Ticker': ticker,
            'Characters': 'Error',
            'Words': 'Error',
            'Preview': result['error'][:50]
        })

pd.DataFrame(summary)

Unnamed: 0,Ticker,Characters,Words,Preview
0,PLD,90956,13697,"PART I \nITEM 1. Business\nPrologis, Inc. is a..."
1,AMT,119552,17619,AMERICAN TOWER CORPORATION\nTABLE OF CONTENTS—...
2,EQIX,128406,19739,In addition to the other information contained...


## Save Sample

Save an extracted risk factors sample for inspection.

In [None]:
# Save extracted risk factors
ticker = "AMT"
html = fetcher.fetch_latest_10k(ticker)
risk_factors = extractor.extract_risk_factors(html)

output_file = f"../sample_{ticker}_risk_factors.txt"
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(risk_factors)

print(f"Saved risk factors to: {output_file}")
print(f"File size: {len(risk_factors):,} characters")

Saved risk factors to: ../sample_AMT_risk_factors.txt
File size: 119,552 characters


: 

## Download and Save AMT HTML

Download the full 10-K HTML for AMT and save it for review.

In [9]:
# Download AMT 10-K HTML
ticker = "AMT"
print(f"Fetching 10-K for {ticker}...")
html = fetcher.fetch_latest_10k(ticker)

# Save to file
output_file = f"../sample_{ticker}_10k.html"
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(html)

print(f"\n✓ Downloaded {len(html):,} bytes")
print(f"✓ Saved to: {output_file}")
print(f"\nYou can now open this file in a browser or text editor to review the HTML structure.")

Fetching 10-K for AMT...

✓ Downloaded 4,628,843 bytes
✓ Saved to: ../sample_AMT_10k.html

You can now open this file in a browser or text editor to review the HTML structure.
