## Setup

In [1]:
import sys
import time
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Add project root to path
def find_project_root(start: Path) -> Path:
    """Return the project root for a kernel started in ``start``.

    Walks from ``start`` up through its parents and returns the first
    directory that contains ``evaluation/golden_dataset.csv`` (the file this
    notebook loads). This keeps the notebook working whether the kernel is
    started in the notebook's folder, in the repository root, or deeper.

    Args:
        start: Directory to begin the search from (normally ``Path.cwd()``).

    Returns:
        The matching ancestor directory, or ``start.parent`` when no marker
        is found (the assumption this notebook originally hard-coded).
    """
    marker = Path("evaluation") / "golden_dataset.csv"
    for candidate in (start, *start.parents):
        if (candidate / marker).is_file():
            return candidate
    # No marker anywhere above us: keep the historical "one level up" guess.
    return start.parent


project_root = find_project_root(Path.cwd())
# Guard against duplicate entries when the cell is re-run.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(f"Project root: {project_root}")
print(f"Python path updated: {str(project_root) in sys.path}")

Project root: c:\Users\H244746\Documents\reit-risk-summarizer
Python path updated: True


In [2]:
# Project-local import. Presumably `evaluation` is the package directory under
# project_root, so the setup cell (which puts project_root on sys.path) must
# have been run first.
from evaluation.metrics import (
    SimilarityMetrics,
    calculate_ndcg_at_k,
    calculate_sector_specificity,
    evaluate_summary,
    get_embedding_model,
)

# Reaching this line means the import above did not raise.
print("✓ Successfully imported metrics module")

✓ Successfully imported metrics module


## Load Golden Dataset

In [3]:
# Read the golden dataset CSV from <project_root>/evaluation.
dataset_path = project_root.joinpath("evaluation", "golden_dataset.csv")
df = pd.read_csv(dataset_path)

# Quick orientation: size, key cardinalities and the column list.
n_tickers = df['ticker'].nunique()
n_sectors = df['sector'].nunique()
print(
    f"Loaded {len(df)} rows from golden dataset",
    f"Unique tickers: {n_tickers}",
    f"Unique sectors: {n_sectors}",
    f"\nColumns: {list(df.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
# Last expression: rich display of the first rows.
df.head()

Loaded 50 rows from golden dataset
Unique tickers: 10
Unique sectors: 8

Columns: ['ticker', 'company_name', 'sector', 'filing_year', 'risk_rank', 'risk_category', 'risk_title', 'risk_description', 'why_material', 'unique_to_sector']

First few rows:


Unnamed: 0,ticker,company_name,sector,filing_year,risk_rank,risk_category,risk_title,risk_description,why_material,unique_to_sector
0,PLD,Prologis,Industrial/Logistics,2023,1,Geographic Concentration,California Market Exposure,30% of properties and 31.7% of NOI concentrate...,"Economic downturn, oversupply, or tax changes ...",No
1,PLD,Prologis,Industrial/Logistics,2023,2,Customer Concentration,Major Customer Dependency,Amazon represents 6.4% of revenues; top 10 cus...,Loss of key tenants or unfavorable lease renew...,No
2,PLD,Prologis,Industrial/Logistics,2023,3,Foreign Currency,Foreign Currency & International Operations,$10.6B (11.4% of assets) denominated in foreig...,Currency depreciation or international economi...,No
3,PLD,Prologis,Industrial/Logistics,2023,4,Development,Development Execution Risk,Active development program with significant ca...,"Cost overruns, leasing delays, and financing c...",No
4,PLD,Prologis,Industrial/Logistics,2023,5,Financing,Rising Interest Rates,Variable-rate debt and upcoming refinancings,Higher borrowing costs squeeze margins and lim...,No


In [4]:
# Count the distinct tickers in each sector and show the largest sectors first.
print("Sector distribution:")
tickers_per_sector = df.groupby('sector')['ticker'].nunique()
tickers_per_sector.sort_values(ascending=False)

Sector distribution:


sector
Data Centers               2
Healthcare                 2
Industrial/Logistics       1
Infrastructure/Towers      1
Residential/Multifamily    1
Retail (Net Lease)         1
Retail/Malls               1
Self Storage               1
Name: ticker, dtype: int64

## Test 1: Semantic Similarity

In [5]:
# Initialize similarity calculator
# sim_calc is reused by every similarity test and by the cosine cross-check cells.
sim_calc = SimilarityMetrics()
# NOTE(review): the model name in this message is hard-coded, not read from
# sim_calc — confirm it matches the default used by SimilarityMetrics.
print("✓ Initialized SimilarityMetrics with all-MiniLM-L6-v2 model")

✓ Initialized SimilarityMetrics with all-MiniLM-L6-v2 model


In [6]:
# Test 1.1: a list compared against itself must score (almost) exactly 1.0.
print("Test 1.1: Identical risks (should score ~1.0)")
risks = ["Interest rate risk affecting debt refinancing costs"]
score = sim_calc.calculate_similarity(risks, risks)
print(f"Score: {score:.4f}")
if score > 0.99:
    print("✓ PASS")
else:
    print("✗ FAIL")

Test 1.1: Identical risks (should score ~1.0)
Score: 1.0000
✓ PASS


In [8]:
# Test 1.2: two paraphrases of the same idea should clear the 0.50 floor.
print("Test 1.2: Similar meaning, different words")
predicted = ["Rising interest rates increase refinancing costs"]
golden = ["Higher rates impact debt servicing expenses"]
score = sim_calc.calculate_similarity(predicted, golden)
print(
    f"Predicted: {predicted[0]}",
    f"Golden:    {golden[0]}",
    f"Score: {score:.4f}",
    sep="\n",
)
verdict = "✓ PASS (similar meaning)" if score > 0.50 else "✗ FAIL"
print(verdict)

Test 1.2: Similar meaning, different words
Predicted: Rising interest rates increase refinancing costs
Golden:    Higher rates impact debt servicing expenses
Score: 0.5428
✓ PASS (similar meaning)


In [9]:
# Test 1.3: unrelated topics should stay below the 0.30 ceiling.
print("Test 1.3: Completely different topics (should score low)")
predicted = ["Cybersecurity threats to data centers"]
golden = ["Agricultural commodity price volatility"]
score = sim_calc.calculate_similarity(predicted, golden)
print(
    f"Predicted: {predicted[0]}",
    f"Golden:    {golden[0]}",
    f"Score: {score:.4f}",
    sep="\n",
)
verdict = "✓ PASS (low similarity)" if score < 0.30 else "✗ FAIL"
print(verdict)

Test 1.3: Completely different topics (should score low)
Predicted: Cybersecurity threats to data centers
Golden:    Agricultural commodity price volatility
Score: -0.0158
✓ PASS (low similarity)


In [10]:
# Test 1.4: Real golden dataset example
print("Test 1.4: Using actual golden dataset risks")

# Get PLD (Prologis) golden risks, ordered by their golden rank
pld_risks = df[df['ticker'] == 'PLD'].sort_values('risk_rank')
golden_texts = pld_risks['risk_description'].tolist()

print(f"\nGolden risks for PLD ({len(golden_texts)} risks):")
for i, risk in enumerate(golden_texts, 1):
    # Only mark the text as truncated when something was actually cut off.
    preview = risk if len(risk) <= 100 else f"{risk[:100]}..."
    print(f"{i}. {preview}")

# Hand-written, plausible logistics-REIT risks. These are NOT paraphrases of
# the golden rows (only the interest-rate item has a golden counterpart), so a
# score well below the target is the expected outcome for this cell.
predicted_texts = [
    "Economic downturns affect demand for logistics facilities",
    "Competition from other warehouse developers impacts market share",
    "Interest rate changes affect our debt refinancing costs",
    "E-commerce growth drives demand for distribution centers",
    "Supply chain disruptions impact tenant operations",
]

score = sim_calc.calculate_similarity(predicted_texts, golden_texts)
print(f"\nSemantic Similarity Score: {score:.4f}")
print("Target: >0.75")
print("✓ PASS" if score > 0.75 else "⚠ Below target (expected for test data)")

Test 1.4: Using actual golden dataset risks

Golden risks for PLD (5 risks):
1. 30% of properties and 31.7% of NOI concentrated in California markets...
2. Amazon represents 6.4% of revenues; top 10 customers account for 15.8%...
3. $10.6B (11.4% of assets) denominated in foreign currencies...
4. Active development program with significant capital deployment...
5. Variable-rate debt and upcoming refinancings...

Semantic Similarity Score: 0.3859
Target: >0.75
⚠ Below target (expected for test data)


In [23]:
# Cross-check of Test 1.4: recompute the score by hand from raw embeddings.
# In the recorded run the first printed value (0.38593036) equals the Test 1.4
# score (0.3859), which suggests calculate_similarity is the mean, over the
# predicted risks, of each one's best cosine match among the golden risks.
# cosine_similarity comes from the notebook's top import cell.
gold = sim_calc.model.encode(golden_texts)
pred = sim_calc.model.encode(predicted_texts)
# Rows follow predicted_texts, columns follow golden_texts.
cos_sim = cosine_similarity(pred, gold)
print(cos_sim.max(axis=1).mean())
print(cos_sim)

0.38593036
[[0.20889282 0.16414273 0.15145743 0.2101272  0.19872299]
 [0.2893158  0.33545363 0.15255015 0.34090623 0.14910653]
 [0.08443185 0.1076494  0.2370469  0.10385339 0.7443169 ]
 [0.20509215 0.3764     0.13896717 0.23835188 0.1259457 ]
 [0.16521525 0.15091677 0.10926503 0.25790143 0.21124907]]


## Test 2: NDCG@5 (Ranking Quality)

In [25]:
# Test 2.1: ranking a list against itself is the ideal ordering, so NDCG@5
# is expected to be (almost) exactly 1.0.
print("Test 2.1: Perfect ranking (should score ~1.0)")
risks = [
    "Risk 1: Interest rate exposure",
    "Risk 2: Tenant concentration",
    "Risk 3: Supply chain disruption",
    "Risk 4: Regulatory changes",
    "Risk 5: Market competition",
]
ndcg = calculate_ndcg_at_k(risks, risks, k=5)
print(f"NDCG@5: {ndcg:.4f}")
verdict = "✓ PASS" if ndcg > 0.99 else "✗ FAIL"
print(verdict)

Test 2.1: Perfect ranking (should score ~1.0)
NDCG@5: 1.0000
✓ PASS


In [26]:
# Test 2.2: each predicted item paraphrases the golden item at the same rank,
# so the ranking should clear the 0.70 target.
print("Test 2.2: Similar content, good ranking")
golden = [
    "Interest rate risk on refinancing",
    "Tenant concentration creates dependency",
    "Supply disruptions impact timelines",
    "Regulatory environment uncertainty",
    "Market competition affects margins",
]
predicted = [
    "Rising interest rates affect debt costs",
    "Major tenant represents 40% of revenue",
    "Supply chain issues delay projects",
    "New regulations increase compliance",
    "Competitive pressure on pricing",
]
ndcg = calculate_ndcg_at_k(predicted, golden, k=5)
print(f"NDCG@5: {ndcg:.4f}")
print("Target: >0.70")
if ndcg > 0.70:
    print("✓ PASS")
else:
    print("⚠ Below target")

Test 2.2: Similar content, good ranking
NDCG@5: 1.0000
Target: >0.70
✓ PASS


In [27]:
# Test 2.3: Real golden dataset example
print("Test 2.3: Using actual AMT golden risks")

# Get AMT (American Tower) golden risks, ordered by their golden rank
amt_risks = df[df['ticker'] == 'AMT'].sort_values('risk_rank')
golden_ranking = amt_risks['risk_description'].tolist()

print("\nGolden ranking for AMT:")
for i, risk in enumerate(golden_ranking, 1):
    # Only mark the text as truncated when something was actually cut off.
    preview = risk if len(risk) <= 80 else f"{risk[:80]}..."
    print(f"{i}. {preview}")

# Hand-written prediction. It is not a pure reordering of the golden list:
# the debt/interest-rate item has no golden counterpart, and the golden
# ground-lease risk is not predicted at all.
# NOTE(review): the recorded run still reported NDCG@5 = 1.0000 for this
# input. Verify that calculate_ndcg_at_k can return less than 1.0 for an
# imperfect prediction before trusting this metric.
predicted_ranking = [
    "Tenant concentration risk from major carriers",
    "International operations face currency and regulatory risks",
    "Competition from other tower companies and new technologies",
    "High debt levels create refinancing and interest rate risk",
    "Wireless technology changes may reduce tower demand",
]

ndcg = calculate_ndcg_at_k(predicted_ranking, golden_ranking, k=5)
print(f"\nNDCG@5: {ndcg:.4f}")
print("Target: >0.70")
print("✓ PASS" if ndcg > 0.70 else "⚠ Below target (expected for test data)")

Test 2.3: Using actual AMT golden risks

Golden ranking for AMT:
1. Substantial revenue from small number of wireless carriers (AT&T, Verizon, T-Mob...
2. Operates in 25+ countries with revenues in multiple currencies...
3. Carriers consolidating or sharing tower infrastructure...
4. New technologies (satellite, small cells, future 6G) may reduce tower demand...
5. Leases land under many towers rather than owning it...

NDCG@5: 1.0000
Target: >0.70
✓ PASS


In [28]:
# Display the full golden risk texts for AMT (Test 2.3 prints only a truncated preview).
golden_ranking

['Substantial revenue from small number of wireless carriers (AT&T, Verizon, T-Mobile)',
 'Operates in 25+ countries with revenues in multiple currencies',
 'Carriers consolidating or sharing tower infrastructure',
 'New technologies (satellite, small cells, future 6G) may reduce tower demand',
 'Leases land under many towers rather than owning it']

In [31]:
# Display the hand-written predicted risks that Test 2.3 ranks against the golden list.
predicted_ranking

['Tenant concentration risk from major carriers',
 'International operations face currency and regulatory risks',
 'Competition from other tower companies and new technologies',
 'High debt levels create refinancing and interest rate risk',
 'Wireless technology changes may reduce tower demand']

In [30]:
# Pairwise cosine similarity between the Test 2.3 embeddings.
# Rows follow predicted_ranking, columns follow golden_ranking.
# cosine_similarity must already have been imported by an earlier cell.
gold = sim_calc.model.encode(golden_ranking)
pred = sim_calc.model.encode(predicted_ranking)
cos_sim = cosine_similarity(pred,gold)
# Last expression: show the full matrix.
cos_sim

array([[0.3411057 , 0.18880501, 0.44427228, 0.28232303, 0.45498982],
       [0.20950867, 0.5786177 , 0.22136717, 0.04358912, 0.16469027],
       [0.43163162, 0.22176574, 0.62819123, 0.60248256, 0.4119779 ],
       [0.10354473, 0.08538782, 0.05507487, 0.12418766, 0.18801267],
       [0.46605054, 0.06472837, 0.5198926 , 0.7498628 , 0.33226296]],
      dtype=float32)

## Test 3: Sector-Specificity

In [None]:
# Prepare sector-specific risks from golden dataset
print("Preparing sector data for specificity testing...")

# Build {sector: [risk_description, ...]} in a single pass over the frame.
# sort=False keeps sectors in first-appearance order, matching what iterating
# over df['sector'].unique() would produce. Rows without a sector are skipped.
all_sectors_risks = (
    df.groupby('sector', sort=False)['risk_description']
    .agg(list)
    .to_dict()
)

print(f"Loaded {len(all_sectors_risks)} sectors:")
# Distinct loop names so this cell does not overwrite the notebook-level
# `sector` / `risks` variables used by other cells.
for sector_name, sector_risks in all_sectors_risks.items():
    print(f"  - {sector_name}: {len(sector_risks)} risks")

In [None]:
# Test 3.1: a tower-focused risk scored against the towers sector should
# come out above the 0.50 threshold.
print("Test 3.1: Tower-specific risk")
sector = "Infrastructure/Towers"
risk = "5G infrastructure deployment requires significant tower upgrades and capital investment"

score = calculate_sector_specificity(risk, sector, all_sectors_risks)
print(
    f"Risk: {risk}",
    f"Sector: {sector}",
    f"Specificity Score: {score:.4f}",
    sep="\n",
)
if score > 0.50:
    print("✓ PASS (sector-specific)")
else:
    print("⚠ Lower than expected")

In [None]:
# Test 3.2: a generic macro-economic risk should stay below 0.60 when scored
# against any single sector (here: Industrial/Logistics).
print("Test 3.2: Generic economic risk")
sector = "Industrial/Logistics"
risk = "Economic recession may negatively impact overall business performance and revenues"

score = calculate_sector_specificity(risk, sector, all_sectors_risks)
print(
    f"Risk: {risk}",
    f"Sector: {sector}",
    f"Specificity Score: {score:.4f}",
    sep="\n",
)
if score < 0.60:
    print("✓ PASS (generic, lower score)")
else:
    print("⚠ Higher than expected")

In [None]:
# Test 3.3: Compare specificity across different sectors
print("Test 3.3: Same risk tested against different sectors\n")

# One clearly sector-flavoured risk each for logistics, healthcare and data centers.
test_risks = [
    "E-commerce growth drives demand for warehouse and distribution facilities",
    "Medicare reimbursement rate changes affect healthcare facility operators",
    "Data center power and cooling costs continue to rise",
]

for risk_text in test_risks:
    # Only mark the text as truncated when something was actually cut off.
    preview = risk_text if len(risk_text) <= 70 else f"{risk_text[:70]}..."
    print(f"Risk: {preview}")

    # Score the same risk against every sector in the golden dataset.
    scores = {
        sector_name: calculate_sector_specificity(risk_text, sector_name, all_sectors_risks)
        for sector_name in all_sectors_risks
    }

    # Highest-scoring sector first.
    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    print("  Sector specificity scores:")
    for sector_name, sector_score in sorted_scores:
        print(f"    {sector_name:30s}: {sector_score:.4f}")
    print()

## Test 4: Complete Evaluation

In [None]:
# Test 4.1: run all three metrics together on one ticker (PLD).
print("Test 4.1: Complete evaluation with all three metrics\n")

# Golden PLD rows in rank order, plus the sector label they share.
pld_data = df.loc[df['ticker'] == 'PLD'].sort_values('risk_rank')
sector_pld = pld_data['sector'].iloc[0]
golden_risks_pld = pld_data['risk_description'].tolist()

print(
    "Ticker: PLD",
    f"Sector: {sector_pld}",
    f"Golden risks: {len(golden_risks_pld)}\n",
    sep="\n",
)

# Simulated model output for PLD.
predicted_risks_pld = [
    "Economic conditions impact demand for industrial real estate",
    "Competition from other logistics REIT developers",
    "Rising interest rates increase debt service costs",
    "E-commerce trends drive warehouse space requirements",
    "Supply chain disruptions affect tenant operations",
]

metrics = evaluate_summary(
    generated_risks=predicted_risks_pld,
    golden_risks=golden_risks_pld,
    sector=sector_pld,
    all_sectors_risks=all_sectors_risks,
)

print("Evaluation Results:")
print(f"  Semantic Similarity: {metrics['semantic_similarity']:.4f} (target >0.75)")
print(f"  NDCG@5:             {metrics['ndcg_at_5']:.4f} (target >0.70)")
print(f"  Sector-Specificity: {metrics['sector_specificity']:.4f} (target >0.40)")

# Every metric must strictly exceed its target for the overall check to pass.
metric_floors = (
    ('semantic_similarity', 0.75),
    ('ndcg_at_5', 0.70),
    ('sector_specificity', 0.40),
)
targets_met = all(metrics[name] > floor for name, floor in metric_floors)
outcome = '✓ All targets met!' if targets_met else '⚠ Some targets not met (expected for simulated data)'
print(f"\n{outcome}")

In [None]:
# Test 4.2: Perfect summary (using golden as predicted)
# Reuses golden_risks_pld and sector_pld from the Test 4.1 cell.
print("Test 4.2: Perfect match (golden risks as predicted)\n")

metrics_perfect = evaluate_summary(
    generated_risks=golden_risks_pld,
    golden_risks=golden_risks_pld,
    sector=sector_pld,
    all_sectors_risks=all_sectors_risks,
)

print("Evaluation Results (Perfect Match):")
print(f"  Semantic Similarity: {metrics_perfect['semantic_similarity']:.4f}")
print(f"  NDCG@5:             {metrics_perfect['ndcg_at_5']:.4f}")
print(f"  Sector-Specificity: {metrics_perfect['sector_specificity']:.4f}")

# Actually verify the expectation instead of announcing success unconditionally.
# 0.99 is the same "near 1.0" threshold used by Tests 1.1 and 2.1.
perfect_match_ok = (
    metrics_perfect['semantic_similarity'] > 0.99
    and metrics_perfect['ndcg_at_5'] > 0.99
)
if perfect_match_ok:
    print("\n✓ Similarity and NDCG near 1.0 as expected")
else:
    print("\n✗ FAIL: Similarity and/or NDCG not near 1.0 for identical inputs")

In [None]:
# Test 4.3: run the full evaluation for several tickers and tabulate the results.
print("Test 4.3: Batch evaluation across multiple tickers\n")

results = []
# Only the first five tickers (in dataset order) are evaluated.
for ticker in df['ticker'].unique()[:5]:
    ticker_data = df.loc[df['ticker'] == ticker].sort_values('risk_rank')
    sector = ticker_data['sector'].iloc[0]
    golden = ticker_data['risk_description'].tolist()

    # Golden risks double as the prediction, i.e. a perfect-match scenario.
    metrics = evaluate_summary(
        generated_risks=golden,
        golden_risks=golden,
        sector=sector,
        all_sectors_risks=all_sectors_risks,
    )

    row = {'ticker': ticker, 'sector': sector}
    row.update(metrics)
    results.append(row)

# One row per ticker; built once from the collected records.
results_df = pd.DataFrame(results)
print("Results for perfect matches (golden = predicted):")
print(results_df.to_string(index=False))

mean_similarity = results_df['semantic_similarity'].mean()
mean_ndcg = results_df['ndcg_at_5'].mean()
mean_specificity = results_df['sector_specificity'].mean()
print("\nAverage Metrics:")
print(f"  Semantic Similarity: {mean_similarity:.4f}")
print(f"  NDCG@5:             {mean_ndcg:.4f}")
print(f"  Sector-Specificity: {mean_specificity:.4f}")

## Test 5: Model Caching

In [None]:
# Test 5.1: two calls to get_embedding_model() should hand back the very same object.
print("Test 5.1: Model caching verification\n")

model1 = get_embedding_model()
model2 = get_embedding_model()
same_instance = model1 is model2

print(
    f"Model 1 ID: {id(model1)}",
    f"Model 2 ID: {id(model2)}",
    f"Same instance: {same_instance}",
    sep="\n",
)
if same_instance:
    print("\n✓ PASS: Model is cached globally")
else:
    print("✗ FAIL: Models are different instances")

In [None]:
# Test 5.2: Performance comparison
# `time` comes from the notebook's top import cell.
print("Test 5.2: Performance with cached model\n")

test_risks = [
    "Interest rate fluctuations impact refinancing",
    "Tenant concentration creates revenue dependency",
    "Economic conditions affect property demand",
]

# Number of timed repetitions; used for both the loop and the reported average.
N_RUNS = 10

# Warm up so one-off initialization cost is not included in the timing.
sim_calc.calculate_similarity(test_risks[:1], test_risks[:1])

# perf_counter() is monotonic and high-resolution, unlike wall-clock
# time.time(), which can jump if the system clock is adjusted.
start = time.perf_counter()
for _ in range(N_RUNS):
    sim_calc.calculate_similarity(test_risks, test_risks)
elapsed = time.perf_counter() - start

print(f"{N_RUNS} similarity calculations: {elapsed:.3f} seconds")
print(f"Average per calculation: {elapsed / N_RUNS:.4f} seconds")
# NOTE(review): informational only — no speed threshold is checked here.
print("\n✓ Fast calculations with cached model")

## Summary

### Metrics Implementation Status

✅ **Semantic Similarity**
- Uses sentence-transformers (all-MiniLM-L6-v2)
- Cosine similarity with best-match scoring: each predicted risk is scored against its closest golden risk, and the scores are averaged
- Target: >0.75

✅ **NDCG@5**
- Semantic relevance scoring (not token matching)
- Measures ranking quality of top 5 risks
- Target: >0.70

✅ **Sector-Specificity**
- Embedding-based comparison across sectors
- Detects generic vs. sector-specific risks
- Target: >0.40

### Next Steps
- Integrate metrics into evaluation runner
- Run full evaluation on all 10 golden dataset REITs
- Generate evaluation reports with metric scores