# Matching Review Notebook

Use this notebook to review ambiguous matches from the pipeline's review queue
and create manual mapping overrides.

In [None]:
import sys
sys.path.insert(0, '../src')

import pandas as pd
from pathlib import Path

# Notebook configuration: the pipeline year whose output files are reviewed,
# and the data root (relative to this notebook's working directory).
YEAR = 2026
DATA_DIR = Path('..') / 'data'

## Load Review Queue

In [None]:
# Read the review-queue CSV for the configured year from <DATA_DIR>/matched.
review = pd.read_csv(DATA_DIR.joinpath('matched', f'review_queue_{YEAR}.csv'))
print(f'Review queue: {len(review)} records')

# Preview the first 20 rows as the cell's rich output.
review.head(n=20)

## Load Master Matched Table

In [None]:
# Read the master matched table for the configured year from <DATA_DIR>/matched.
master = pd.read_csv(DATA_DIR.joinpath('matched', f'master_matched_{YEAR}.csv'))
print(f'Master table: {len(master)} records')

# Preview only the identifying and match-quality columns of the first 20 rows.
master.loc[:, [
    'source_id',
    'building_name_raw',
    'address_raw',
    'match_confidence',
    'match_method',
]].head(n=20)

## Match Confidence Distribution

In [None]:
# Summary statistics (count, mean, quartiles, ...) of the confidence column.
master.loc[:, 'match_confidence'].describe()

In [None]:
# Number of records per match method (missing values are not counted by default).
master.loc[:, 'match_method'].value_counts()

## Review Low-Confidence Matches

Look at matches below confidence 70 to decide if they should be kept or rejected.

In [None]:
# Matches with a confidence strictly below this value are pulled out for
# manual review. Change it here instead of editing the filter expression.
LOW_CONFIDENCE_THRESHOLD = 70

# Keep rows under the threshold, weakest matches first.
# Rows whose match_confidence is missing compare as False and are therefore
# not included in this view.
low_conf = (
    master.loc[master['match_confidence'] < LOW_CONFIDENCE_THRESHOLD]
    .sort_values('match_confidence')
)
print(f'{len(low_conf)} low-confidence matches')

# Show the columns needed to judge a match, for the 30 lowest-confidence rows.
low_conf[['source_id', 'building_name_raw', 'address_raw', 'energy_grade',
          'match_confidence', 'match_method', 'match_notes']].head(30)

## Create Manual Mapping

After reviewing, record a decision (`match` or `reject`) for each ambiguous match.
Uncomment and edit the cell below to build your manual mapping.

In [None]:
# Example manual mapping decisions
# Uncomment and edit:
#
# Each dict below is one reviewer decision. The two examples show:
#   - decision 'match':  both leed_source_id and nyc_source_id are filled in.
#   - decision 'reject': nyc_source_id is left as an empty string.
# NOTE(review): the accepted `decision` values and id formats are presumably
# defined by whatever reads manual_mapping.csv - confirm against that code.
# NOTE(review): the output file name is not year-specific, and to_csv replaces
# any existing file at that path, so re-running this cell overwrites earlier
# decisions. The DATA_DIR / 'interim' directory must already exist.

# manual_mapping = pd.DataFrame([
#     {'leed_source_id': 'LEED_12345', 'nyc_source_id': 'NYC_67890', 'decision': 'match', 'notes': 'Confirmed via Google Maps'},
#     {'leed_source_id': 'LEED_11111', 'nyc_source_id': '', 'decision': 'reject', 'notes': 'Building demolished'},
# ])
# manual_mapping.to_csv(DATA_DIR / 'interim' / 'manual_mapping.csv', index=False)
# print(f'Saved {len(manual_mapping)} manual decisions')