In [2]:
"""Sets the notebook to the project root"""

import sys, os
PROJECT_NAME = "analysis"
current_directory = os.getcwd().split("/")
while current_directory[-1] != PROJECT_NAME:
    current_directory = current_directory[:-1]
if len(current_directory) > 1:
    project_root = "/".join(current_directory)
    os.chdir(project_root)
else:
    raise Exception("ERROR: Project root not found")

from dotenv import load_dotenv
load_dotenv(dotenv_path="./env/env.env")

True

In [3]:
"""Initializes the database connection"""

import os
from bson import ObjectId
from src.classes.mongo_client import initialize

# Get MongoDB connection strings from environment variables
connection_strings = {
    "stg": os.getenv("MONGO_STAGE_URI"),
    "prd": os.getenv("MONGO_PROD_URI")
}

# Initialize database connections
dbs = initialize(connection_strings)



Database connections initialized successfully!
Staging DB: MongoClient(host=['landmarksid-staging-shard-00-01-me4vi.mongodb.net:27017', 'landmarksid-staging-shard-00-02-me4vi.mongodb.net:27017', 'landmarksid-staging-shard-00-00-me4vi.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', replicaset='landmarksid-staging-shard-0', tls=True)
Production DB: MongoClient(host=['production-shard-00-02.gw4dh.mongodb.net:27017', 'production-shard-00-01.gw4dh.mongodb.net:27017', 'production-shard-00-00.gw4dh.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', replicaset='atlas-4v0y1q-shard-0', tls=True)


In [None]:


sitecards = list(dbs["stg"]["poinetwork"]["sitecards"].find())


# example_sitecard
"""
{
  "_id": {
    "$oid": "6838f84cb9e439bacda22d8a"
  },
  "processed_data": {
    "site_name": "Not found",
    "panel_id": "Not found",
    "address": "Not found",
    "suburb": "Not found",
    "postcode": "Not found",
    "coordinates": "Not found",
    "PointOfInterest": "Roadside",
    "IndoorOrOutdoor": "Outdoor",
    "StaticOrDigital": "Digital",
    "mediaOwner": "Go Media"
  },
  "metadata": {
    "media_owner": "Go",
    "processed_at": {
      "$date": "2025-05-28T10:58:43.735Z"
    },
    "filename": "Go_Media-_ALL_DIGI_page_1.png",
    "model_used": "gemini-1.5-pro-latest",
    "gcs_url": "gs://media-owner-sitecards/sitecards/Go_Media-_ALL_DIGI_page_1.png"
  },
  "reviewed": {
    "image2txt_reviewed_by": null,
    "image2txt_reviewed_datetime": null,
    "poinetwork_match_reviewed_by": null,
    "poinetwork_match_reviewed_datetime": null,
    "poinetwork_match_successful_candidate_id": null,
    "review_notes": null
  },
  "last_poi_candidate_update": {
    "$date": "2025-06-03T04:48:08.482Z"
  },
  "poi_candidates": []
}
"""



In [8]:
# Analysis: Sitecards with candidates pending review, grouped by media owner

import pandas as pd
from collections import defaultdict

sitecards = list(dbs["stg"]["poinetwork"]["sitecards"].find())

# Filter sitecards that:
# 1. Have POI candidates (non-empty poi_candidates array)
# 2. Have NOT been reviewed (poinetwork_match_reviewed_by is null AND review_notes is null)
pending_review_sitecards = []

for sitecard in sitecards:
    # Check if sitecard has candidates
    has_candidates = (
        'poi_candidates' in sitecard and 
        sitecard['poi_candidates'] and 
        len(sitecard['poi_candidates']) > 0
    )
    
    # Check if sitecard has NOT been reviewed
    reviewed = sitecard.get('reviewed', {})
    not_reviewed = (
        reviewed.get('poinetwork_match_reviewed_by') is None and
        reviewed.get('review_notes') is None
    )
    
    if has_candidates and not_reviewed:
        pending_review_sitecards.append(sitecard)

print(f"Total sitecards: {len(sitecards)}")
print(f"Sitecards with candidates pending review: {len(pending_review_sitecards)}")
print()

# Group by media owner
media_owner_metrics = defaultdict(lambda: {
    'count': 0,
    'sitecard_ids': [],
    'filenames': []
})

for sitecard in pending_review_sitecards:
    media_owner = sitecard.get('processed_data', {}).get('mediaOwner', 'Unknown')
    media_owner_metrics[media_owner]['count'] += 1
    media_owner_metrics[media_owner]['sitecard_ids'].append(str(sitecard['_id']))
    
    filename = sitecard.get('metadata', {}).get('filename', 'Unknown')
    media_owner_metrics[media_owner]['filenames'].append(filename)

# Display metrics by media owner
print("=== SITECARDS WITH CANDIDATES PENDING REVIEW BY MEDIA OWNER ===")
print()

total_pending = 0
for media_owner, metrics in sorted(media_owner_metrics.items()):
    count = metrics['count']
    total_pending += count
    
    print(f"📊 {media_owner}: {count} sitecards pending review")
    print(f"   Sample IDs: {metrics['sitecard_ids'][:3]}{'...' if len(metrics['sitecard_ids']) > 3 else ''}")
    print()

print(f"🔢 TOTAL PENDING REVIEW: {total_pending}")
print()

# Create a summary DataFrame for easy visualization
summary_data = []
for media_owner, metrics in media_owner_metrics.items():
    summary_data.append({
        'media_owner': media_owner,
        'pending_review_count': metrics['count'],
        'percentage_of_pending': round((metrics['count'] / total_pending * 100), 1) if total_pending > 0 else 0
    })

print("📈 SUMMARY TABLE:")
if summary_data:
    df_summary = pd.DataFrame(summary_data).sort_values('pending_review_count', ascending=False)
    print(df_summary.to_string(index=False))
else:
    print("✅ No pending reviews - all sitecards with candidates have been reviewed!")
print()

# Calculate completion percentage
sitecards_with_candidates = len([sc for sc in sitecards if 'poi_candidates' in sc and len(sc['poi_candidates']) > 0])
sitecards_reviewed = len([sc for sc in sitecards if sc.get('reviewed', {}).get('poinetwork_match_reviewed_by') is not None])

# Completion rate = (sitecards with candidates that have been reviewed) / (total sitecards with candidates) * 100
completed_with_candidates = sitecards_with_candidates - total_pending
completion_percentage = (completed_with_candidates / sitecards_with_candidates * 100) if sitecards_with_candidates > 0 else 0

print("=== COMPLETION METRICS ===")
print(f"🎯 COMPLETION RATE: {completion_percentage:.2f}%")
print(f"   ({completed_with_candidates:,} completed out of {sitecards_with_candidates:,} sitecards with candidates)")
print()

# Debug the discrepancy - find sitecards with candidates but not fully reviewed
debug_sitecards = []
for sitecard in sitecards:
    has_candidates = (
        'poi_candidates' in sitecard and 
        sitecard['poi_candidates'] and 
        len(sitecard['poi_candidates']) > 0
    )
    
    reviewed = sitecard.get('reviewed', {})
    has_reviewer = reviewed.get('poinetwork_match_reviewed_by') is not None
    has_notes = reviewed.get('review_notes') is not None
    
    if has_candidates and not has_reviewer:
        debug_sitecards.append({
            'id': str(sitecard['_id']),
            'media_owner': sitecard.get('processed_data', {}).get('mediaOwner', 'Unknown'),
            'has_reviewer': has_reviewer,
            'has_notes': has_notes,
            'reviewer': reviewed.get('poinetwork_match_reviewed_by'),
            'notes': reviewed.get('review_notes')
        })

print("=== DEBUG: SITECARDS WITH CANDIDATES BUT NO REVIEWER ===")
for debug in debug_sitecards:
    print(f"ID: {debug['id']}")
    print(f"   Media Owner: {debug['media_owner']}")
    print(f"   Has Reviewer: {debug['has_reviewer']}")
    print(f"   Has Notes: {debug['has_notes']}")
    print(f"   Reviewer Value: {debug['reviewer']}")
    print(f"   Notes Value: {debug['notes']}")
    print()

# Additional insights
print("=== ADDITIONAL INSIGHTS ===")
print(f"• {sitecards_with_candidates:,} total sitecards have candidates")
print(f"• {sitecards_reviewed:,} total sitecards have been reviewed")
print(f"• {len(debug_sitecards)} sitecards with candidates have no reviewer assigned")
print(f"• {round((total_pending / len(sitecards) * 100), 1)}% of all sitecards have candidates but are pending review")
print()

# Detailed list of sitecards that still need review
print("=== SITECARDS THAT STILL NEED REVIEW ===")
if pending_review_sitecards:
    for i, sitecard in enumerate(pending_review_sitecards, 1):
        print(f"🔍 #{i}")
        print(f"   ID: {sitecard['_id']}")
        print(f"   Media Owner: {sitecard.get('processed_data', {}).get('mediaOwner', 'Unknown')}")
        print(f"   Filename: {sitecard.get('metadata', {}).get('filename', 'Unknown')}")
        print(f"   Candidates Count: {len(sitecard.get('poi_candidates', []))}")
        print(f"   Last Update: {sitecard.get('last_poi_candidate_update', 'Unknown')}")
        
        # Show some processed data
        processed_data = sitecard.get('processed_data', {})
        print(f"   Site Name: {processed_data.get('site_name', 'Unknown')}")
        print(f"   Address: {processed_data.get('address', 'Unknown')}")
        print(f"   POI Type: {processed_data.get('PointOfInterest', 'Unknown')}")
        
        # Show first few candidates if available
        candidates = sitecard.get('poi_candidates', [])
        if candidates:
            print(f"   Sample Candidate IDs: {[str(c.get('_id', 'Unknown')) for c in candidates[:3]]}")
        
        print(f"   GCS URL: {sitecard.get('metadata', {}).get('gcs_url', 'Unknown')}")
        print()
else:
    print("✅ All sitecards with candidates have been reviewed!")

Total sitecards: 1645
Sitecards with candidates pending review: 0

=== SITECARDS WITH CANDIDATES PENDING REVIEW BY MEDIA OWNER ===

🔢 TOTAL PENDING REVIEW: 0

📈 SUMMARY TABLE:
✅ No pending reviews - all sitecards with candidates have been reviewed!

=== COMPLETION METRICS ===
🎯 COMPLETION RATE: 100.00%
   (1,303 completed out of 1,303 sitecards with candidates)

=== DEBUG: SITECARDS WITH CANDIDATES BUT NO REVIEWER ===
ID: 683e9e3511492ca8aea8e0c4
   Media Owner: VAST Billboards
   Has Reviewer: False
   Has Notes: True
   Reviewer Value: None
   Notes Value: Not a Site Card

=== ADDITIONAL INSIGHTS ===
• 1,303 total sitecards have candidates
• 1,302 total sitecards have been reviewed
• 1 sitecards with candidates have no reviewer assigned
• 0.0% of all sitecards have candidates but are pending review

=== SITECARDS THAT STILL NEED REVIEW ===
✅ All sitecards with candidates have been reviewed!


In [None]:
# Analysis: Sitecards with candidates pending review, grouped by media owner

import pandas as pd
from collections import defaultdict

sitecards = list(dbs["stg"]["poinetwork"]["sitecards"].find())

# Filter sitecards that:
# 1. Have POI candidates (non-empty poi_candidates array)
# 2. Have NOT been reviewed (poinetwork_match_reviewed_by is null AND review_notes is null)
pending_review_sitecards = []

for sitecard in sitecards:
    # Check if sitecard has candidates
    has_candidates = (
        'poi_candidates' in sitecard and 
        sitecard['poi_candidates'] and 
        len(sitecard['poi_candidates']) > 0
    )
    
    # Check if sitecard has NOT been reviewed
    reviewed = sitecard.get('reviewed', {})
    not_reviewed = (
        reviewed.get('poinetwork_match_reviewed_by') is None and
        reviewed.get('review_notes') is None
    )
    
    if has_candidates and not_reviewed:
        pending_review_sitecards.append(sitecard)

print(f"Total sitecards: {len(sitecards)}")
print(f"Sitecards with candidates pending review: {len(pending_review_sitecards)}")
print()

# Group by media owner
media_owner_metrics = defaultdict(lambda: {
    'count': 0,
    'sitecard_ids': [],
    'filenames': []
})

for sitecard in pending_review_sitecards:
    media_owner = sitecard.get('processed_data', {}).get('mediaOwner', 'Unknown')
    media_owner_metrics[media_owner]['count'] += 1
    media_owner_metrics[media_owner]['sitecard_ids'].append(str(sitecard['_id']))
    
    filename = sitecard.get('metadata', {}).get('filename', 'Unknown')
    media_owner_metrics[media_owner]['filenames'].append(filename)

# Display metrics by media owner
print("=== SITECARDS WITH CANDIDATES PENDING REVIEW BY MEDIA OWNER ===")
print()

total_pending = 0
for media_owner, metrics in sorted(media_owner_metrics.items()):
    count = metrics['count']
    total_pending += count
    
    print(f"📊 {media_owner}: {count} sitecards pending review")
    print(f"   Sample IDs: {metrics['sitecard_ids'][:3]}{'...' if len(metrics['sitecard_ids']) > 3 else ''}")
    print()

print(f"🔢 TOTAL PENDING REVIEW: {total_pending}")
print()

# Create a summary DataFrame for easy visualization
summary_data = []
for media_owner, metrics in media_owner_metrics.items():
    summary_data.append({
        'media_owner': media_owner,
        'pending_review_count': metrics['count'],
        'percentage_of_pending': round((metrics['count'] / total_pending * 100), 1) if total_pending > 0 else 0
    })

df_summary = pd.DataFrame(summary_data).sort_values('pending_review_count', ascending=False)
print("📈 SUMMARY TABLE:")
print(df_summary.to_string(index=False))
print()

# Calculate completion percentage
sitecards_with_candidates = len([sc for sc in sitecards if 'poi_candidates' in sc and len(sc['poi_candidates']) > 0])
sitecards_reviewed = len([sc for sc in sitecards if sc.get('reviewed', {}).get('poinetwork_match_reviewed_by') is not None])

# Completion rate = (sitecards with candidates that have been reviewed) / (total sitecards with candidates) * 100
completed_with_candidates = sitecards_with_candidates - total_pending
completion_percentage = (completed_with_candidates / sitecards_with_candidates * 100) if sitecards_with_candidates > 0 else 0

print("=== COMPLETION METRICS ===")
print(f"🎯 COMPLETION RATE: {completion_percentage:.2f}%")
print(f"   ({completed_with_candidates:,} completed out of {sitecards_with_candidates:,} sitecards with candidates)")
print()

# Debug the discrepancy - find sitecards with candidates but not fully reviewed
debug_sitecards = []
for sitecard in sitecards:
    has_candidates = (
        'poi_candidates' in sitecard and 
        sitecard['poi_candidates'] and 
        len(sitecard['poi_candidates']) > 0
    )
    
    reviewed = sitecard.get('reviewed', {})
    has_reviewer = reviewed.get('poinetwork_match_reviewed_by') is not None
    has_notes = reviewed.get('review_notes') is not None
    
    if has_candidates and not has_reviewer:
        debug_sitecards.append({
            'id': str(sitecard['_id']),
            'media_owner': sitecard.get('processed_data', {}).get('mediaOwner', 'Unknown'),
            'has_reviewer': has_reviewer,
            'has_notes': has_notes,
            'reviewer': reviewed.get('poinetwork_match_reviewed_by'),
            'notes': reviewed.get('review_notes')
        })

print("=== DEBUG: SITECARDS WITH CANDIDATES BUT NO REVIEWER ===")
for debug in debug_sitecards:
    print(f"ID: {debug['id']}")
    print(f"   Media Owner: {debug['media_owner']}")
    print(f"   Has Reviewer: {debug['has_reviewer']}")
    print(f"   Has Notes: {debug['has_notes']}")
    print(f"   Reviewer Value: {debug['reviewer']}")
    print(f"   Notes Value: {debug['notes']}")
    print()

# Additional insights
print("=== ADDITIONAL INSIGHTS ===")
print(f"• {sitecards_with_candidates:,} total sitecards have candidates")
print(f"• {sitecards_reviewed:,} total sitecards have been reviewed")
print(f"• {len(debug_sitecards)} sitecards with candidates have no reviewer assigned")
print(f"• {round((total_pending / len(sitecards) * 100), 1)}% of all sitecards have candidates but are pending review")
print()

# Detailed list of sitecards that still need review
print("=== SITECARDS THAT STILL NEED REVIEW ===")
if pending_review_sitecards:
    for i, sitecard in enumerate(pending_review_sitecards, 1):
        print(f"🔍 #{i}")
        print(f"   ID: {sitecard['_id']}")
        print(f"   Media Owner: {sitecard.get('processed_data', {}).get('mediaOwner', 'Unknown')}")
        print(f"   Filename: {sitecard.get('metadata', {}).get('filename', 'Unknown')}")
        print(f"   Candidates Count: {len(sitecard.get('poi_candidates', []))}")
        print(f"   Last Update: {sitecard.get('last_poi_candidate_update', 'Unknown')}")
        
        # Show some processed data
        processed_data = sitecard.get('processed_data', {})
        print(f"   Site Name: {processed_data.get('site_name', 'Unknown')}")
        print(f"   Address: {processed_data.get('address', 'Unknown')}")
        print(f"   POI Type: {processed_data.get('PointOfInterest', 'Unknown')}")
        
        # Show first few candidates if available
        candidates = sitecard.get('poi_candidates', [])
        if candidates:
            print(f"   Sample Candidate IDs: {[str(c.get('_id', 'Unknown')) for c in candidates[:3]]}")
        
        print(f"   GCS URL: {sitecard.get('metadata', {}).get('gcs_url', 'Unknown')}")
        print()
else:
    print("✅ All sitecards with candidates have been reviewed!")

Total sitecards: 1645
Sitecards with candidates pending review: 0

=== SITECARDS WITH CANDIDATES PENDING REVIEW BY MEDIA OWNER ===

🔢 TOTAL PENDING REVIEW: 0



KeyError: 'pending_review_count'

In [None]:
# Analysis: Sitecards with candidates pending review, grouped by media owner

import pandas as pd
from collections import defaultdict

sitecards = list(dbs["stg"]["poinetwork"]["sitecards"].find())

# Filter sitecards that:
# 1. Have POI candidates (non-empty poi_candidates array)
# 2. Have NOT been reviewed (poinetwork_match_reviewed_by is null AND review_notes is null)
pending_review_sitecards = []

for sitecard in sitecards:
    # Check if sitecard has candidates
    has_candidates = (
        'poi_candidates' in sitecard and 
        sitecard['poi_candidates'] and 
        len(sitecard['poi_candidates']) > 0
    )
    
    # Check if sitecard has NOT been reviewed
    reviewed = sitecard.get('reviewed', {})
    not_reviewed = (
        reviewed.get('poinetwork_match_reviewed_by') is None and
        reviewed.get('review_notes') is None
    )
    
    if has_candidates and not_reviewed:
        pending_review_sitecards.append(sitecard)

print(f"Total sitecards: {len(sitecards)}")
print(f"Sitecards with candidates pending review: {len(pending_review_sitecards)}")
print()

# Group by media owner
media_owner_metrics = defaultdict(lambda: {
    'count': 0,
    'sitecard_ids': [],
    'filenames': []
})

for sitecard in pending_review_sitecards:
    media_owner = sitecard.get('processed_data', {}).get('mediaOwner', 'Unknown')
    media_owner_metrics[media_owner]['count'] += 1
    media_owner_metrics[media_owner]['sitecard_ids'].append(str(sitecard['_id']))
    
    filename = sitecard.get('metadata', {}).get('filename', 'Unknown')
    media_owner_metrics[media_owner]['filenames'].append(filename)

# Display metrics by media owner
print("=== SITECARDS WITH CANDIDATES PENDING REVIEW BY MEDIA OWNER ===")
print()

total_pending = 0
for media_owner, metrics in sorted(media_owner_metrics.items()):
    count = metrics['count']
    total_pending += count
    
    print(f"📊 {media_owner}: {count} sitecards pending review")
    print(f"   Sample IDs: {metrics['sitecard_ids'][:3]}{'...' if len(metrics['sitecard_ids']) > 3 else ''}")
    print()

print(f"🔢 TOTAL PENDING REVIEW: {total_pending}")
print()

# Create a summary DataFrame for easy visualization
summary_data = []
for media_owner, metrics in media_owner_metrics.items():
    summary_data.append({
        'media_owner': media_owner,
        'pending_review_count': metrics['count'],
        'percentage_of_pending': round((metrics['count'] / total_pending * 100), 1) if total_pending > 0 else 0
    })

df_summary = pd.DataFrame(summary_data).sort_values('pending_review_count', ascending=False)
print("📈 SUMMARY TABLE:")
print(df_summary.to_string(index=False))
print()

# Calculate completion percentage
sitecards_with_candidates = len([sc for sc in sitecards if 'poi_candidates' in sc and len(sc['poi_candidates']) > 0])
sitecards_reviewed = len([sc for sc in sitecards if sc.get('reviewed', {}).get('poinetwork_match_reviewed_by') is not None])

# Completion rate = (sitecards with candidates that have been reviewed) / (total sitecards with candidates) * 100
completed_with_candidates = sitecards_with_candidates - total_pending
completion_percentage = (completed_with_candidates / sitecards_with_candidates * 100) if sitecards_with_candidates > 0 else 0

print("=== COMPLETION METRICS ===")
print(f"🎯 COMPLETION RATE: {completion_percentage:.2f}%")
print(f"   ({completed_with_candidates:,} completed out of {sitecards_with_candidates:,} sitecards with candidates)")
print()

# Debug the discrepancy - find sitecards with candidates but not fully reviewed
debug_sitecards = []
for sitecard in sitecards:
    has_candidates = (
        'poi_candidates' in sitecard and 
        sitecard['poi_candidates'] and 
        len(sitecard['poi_candidates']) > 0
    )
    
    reviewed = sitecard.get('reviewed', {})
    has_reviewer = reviewed.get('poinetwork_match_reviewed_by') is not None
    has_notes = reviewed.get('review_notes') is not None
    
    if has_candidates and not has_reviewer:
        debug_sitecards.append({
            'id': str(sitecard['_id']),
            'media_owner': sitecard.get('processed_data', {}).get('mediaOwner', 'Unknown'),
            'has_reviewer': has_reviewer,
            'has_notes': has_notes,
            'reviewer': reviewed.get('poinetwork_match_reviewed_by'),
            'notes': reviewed.get('review_notes')
        })

print("=== DEBUG: SITECARDS WITH CANDIDATES BUT NO REVIEWER ===")
for debug in debug_sitecards:
    print(f"ID: {debug['id']}")
    print(f"   Media Owner: {debug['media_owner']}")
    print(f"   Has Reviewer: {debug['has_reviewer']}")
    print(f"   Has Notes: {debug['has_notes']}")
    print(f"   Reviewer Value: {debug['reviewer']}")
    print(f"   Notes Value: {debug['notes']}")
    print()

# Additional insights
print("=== ADDITIONAL INSIGHTS ===")
print(f"• {sitecards_with_candidates:,} total sitecards have candidates")
print(f"• {sitecards_reviewed:,} total sitecards have been reviewed")
print(f"• {len(debug_sitecards)} sitecards with candidates have no reviewer assigned")
print(f"• {round((total_pending / len(sitecards) * 100), 1)}% of all sitecards have candidates but are pending review")
print()

# Detailed list of sitecards that still need review
print("=== SITECARDS THAT STILL NEED REVIEW ===")
if pending_review_sitecards:
    for i, sitecard in enumerate(pending_review_sitecards, 1):
        print(f"🔍 #{i}")
        print(f"   ID: {sitecard['_id']}")
        print(f"   Media Owner: {sitecard.get('processed_data', {}).get('mediaOwner', 'Unknown')}")
        print(f"   Filename: {sitecard.get('metadata', {}).get('filename', 'Unknown')}")
        print(f"   Candidates Count: {len(sitecard.get('poi_candidates', []))}")
        print(f"   Last Update: {sitecard.get('last_poi_candidate_update', 'Unknown')}")
        
        # Show some processed data
        processed_data = sitecard.get('processed_data', {})
        print(f"   Site Name: {processed_data.get('site_name', 'Unknown')}")
        print(f"   Address: {processed_data.get('address', 'Unknown')}")
        print(f"   POI Type: {processed_data.get('PointOfInterest', 'Unknown')}")
        
        # Show first few candidates if available
        candidates = sitecard.get('poi_candidates', [])
        if candidates:
            print(f"   Sample Candidate IDs: {[str(c.get('_id', 'Unknown')) for c in candidates[:3]]}")
        
        print(f"   GCS URL: {sitecard.get('metadata', {}).get('gcs_url', 'Unknown')}")
        print()
else:
    print("✅ All sitecards with candidates have been reviewed!")

Total sitecards: 1645
Sitecards with candidates pending review: 0

=== SITECARDS WITH CANDIDATES PENDING REVIEW BY MEDIA OWNER ===

🔢 TOTAL PENDING REVIEW: 0



KeyError: 'pending_review_count'

In [None]:
# Analysis: Sitecards with candidates pending review, grouped by media owner

import pandas as pd
from collections import defaultdict

sitecards = list(dbs["stg"]["poinetwork"]["sitecards"].find())

# Filter sitecards that:
# 1. Have POI candidates (non-empty poi_candidates array)
# 2. Have NOT been reviewed (poinetwork_match_reviewed_by is null AND review_notes is null)
pending_review_sitecards = []

for sitecard in sitecards:
    # Check if sitecard has candidates
    has_candidates = (
        'poi_candidates' in sitecard and 
        sitecard['poi_candidates'] and 
        len(sitecard['poi_candidates']) > 0
    )
    
    # Check if sitecard has NOT been reviewed
    reviewed = sitecard.get('reviewed', {})
    not_reviewed = (
        reviewed.get('poinetwork_match_reviewed_by') is None and
        reviewed.get('review_notes') is None
    )
    
    if has_candidates and not_reviewed:
        pending_review_sitecards.append(sitecard)

print(f"Total sitecards: {len(sitecards)}")
print(f"Sitecards with candidates pending review: {len(pending_review_sitecards)}")
print()

# Group by media owner
media_owner_metrics = defaultdict(lambda: {
    'count': 0,
    'sitecard_ids': [],
    'filenames': []
})

for sitecard in pending_review_sitecards:
    media_owner = sitecard.get('processed_data', {}).get('mediaOwner', 'Unknown')
    media_owner_metrics[media_owner]['count'] += 1
    media_owner_metrics[media_owner]['sitecard_ids'].append(str(sitecard['_id']))
    
    filename = sitecard.get('metadata', {}).get('filename', 'Unknown')
    media_owner_metrics[media_owner]['filenames'].append(filename)

# Display metrics by media owner
print("=== SITECARDS WITH CANDIDATES PENDING REVIEW BY MEDIA OWNER ===")
print()

total_pending = 0
for media_owner, metrics in sorted(media_owner_metrics.items()):
    count = metrics['count']
    total_pending += count
    
    print(f"📊 {media_owner}: {count} sitecards pending review")
    print(f"   Sample IDs: {metrics['sitecard_ids'][:3]}{'...' if len(metrics['sitecard_ids']) > 3 else ''}")
    print()

print(f"🔢 TOTAL PENDING REVIEW: {total_pending}")
print()

# Create a summary DataFrame for easy visualization
summary_data = []
for media_owner, metrics in media_owner_metrics.items():
    summary_data.append({
        'media_owner': media_owner,
        'pending_review_count': metrics['count'],
        'percentage_of_pending': round((metrics['count'] / total_pending * 100), 1) if total_pending > 0 else 0
    })

df_summary = pd.DataFrame(summary_data).sort_values('pending_review_count', ascending=False)
print("📈 SUMMARY TABLE:")
print(df_summary.to_string(index=False))
print()

# Calculate completion percentage
sitecards_with_candidates = len([sc for sc in sitecards if 'poi_candidates' in sc and len(sc['poi_candidates']) > 0])
sitecards_reviewed = len([sc for sc in sitecards if sc.get('reviewed', {}).get('poinetwork_match_reviewed_by') is not None])

# Completion rate = (sitecards with candidates that have been reviewed) / (total sitecards with candidates) * 100
completed_with_candidates = sitecards_with_candidates - total_pending
completion_percentage = (completed_with_candidates / sitecards_with_candidates * 100) if sitecards_with_candidates > 0 else 0

print("=== COMPLETION METRICS ===")
print(f"🎯 COMPLETION RATE: {completion_percentage:.2f}%")
print(f"   ({completed_with_candidates:,} completed out of {sitecards_with_candidates:,} sitecards with candidates)")
print()

# Debug the discrepancy - find sitecards with candidates but not fully reviewed
debug_sitecards = []
for sitecard in sitecards:
    has_candidates = (
        'poi_candidates' in sitecard and 
        sitecard['poi_candidates'] and 
        len(sitecard['poi_candidates']) > 0
    )
    
    reviewed = sitecard.get('reviewed', {})
    has_reviewer = reviewed.get('poinetwork_match_reviewed_by') is not None
    has_notes = reviewed.get('review_notes') is not None
    
    if has_candidates and not has_reviewer:
        debug_sitecards.append({
            'id': str(sitecard['_id']),
            'media_owner': sitecard.get('processed_data', {}).get('mediaOwner', 'Unknown'),
            'has_reviewer': has_reviewer,
            'has_notes': has_notes,
            'reviewer': reviewed.get('poinetwork_match_reviewed_by'),
            'notes': reviewed.get('review_notes')
        })

print("=== DEBUG: SITECARDS WITH CANDIDATES BUT NO REVIEWER ===")
for debug in debug_sitecards:
    print(f"ID: {debug['id']}")
    print(f"   Media Owner: {debug['media_owner']}")
    print(f"   Has Reviewer: {debug['has_reviewer']}")
    print(f"   Has Notes: {debug['has_notes']}")
    print(f"   Reviewer Value: {debug['reviewer']}")
    print(f"   Notes Value: {debug['notes']}")
    print()

# Additional insights
print("=== ADDITIONAL INSIGHTS ===")
print(f"• {sitecards_with_candidates:,} total sitecards have candidates")
print(f"• {sitecards_reviewed:,} total sitecards have been reviewed")
print(f"• {len(debug_sitecards)} sitecards with candidates have no reviewer assigned")
print(f"• {round((total_pending / len(sitecards) * 100), 1)}% of all sitecards have candidates but are pending review")
print()

# Detailed list of sitecards that still need review
print("=== SITECARDS THAT STILL NEED REVIEW ===")
if pending_review_sitecards:
    for i, sitecard in enumerate(pending_review_sitecards, 1):
        print(f"🔍 #{i}")
        print(f"   ID: {sitecard['_id']}")
        print(f"   Media Owner: {sitecard.get('processed_data', {}).get('mediaOwner', 'Unknown')}")
        print(f"   Filename: {sitecard.get('metadata', {}).get('filename', 'Unknown')}")
        print(f"   Candidates Count: {len(sitecard.get('poi_candidates', []))}")
        print(f"   Last Update: {sitecard.get('last_poi_candidate_update', 'Unknown')}")
        
        # Show some processed data
        processed_data = sitecard.get('processed_data', {})
        print(f"   Site Name: {processed_data.get('site_name', 'Unknown')}")
        print(f"   Address: {processed_data.get('address', 'Unknown')}")
        print(f"   POI Type: {processed_data.get('PointOfInterest', 'Unknown')}")
        
        # Show first few candidates if available
        candidates = sitecard.get('poi_candidates', [])
        if candidates:
            print(f"   Sample Candidate IDs: {[str(c.get('_id', 'Unknown')) for c in candidates[:3]]}")
        
        print(f"   GCS URL: {sitecard.get('metadata', {}).get('gcs_url', 'Unknown')}")
        print()
else:
    print("✅ All sitecards with candidates have been reviewed!")

Total sitecards: 1645
Sitecards with candidates pending review: 0

=== SITECARDS WITH CANDIDATES PENDING REVIEW BY MEDIA OWNER ===

🔢 TOTAL PENDING REVIEW: 0



KeyError: 'pending_review_count'

In [None]:
# Analysis: Sitecards with candidates pending review, grouped by media owner

import pandas as pd
from collections import defaultdict

sitecards = list(dbs["stg"]["poinetwork"]["sitecards"].find())

# Filter sitecards that:
# 1. Have POI candidates (non-empty poi_candidates array)
# 2. Have NOT been reviewed (poinetwork_match_reviewed_by is null AND review_notes is null)
pending_review_sitecards = []

for sitecard in sitecards:
    # Check if sitecard has candidates
    has_candidates = (
        'poi_candidates' in sitecard and 
        sitecard['poi_candidates'] and 
        len(sitecard['poi_candidates']) > 0
    )
    
    # Check if sitecard has NOT been reviewed
    reviewed = sitecard.get('reviewed', {})
    not_reviewed = (
        reviewed.get('poinetwork_match_reviewed_by') is None and
        reviewed.get('review_notes') is None
    )
    
    if has_candidates and not_reviewed:
        pending_review_sitecards.append(sitecard)

print(f"Total sitecards: {len(sitecards)}")
print(f"Sitecards with candidates pending review: {len(pending_review_sitecards)}")
print()

# Group by media owner
media_owner_metrics = defaultdict(lambda: {
    'count': 0,
    'sitecard_ids': [],
    'filenames': []
})

for sitecard in pending_review_sitecards:
    media_owner = sitecard.get('processed_data', {}).get('mediaOwner', 'Unknown')
    media_owner_metrics[media_owner]['count'] += 1
    media_owner_metrics[media_owner]['sitecard_ids'].append(str(sitecard['_id']))
    
    filename = sitecard.get('metadata', {}).get('filename', 'Unknown')
    media_owner_metrics[media_owner]['filenames'].append(filename)

# Display metrics by media owner
print("=== SITECARDS WITH CANDIDATES PENDING REVIEW BY MEDIA OWNER ===")
print()

total_pending = 0
for media_owner, metrics in sorted(media_owner_metrics.items()):
    count = metrics['count']
    total_pending += count
    
    print(f"📊 {media_owner}: {count} sitecards pending review")
    print(f"   Sample IDs: {metrics['sitecard_ids'][:3]}{'...' if len(metrics['sitecard_ids']) > 3 else ''}")
    print()

print(f"🔢 TOTAL PENDING REVIEW: {total_pending}")
print()

# Create a summary DataFrame for easy visualization
summary_data = []
for media_owner, metrics in media_owner_metrics.items():
    summary_data.append({
        'media_owner': media_owner,
        'pending_review_count': metrics['count'],
        'percentage_of_pending': round((metrics['count'] / total_pending * 100), 1) if total_pending > 0 else 0
    })

df_summary = pd.DataFrame(summary_data).sort_values('pending_review_count', ascending=False)
print("📈 SUMMARY TABLE:")
print(df_summary.to_string(index=False))
print()

# Calculate completion percentage
sitecards_with_candidates = len([sc for sc in sitecards if 'poi_candidates' in sc and len(sc['poi_candidates']) > 0])
sitecards_reviewed = len([sc for sc in sitecards if sc.get('reviewed', {}).get('poinetwork_match_reviewed_by') is not None])

# Completion rate = (sitecards with candidates that have been reviewed) / (total sitecards with candidates) * 100
completed_with_candidates = sitecards_with_candidates - total_pending
completion_percentage = (completed_with_candidates / sitecards_with_candidates * 100) if sitecards_with_candidates > 0 else 0

print("=== COMPLETION METRICS ===")
print(f"🎯 COMPLETION RATE: {completion_percentage:.2f}%")
print(f"   ({completed_with_candidates:,} completed out of {sitecards_with_candidates:,} sitecards with candidates)")
print()

# Debug the discrepancy - find sitecards with candidates but not fully reviewed
debug_sitecards = []
for sitecard in sitecards:
    has_candidates = (
        'poi_candidates' in sitecard and 
        sitecard['poi_candidates'] and 
        len(sitecard['poi_candidates']) > 0
    )
    
    reviewed = sitecard.get('reviewed', {})
    has_reviewer = reviewed.get('poinetwork_match_reviewed_by') is not None
    has_notes = reviewed.get('review_notes') is not None
    
    if has_candidates and not has_reviewer:
        debug_sitecards.append({
            'id': str(sitecard['_id']),
            'media_owner': sitecard.get('processed_data', {}).get('mediaOwner', 'Unknown'),
            'has_reviewer': has_reviewer,
            'has_notes': has_notes,
            'reviewer': reviewed.get('poinetwork_match_reviewed_by'),
            'notes': reviewed.get('review_notes')
        })

print("=== DEBUG: SITECARDS WITH CANDIDATES BUT NO REVIEWER ===")
for debug in debug_sitecards:
    print(f"ID: {debug['id']}")
    print(f"   Media Owner: {debug['media_owner']}")
    print(f"   Has Reviewer: {debug['has_reviewer']}")
    print(f"   Has Notes: {debug['has_notes']}")
    print(f"   Reviewer Value: {debug['reviewer']}")
    print(f"   Notes Value: {debug['notes']}")
    print()

# Additional insights
print("=== ADDITIONAL INSIGHTS ===")
print(f"• {sitecards_with_candidates:,} total sitecards have candidates")
print(f"• {sitecards_reviewed:,} total sitecards have been reviewed")
print(f"• {len(debug_sitecards)} sitecards with candidates have no reviewer assigned")
print(f"• {round((total_pending / len(sitecards) * 100), 1)}% of all sitecards have candidates but are pending review")
print()

# Detailed list of sitecards that still need review
print("=== SITECARDS THAT STILL NEED REVIEW ===")
if pending_review_sitecards:
    for i, sitecard in enumerate(pending_review_sitecards, 1):
        print(f"🔍 #{i}")
        print(f"   ID: {sitecard['_id']}")
        print(f"   Media Owner: {sitecard.get('processed_data', {}).get('mediaOwner', 'Unknown')}")
        print(f"   Filename: {sitecard.get('metadata', {}).get('filename', 'Unknown')}")
        print(f"   Candidates Count: {len(sitecard.get('poi_candidates', []))}")
        print(f"   Last Update: {sitecard.get('last_poi_candidate_update', 'Unknown')}")
        
        # Show some processed data
        processed_data = sitecard.get('processed_data', {})
        print(f"   Site Name: {processed_data.get('site_name', 'Unknown')}")
        print(f"   Address: {processed_data.get('address', 'Unknown')}")
        print(f"   POI Type: {processed_data.get('PointOfInterest', 'Unknown')}")
        
        # Show first few candidates if available
        candidates = sitecard.get('poi_candidates', [])
        if candidates:
            print(f"   Sample Candidate IDs: {[str(c.get('_id', 'Unknown')) for c in candidates[:3]]}")
        
        print(f"   GCS URL: {sitecard.get('metadata', {}).get('gcs_url', 'Unknown')}")
        print()
else:
    print("✅ All sitecards with candidates have been reviewed!")

Total sitecards: 1645
Sitecards with candidates pending review: 0

=== SITECARDS WITH CANDIDATES PENDING REVIEW BY MEDIA OWNER ===

🔢 TOTAL PENDING REVIEW: 0



KeyError: 'pending_review_count'