In [1]:
"""Sets the notebook to the project root"""

import sys, os
PROJECT_NAME = "analysis"

# Walk upward from the current working directory, dropping the last path
# component on each pass, until a directory named PROJECT_NAME is reached.
# os.sep is used for both split and join so the search also works on
# platforms whose path separator is not "/".
current_directory = os.getcwd().split(os.sep)
# The emptiness guard is required: without it, a working directory that has
# no PROJECT_NAME ancestor exhausts the list and indexing [-1] raises
# IndexError before the "not found" error below can ever be reached.
while current_directory and current_directory[-1] != PROJECT_NAME:
    current_directory = current_directory[:-1]
if len(current_directory) > 1:
    project_root = os.sep.join(current_directory)
    os.chdir(project_root)
else:
    raise Exception("ERROR: Project root not found")

from dotenv import load_dotenv
# The relative path is resolved against the project root set by os.chdir above.
# Kept as the final expression of the cell so its return value is displayed.
load_dotenv(dotenv_path="./env/env.env")

True

In [2]:
"""Initializes the database connection"""

import os
from src.classes.mongo_client import initialize

# Get MongoDB connection strings from environment variables.
# os.getenv returns None for any variable that is not set.
connection_strings = {
    "stg": os.getenv("MONGO_STAGE_URI"),
    "prd": os.getenv("MONGO_PROD_URI")
}

# Fail fast with a readable message instead of handing None to initialize();
# both "stg" and "prd" are required because they are indexed further down.
mongo_env_vars = {"stg": "MONGO_STAGE_URI", "prd": "MONGO_PROD_URI"}
missing_env_vars = [
    env_var
    for db_key, env_var in mongo_env_vars.items()
    if not connection_strings[db_key]
]
if missing_env_vars:
    raise RuntimeError(
        "Missing MongoDB connection string(s): "
        + ", ".join(missing_env_vars)
        + ". Set them in the environment before running this cell."
    )

# Initialize database connections
dbs = initialize(connection_strings)

print(f"Connected to {len(dbs)} databases:")
for db_name in dbs.keys():
    print(f"- {db_name}")

# Show the client objects that were created.
# NOTE(review): printing a client's repr does not by itself prove the server
# is reachable — confirm whether initialize() performs a real connectivity check.
print("\nDatabase connections initialized successfully!")
# Single quotes inside the replacement fields: reusing the enclosing double
# quote there is a SyntaxError on Python versions before 3.12.
print(f"Staging DB: {dbs['stg']}")
print(f"Production DB: {dbs['prd']}")


Connected to 2 databases:
- stg
- prd

Database connections initialized successfully!
Staging DB: MongoClient(host=['landmarksid-staging-shard-00-01-me4vi.mongodb.net:27017', 'landmarksid-staging-shard-00-02-me4vi.mongodb.net:27017', 'landmarksid-staging-shard-00-00-me4vi.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', replicaset='landmarksid-staging-shard-0', tls=True)
Production DB: MongoClient(host=['production-shard-00-01.gw4dh.mongodb.net:27017', 'production-shard-00-02.gw4dh.mongodb.net:27017', 'production-shard-00-00.gw4dh.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', replicaset='atlas-4v0y1q-shard-0', tls=True)


In [6]:
import datetime
from bson import ObjectId


# Fetch a single document from the staging "sitecards" collection.
# The result is neither assigned nor the last expression of the cell, so under
# IPython's default display settings nothing is shown for this call.
dbs["stg"]["poinetwork"]["sitecards"].find_one()

# Hand-written sample document kept for reference.
# NOTE(review): presumably a pasted copy of what the query above returned,
# illustrating the document layout — confirm against the live collection.
example_object = {
    "_id": ObjectId("6838f84cb9e439bacda22d8a"),
    # Fields describing the site itself.
    "processed_data": {
        "site_name": "Not found",
        "panel_id": "Not found",
        "address": "Not found",
        "suburb": "Not found",
        "postcode": "Not found",
        "coordinates": "Not found",
        "PointOfInterest": "Roadside",
        "IndoorOrOutdoor": "Outdoor",
        "StaticOrDigital": "Digital",
        "mediaOwner": "Go Media",
    },
    # Provenance of the record.
    "metadata": {
        "media_owner": "Go",
        "processed_at": datetime.datetime(2025, 5, 28, 10, 58, 43, 735000),
        "filename": "Go_Media-_ALL_DIGI_page_1.png",
        "model_used": "gemini-1.5-pro-latest",
        "gcs_url": "gs://media-owner-sitecards/sitecards/Go_Media-_ALL_DIGI_page_1.png",
    },
    # Review state; every field is None in this sample.
    "reviewed": {
        "image2txt_reviewed_by": None,
        "image2txt_reviewed_datetime": None,
        "poinetwork_match_reviewed_by": None,
        "poinetwork_match_reviewed_datetime": None,
        "poinetwork_match_successful_candidate_id": None,
        "review_notes": None,
    },
    "last_poi_candidate_update": datetime.datetime(2025, 6, 3, 4, 48, 8, 482000),
    "poi_candidates": [],
}


In [12]:
"""Summary Statistics Analysis for Sitecards Collection"""

import pandas as pd
from collections import defaultdict

# Get all documents from the sitecards collection
collection = dbs["stg"]["poinetwork"]["sitecards"]
documents = list(collection.find())

print(f"Total documents in collection: {len(documents)}")


def percentage_of(part, total):
    """Return `part` as a percentage of `total`, rounded to 2 decimals.

    Returns 0 when `total` is not positive, so an empty collection or an
    empty group never triggers a ZeroDivisionError.
    """
    return round((part / total) * 100, 2) if total > 0 else 0


# Per-media-owner counters; every document lands in exactly one of the three
# mutually exclusive categories below, in addition to 'total_count'.
stats_by_media_owner = defaultdict(lambda: {
    'total_count': 0,
    'with_successful_match': 0,
    'with_candidates_no_match': 0,
    'no_candidates': 0
})

# Process each document
for doc in documents:
    # `or {}` / `or []` cover both a missing field and a field stored as an
    # explicit null; a plain .get(key, default) only covers the former.
    processed_data = doc.get('processed_data') or {}
    media_owner = processed_data.get('mediaOwner', 'Unknown')

    reviewed = doc.get('reviewed') or {}
    successful_candidate_id = reviewed.get('poinetwork_match_successful_candidate_id')

    poi_candidates = doc.get('poi_candidates') or []
    candidates_count = len(poi_candidates)

    owner_stats = stats_by_media_owner[media_owner]
    owner_stats['total_count'] += 1

    # Categorize the document: a recorded successful candidate id takes
    # precedence over merely having candidates.
    if successful_candidate_id is not None:
        owner_stats['with_successful_match'] += 1
    elif candidates_count > 0:
        owner_stats['with_candidates_no_match'] += 1
    else:
        owner_stats['no_candidates'] += 1

# One row per media owner; tuple keys become two-level column headers.
results = []
for media_owner, stats in stats_by_media_owner.items():
    total = stats['total_count']

    results.append({
        ('Media Owner', ''): media_owner,
        ('Total POIs', '#'): total,
        ('Matched to network', '#'): stats['with_successful_match'],
        ('Matched to network', '%'): percentage_of(stats['with_successful_match'], total),
        ('Has candidates but not matched', '#'): stats['with_candidates_no_match'],
        ('Has candidates but not matched', '%'): percentage_of(stats['with_candidates_no_match'], total),
        ('No Candidates', '#'): stats['no_candidates'],
        ('No Candidates', '%'): percentage_of(stats['no_candidates'], total)
    })

# Create DataFrame with MultiIndex columns and sort by total count.
# Skipped when there are no rows: an empty frame has no columns, so both
# MultiIndex.from_tuples and sort_values would raise.
df = pd.DataFrame(results)
if not df.empty:
    df.columns = pd.MultiIndex.from_tuples(df.columns)
    df = df.sort_values(('Total POIs', '#'), ascending=False)

print("\n" + "="*100)
print("SUMMARY STATISTICS BY MEDIA OWNER")
print("="*100)
print(df.to_string(index=False))

# Overall totals
total_docs = sum(stats['total_count'] for stats in stats_by_media_owner.values())
total_with_match = sum(stats['with_successful_match'] for stats in stats_by_media_owner.values())
total_with_candidates_no_match = sum(stats['with_candidates_no_match'] for stats in stats_by_media_owner.values())
total_no_candidates = sum(stats['no_candidates'] for stats in stats_by_media_owner.values())

print("\n" + "="*100)
print("OVERALL TOTALS")
print("="*100)
print(f"Total POIs: {total_docs}")
print(f"Successfully matched to poiNetwork: {total_with_match} ({percentage_of(total_with_match, total_docs)}%)")
print(f"Has candidates but not matched to poiNetwork: {total_with_candidates_no_match} ({percentage_of(total_with_candidates_no_match, total_docs)}%)")
print(f"No candidates: {total_no_candidates} ({percentage_of(total_no_candidates, total_docs)}%)")


Total documents in collection: 1645

SUMMARY STATISTICS BY MEDIA OWNER
           Media Owner  Total POIs  Matched to network  Matched to network %  Has candidates but not matched  Has candidates but not matched %  No Candidates  No Candidates %
        JCDecaux (JCD)         547                 466                 85.19                               4                              0.73             77            14.08
            MediaWorks         378                 346                 91.53                               5                              1.32             27             7.14
              Go Media         293                 259                 88.40                               0                              0.00             34            11.60
Outdoor Network (ONET)         258                   0                  0.00                              46                             17.83            212            82.17
                  LUMO          77                  56