In [1]:
# ==============================================================================
# 9. OPTIMIZED RIGHTMOVE-CRIME-SVI LINKAGE (MINIMIZE DUPLICATE FILES)
# ==============================================================================

print(f"\nüîÑ OPTIMIZED RIGHTMOVE-CRIME-SVI LINKAGE...")
print("=" * 70)
print("üí° STREAMLINED LINKAGE STRATEGY:")
print("   ‚Ä¢ Load Rightmove properties (41k)")
print("   ‚Ä¢ Load SVI collection points (from ArcGIS grid)")
print("   ‚Ä¢ Create nearest-neighbor linkage")
print("   ‚Ä¢ Minimize duplicate files and processing")
print("   ‚Ä¢ Generate single comprehensive dataset")
print("=" * 70)

import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
import warnings
warnings.filterwarnings('ignore')

# ==============================================================================
# 9A. LOAD ALL REQUIRED DATASETS (EXISTING FILES ONLY)
# ==============================================================================

print(f"\nüìÇ LOADING EXISTING DATASETS...")

# Load Rightmove properties
try:
    print("üè† Loading Rightmove properties...")
    all_properties = pd.read_csv(r'C:\Users\Jc\Desktop\Dissertation\Code\composite_crime_rates_FINAL_ALL_PROPERTIES.csv')

    # Auto-detect coordinate columns: the first column whose lower-cased name
    # contains 'lat' becomes the latitude column, the first one containing
    # 'lon' becomes the longitude column.
    lat_col = lon_col = None
    for col in all_properties.columns:
        col_lower = col.lower()
        if 'lat' in col_lower and lat_col is None:
            lat_col = col
        elif 'lon' in col_lower and lon_col is None:
            lon_col = col

    # Fail here with a clear message instead of later with an opaque
    # lookup error on a None column name inside the linkage step.
    if lat_col is None or lon_col is None:
        raise ValueError(
            f"Could not detect coordinate columns (lat={lat_col}, lon={lon_col}) "
            f"among: {list(all_properties.columns)}"
        )

    # Keep only rows that actually carry coordinates: a missing latitude or
    # longitude cannot be matched to a nearest SVI point. The original index
    # is kept so 'property_index' still refers to the row in the source CSV.
    rows_before = len(all_properties)
    all_properties = all_properties.dropna(subset=[lat_col, lon_col])
    rows_dropped = rows_before - len(all_properties)
    if rows_dropped:
        print(f"   Dropped {rows_dropped:,} properties without coordinates")

    print(f"‚úÖ Loaded {len(all_properties):,} valid properties")
    print(f"‚úÖ Using coordinate columns: {lat_col}, {lon_col}")

except Exception as e:
    print(f"‚ùå Error loading properties: {e}")
    raise

# Load SVI collection points (from your ArcGIS-compliant generation)
try:
    print("üìç Loading SVI collection points...")
    svi_points = pd.read_csv('svi_collection_points_arcgis_compliant.csv')

    # Every column below is read by the linkage step, so verify them up front.
    required_svi_cols = ['point_id', 'grid_id', 'latitude', 'longitude', 'crime_density', 'sample_priority']
    missing_cols = [col for col in required_svi_cols if col not in svi_points.columns]

    if missing_cols:
        print(f"‚ùå Missing SVI columns: {missing_cols}")
        # Name the missing columns in the exception itself so the traceback is
        # self-explanatory even when stdout is not captured.
        raise KeyError(f"Required SVI columns not found: {missing_cols}")

    print(f"‚úÖ Loaded {len(svi_points):,} SVI collection points")

except FileNotFoundError:
    print("‚ùå SVI points file not found!")
    print("üîÑ Please run the ArcGIS-compliant point generation first!")
    raise

# ==============================================================================
# 9B. STREAMLINED NEAREST-NEIGHBOR LINKAGE
# ==============================================================================

def create_optimized_linkage(properties_df, svi_df, prop_lat_col, prop_lon_col):
    """
    Link each property to its nearest SVI collection point.

    Coordinates are converted to approximate metres with a local
    equirectangular projection (longitude scaled by the cosine of the data's
    mean latitude), the SVI points are indexed with a cKDTree, and every
    property is matched to its single closest SVI point.

    Parameters
    ----------
    properties_df : pandas.DataFrame
        One row per property; must contain `prop_lat_col` and `prop_lon_col`.
    svi_df : pandas.DataFrame
        SVI points with columns 'point_id', 'grid_id', 'latitude',
        'longitude', 'crime_density' and 'sample_priority'.
    prop_lat_col, prop_lon_col : str
        Names of the latitude / longitude columns in `properties_df`
        (decimal degrees assumed).

    Returns
    -------
    pandas.DataFrame
        One row per property holding the property index and coordinates, the
        linked SVI point, distance in metres and kilometres, quality and
        reliability labels, a linear distance weight (1 at 0 km, 0 at 5 km or
        more), and every other property column prefixed with 'property_'.

    Raises
    ------
    ValueError
        If `svi_df` has no rows (there is nothing to link against).
    """
    print("\nüîó Creating optimized nearest-neighbor linkage...")

    if len(svi_df) == 0:
        raise ValueError("svi_df is empty: there are no SVI points to link against")

    # Metres per degree of latitude (roughly constant everywhere).
    LAT_DEGREE_METERS = 111000

    # Convert coordinates to meters for accurate distance calculation
    print("üìê Converting coordinates to meters...")

    prop_lat = properties_df[prop_lat_col].to_numpy(dtype=float)
    prop_lon = properties_df[prop_lon_col].to_numpy(dtype=float)
    svi_lat = svi_df['latitude'].to_numpy(dtype=float)
    svi_lon = svi_df['longitude'].to_numpy(dtype=float)

    # A degree of longitude shrinks with cos(latitude). A fixed 85 km/degree
    # is only right near 40 deg latitude; deriving the scale from the data
    # keeps east-west distances (and therefore the nearest-neighbour choice)
    # correct wherever the points actually are.
    reference_lat = np.nanmean(np.concatenate([prop_lat, svi_lat]))
    LON_DEGREE_METERS = LAT_DEGREE_METERS * np.cos(np.radians(reference_lat))

    # Property coordinates
    prop_coords_m = np.column_stack([
        prop_lat * LAT_DEGREE_METERS,
        prop_lon * LON_DEGREE_METERS
    ])

    # SVI coordinates
    svi_coords_m = np.column_stack([
        svi_lat * LAT_DEGREE_METERS,
        svi_lon * LON_DEGREE_METERS
    ])

    # Build spatial index for efficient nearest-neighbor search
    print("üóÇÔ∏è Building spatial index...")
    svi_tree = cKDTree(svi_coords_m)

    # Find nearest SVI point for each property
    print("üéØ Finding nearest SVI points...")
    distances_m, nearest_indices = svi_tree.query(prop_coords_m)

    # Create linkage records
    print("üìã Creating linkage records...")
    linkage_records = []

    # Loop invariants: row count for the progress line and the list of
    # property columns that get copied with a 'property_' prefix.
    n_properties = len(properties_df)
    extra_property_cols = [
        col for col in properties_df.columns
        if col not in [prop_lat_col, prop_lon_col]
    ]

    for i, (prop_idx, prop_row) in enumerate(properties_df.iterrows()):
        if i % 5000 == 0:
            print(f"   Processing {i:,}/{n_properties:,} properties ({i/n_properties*100:.1f}%)")

        # Get nearest SVI point
        nearest_svi = svi_df.iloc[nearest_indices[i]]
        distance_m = distances_m[i]
        distance_km = distance_m / 1000

        # Create comprehensive record
        record = {
            # Property identification
            'property_index': prop_idx,
            'property_latitude': prop_row[prop_lat_col],
            'property_longitude': prop_row[prop_lon_col],

            # Linked SVI point
            'svi_point_id': nearest_svi['point_id'],
            'svi_grid_id': nearest_svi['grid_id'],
            'svi_latitude': nearest_svi['latitude'],
            'svi_longitude': nearest_svi['longitude'],

            # Distance metrics
            'distance_to_svi_m': distance_m,
            'distance_km': distance_km,

            # Quality assessment
            'linkage_quality': (
                'excellent' if distance_m <= 500 else
                'good' if distance_m <= 1000 else
                'fair' if distance_m <= 2000 else 'poor'
            ),

            # Crime data from SVI point
            'svi_crime_density': nearest_svi['crime_density'],
            'svi_sample_priority': nearest_svi['sample_priority'],

            # Reliability scoring
            'distance_weight': max(0, 1 - (distance_km / 5.0)),  # Weight decreases with distance
            'reliability_score': (
                'very_high' if distance_km <= 0.5 else
                'high' if distance_km <= 1.0 else
                'medium' if distance_km <= 2.0 else 'low'
            )
        }

        # Add all property attributes (with prefix to avoid conflicts)
        for col in extra_property_cols:
            record[f'property_{col}'] = prop_row[col]

        linkage_records.append(record)

    return pd.DataFrame(linkage_records)

# ==============================================================================
# 9C. EXECUTE OPTIMIZED LINKAGE
# ==============================================================================

print(f"\nüöÄ EXECUTING OPTIMIZED LINKAGE...")

linkage_df = create_optimized_linkage(all_properties, svi_points, lat_col, lon_col)

print(f"‚úÖ LINKAGE COMPLETE!")

# ==============================================================================
# 9D. ADD CLUSTERING AND SHARING ANALYSIS
# ==============================================================================

print(f"\nüßÆ ADDING CLUSTERING ANALYSIS...")

# Analyze SVI point sharing
svi_sharing_counts = linkage_df['svi_point_id'].value_counts()
linkage_df['svi_cluster_size'] = linkage_df['svi_point_id'].map(svi_sharing_counts)

# Add sharing categories
linkage_df['sharing_category'] = pd.cut(
    linkage_df['svi_cluster_size'], 
    bins=[0, 1, 5, 10, 50, float('inf')], 
    labels=['unique', 'small_group', 'medium_group', 'large_group', 'mega_cluster']
)

print(f"‚úÖ Clustering analysis added!")

# ==============================================================================
# 9E. QUALITY ANALYSIS AND STATISTICS
# ==============================================================================

print(f"\nüìä LINKAGE QUALITY ANALYSIS:")

# Basic statistics
print(f"   üìç Total properties linked: {len(linkage_df):,}")
print(f"   üìç Coverage: 100.0% (all properties)")
print(f"   üéØ Unique SVI points used: {linkage_df['svi_point_id'].nunique():,}")

# Distance statistics
print(f"\nüìè DISTANCE STATISTICS:")
print(f"   Mean distance: {linkage_df['distance_km'].mean():.2f} km")
print(f"   Median distance: {linkage_df['distance_km'].median():.2f} km")
print(f"   Max distance: {linkage_df['distance_km'].max():.2f} km")
print(f"   Min distance: {linkage_df['distance_km'].min():.3f} km")

# Quality distribution
quality_counts = linkage_df['linkage_quality'].value_counts()
print(f"\nüéØ LINKAGE QUALITY DISTRIBUTION:")
for quality, count in quality_counts.items():
    percentage = count / len(linkage_df) * 100
    print(f"   {quality}: {count:,} properties ({percentage:.1f}%)")

# Reliability distribution
reliability_counts = linkage_df['reliability_score'].value_counts()
print(f"\nüéØ RELIABILITY DISTRIBUTION:")
for reliability, count in reliability_counts.items():
    percentage = count / len(linkage_df) * 100
    print(f"   {reliability}: {count:,} properties ({percentage:.1f}%)")

# Sharing analysis
sharing_counts = linkage_df['sharing_category'].value_counts()
print(f"\nüîó SVI POINT SHARING ANALYSIS:")
for category, count in sharing_counts.items():
    percentage = count / len(linkage_df) * 100
    print(f"   {category}: {count:,} properties ({percentage:.1f}%)")

print(f"\nüìä CLUSTER SIZE STATISTICS:")
print(f"   Average properties per SVI: {linkage_df['svi_cluster_size'].mean():.1f}")
print(f"   Max properties per SVI: {linkage_df['svi_cluster_size'].max():,}")
print(f"   SVI points serving 10+ properties: {(svi_sharing_counts >= 10).sum():,}")

# ==============================================================================
# 9F. CREATE IMAGE-PROPERTY MAPPING (OPTIMIZED)
# ==============================================================================

print(f"\nüñºÔ∏è CREATING OPTIMIZED IMAGE-PROPERTY MAPPING...")

# Create comprehensive image-property mapping
image_records = []
directions = ['north', 'east', 'south', 'west']

for _, row in linkage_df.iterrows():
    svi_point_id = row['svi_point_id']
    
    # Create image records for each direction
    for direction in directions:
        image_record = {
            'image_id': f"{svi_point_id}_{direction}",
            'svi_point_id': svi_point_id,
            'direction': direction,
            'property_index': row['property_index'],
            'property_latitude': row['property_latitude'],
            'property_longitude': row['property_longitude'],
            'svi_latitude': row['svi_latitude'],
            'svi_longitude': row['svi_longitude'],
            'distance_km': row['distance_km'],
            'linkage_quality': row['linkage_quality'],
            'reliability_score': row['reliability_score'],
            'distance_weight': row['distance_weight'],
            'svi_crime_density': row['svi_crime_density'],
            'cluster_size': row['svi_cluster_size'],
            'sharing_category': row['sharing_category']
        }
        
        # Add selected property features (avoid duplication)
        key_property_cols = [col for col in row.index if col.startswith('property_') and 
                           any(keyword in col.lower() for keyword in ['price', 'type', 'bed', 'bath', 'sqft', 'year'])]
        
        for col in key_property_cols:
            image_record[col] = row[col]
        
        image_records.append(image_record)

image_mapping_df = pd.DataFrame(image_records)

print(f"‚úÖ Image mapping created: {len(image_mapping_df):,} image-property pairs")

# ==============================================================================
# 9G. SINGLE OPTIMIZED EXPORT (MINIMIZE FILES)
# ==============================================================================

print(f"\nüíæ CREATING SINGLE OPTIMIZED EXPORT...")

# Export main linkage with all data
linkage_df.to_csv('rightmove_svi_crime_linkage_MASTER.csv', index=False)

# Export image mapping for ML/CV work
image_mapping_df.to_csv('image_property_mapping_MASTER.csv', index=False)

# Create high-quality subset for immediate use
high_quality_subset = linkage_df[
    linkage_df['linkage_quality'].isin(['excellent', 'good']) &
    (linkage_df['distance_km'] <= 2.0)
].copy()

high_quality_subset.to_csv('high_quality_linkage_READY.csv', index=False)

# Create unique SVI download list (minimize duplicate downloads)
unique_svi_download = linkage_df[['svi_point_id', 'svi_latitude', 'svi_longitude', 'svi_cluster_size']].drop_duplicates()
unique_svi_download = unique_svi_download.sort_values('svi_cluster_size', ascending=False)

# Add download priority based on cluster size
unique_svi_download['download_priority'] = pd.cut(
    unique_svi_download['svi_cluster_size'],
    bins=[0, 5, 10, 25, float('inf')],
    labels=['low', 'medium', 'high', 'critical']
)

unique_svi_download.to_csv('unique_svi_download_list.csv', index=False)

# Create summary statistics
summary_stats = {
    'total_properties': len(linkage_df),
    'unique_svi_points': linkage_df['svi_point_id'].nunique(),
    'average_distance_km': linkage_df['distance_km'].mean(),
    'median_distance_km': linkage_df['distance_km'].median(),
    'excellent_links': (linkage_df['linkage_quality'] == 'excellent').sum(),
    'good_links': (linkage_df['linkage_quality'] == 'good').sum(),
    'high_reliability': (linkage_df['reliability_score'] == 'very_high').sum(),
    'images_needed': len(unique_svi_download) * 4,  # 4 directions per point
    'estimated_api_cost': len(unique_svi_download) * 4 * 0.007
}

summary_df = pd.DataFrame([summary_stats])
summary_df.to_csv('linkage_summary_FINAL.csv', index=False)

print(f"‚úÖ OPTIMIZED EXPORT COMPLETE!")

# ==============================================================================
# 9H. FINAL SUMMARY
# ==============================================================================

print(f"\nüéØ OPTIMIZED LINKAGE SUMMARY:")
print("=" * 70)
print(f"üìÅ FILES CREATED (MINIMIZED SET):")
print(f"   ‚Ä¢ rightmove_svi_crime_linkage_MASTER.csv - Complete linkage")
print(f"   ‚Ä¢ image_property_mapping_MASTER.csv - For ML/CV training")
print(f"   ‚Ä¢ high_quality_linkage_READY.csv - Immediate use subset")
print(f"   ‚Ä¢ unique_svi_download_list.csv - Optimized download list")
print(f"   ‚Ä¢ linkage_summary_FINAL.csv - Statistics")

print(f"\nüìä KEY STATISTICS:")
print(f"   üìç Properties linked: {len(linkage_df):,}")
print(f"   üñºÔ∏è Unique SVI points needed: {linkage_df['svi_point_id'].nunique():,}")
print(f"   üéØ High-quality links: {(linkage_df['linkage_quality'].isin(['excellent', 'good'])).sum():,}")
print(f"   üìè Average distance: {linkage_df['distance_km'].mean():.2f} km")
print(f"   üí∞ Estimated API cost: ${len(unique_svi_download) * 4 * 0.007:.2f}")

print(f"\nüöÄ OPTIMIZATION ACHIEVED:")
print(f"   ‚úÖ Single comprehensive linkage dataset")
print(f"   ‚úÖ Minimized file duplication")
print(f"   ‚úÖ Optimized download list (no duplicate images)")
print(f"   ‚úÖ Quality-filtered subsets ready")
print(f"   ‚úÖ Complete coverage of all properties")
print("=" * 70)


üîÑ OPTIMIZED RIGHTMOVE-CRIME-SVI LINKAGE...
üí° STREAMLINED LINKAGE STRATEGY:
   ‚Ä¢ Load Rightmove properties (41k)
   ‚Ä¢ Load SVI collection points (from ArcGIS grid)
   ‚Ä¢ Create nearest-neighbor linkage
   ‚Ä¢ Minimize duplicate files and processing
   ‚Ä¢ Generate single comprehensive dataset

üìÇ LOADING EXISTING DATASETS...
üè† Loading Rightmove properties...
‚úÖ Loaded 41,835 valid properties
‚úÖ Using coordinate columns: latitude, longitude
üìç Loading SVI collection points...
‚úÖ Loaded 11,162 SVI collection points

üöÄ EXECUTING OPTIMIZED LINKAGE...

üîó Creating optimized nearest-neighbor linkage...
üìê Converting coordinates to meters...
üóÇÔ∏è Building spatial index...
üéØ Finding nearest SVI points...
üìã Creating linkage records...
   Processing 0/41,835 properties (0.0%)
   Processing 5,000/41,835 properties (12.0%)
   Processing 10,000/41,835 properties (23.9%)
   Processing 15,000/41,835 properties (35.9%)
   Processing 20,000/41,835 properties (47.8%)

In [3]:
# ==============================================================================
# 11. OPTIMIZED FULL-SCALE STREET VIEW DOWNLOADER (NO DUPLICATES)
# ==============================================================================

print(f"\nüì∏ OPTIMIZED FULL-SCALE STREET VIEW DOWNLOADER...")
print("=" * 70)
print("üí° PRODUCTION DOWNLOAD STRATEGY:")
print("   ‚Ä¢ Download ALL unique SVI locations (no duplicates)")
print("   ‚Ä¢ Parallel processing for speed")
print("   ‚Ä¢ No sleep delays (Google API can handle it)")
print("   ‚Ä¢ Progress tracking and resume capability")
print("   ‚Ä¢ Efficient memory management")
print("=" * 70)

import pandas as pd
import requests
import os
import time
from urllib.parse import urlencode
from PIL import Image
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from queue import Queue
import json

# ==============================================================================
# 11A. OPTIMIZED CONFIGURATION
# ==============================================================================

# Google Street View API Configuration
# SECURITY: the key is read from the GOOGLE_API_KEY environment variable so
# the credential never lives in the notebook or in version control. Any key
# that was previously committed in this file should be treated as leaked and
# revoked/rotated in the Google Cloud console.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
if not GOOGLE_API_KEY:
    print("WARNING: GOOGLE_API_KEY environment variable is not set; "
          "Street View requests will be rejected until it is.")
BASE_URL = "https://maps.googleapis.com/maps/api/streetview"

# Download settings - OPTIMIZED FOR PRODUCTION
COMPOSITE_FOLDER = "street_view_composites_FULL"  # output directory for composite JPEGs
IMAGE_SIZE = "256x256"          # sent as the 'size' request parameter
FINAL_SIZE = (224, 224)         # each downloaded image is resized to this
COMPOSITE_SIZE = (448, 448)     # canvas for the 2x2 grid of directional images
FIELD_OF_VIEW = 90              # sent as the 'fov' request parameter
PITCH = 0                       # sent as the 'pitch' request parameter
WATERMARK_CROP = 25             # pixel rows cut from the bottom of each image

# Performance settings - OPTIMIZED
MAX_WORKERS = 8   # locations downloaded concurrently
BATCH_SIZE = 50   # locations per batch; progress is saved after each batch
TIMEOUT = 15      # per-request timeout in seconds
NO_DELAY = True   # informational flag, echoed in the configuration printout

# Progress tracking
PROGRESS_FILE = "download_progress.json"
ERROR_LOG_FILE = "download_errors.log"

# Echo the active configuration.
print("üîß OPTIMIZED CONFIGURATION:")
for _label, _value in (
    ("Composite folder", COMPOSITE_FOLDER),
    ("Max parallel workers", MAX_WORKERS),
    ("Batch size", BATCH_SIZE),
    ("No delays", NO_DELAY),
    ("Timeout", f"{TIMEOUT}s"),
):
    print(f"   {_label}: {_value}")

# ==============================================================================
# 11B. LOAD UNIQUE LOCATIONS (NO DUPLICATES)
# ==============================================================================

print(f"\nüìÇ LOADING ALL UNIQUE LOCATIONS...")

try:
    # Load the complete image-property mapping
    image_mapping = pd.read_csv('image_property_mapping_MASTER.csv')
    print(f"‚úÖ Loaded image mapping: {len(image_mapping):,} image-property pairs")
    
    # Auto-detect coordinate columns
    lat_col = lon_col = None
    for col in image_mapping.columns:
        col_lower = col.lower()
        if 'svi' in col_lower and 'lat' in col_lower:
            lat_col = col
            break
    
    for col in image_mapping.columns:
        col_lower = col.lower()
        if 'svi' in col_lower and ('lon' in col_lower or 'lng' in col_lower):
            lon_col = col
            break
    
    if lat_col is None or lon_col is None:
        print("‚ùå Could not detect coordinate columns!")
        raise ValueError("Missing coordinate columns")
    
    print(f"üìç Using coordinates: {lat_col}, {lon_col}")
    
    # Get ALL unique SVI locations (no test mode limit)
    location_cols = ['svi_point_id', lat_col, lon_col]
    if 'cluster_size' in image_mapping.columns:
        location_cols.append('cluster_size')
    if 'svi_crime_density' in image_mapping.columns:
        location_cols.append('svi_crime_density')
    
    unique_locations = image_mapping.groupby('svi_point_id')[location_cols[1:]].first().reset_index()
    
    # Sort by cluster size (download high-impact locations first)
    if 'cluster_size' in unique_locations.columns:
        unique_locations = unique_locations.sort_values('cluster_size', ascending=False)
    
    print(f"\nüìä FULL PRODUCTION DOWNLOAD:")
    print(f"   üéØ Unique locations to download: {len(unique_locations):,}")
    print(f"   üñºÔ∏è Total images needed: {len(unique_locations) * 4:,} (4 directions each)")
    print(f"   üí∞ Estimated cost: ${len(unique_locations) * 4 * 0.007:.2f}")
    print(f"   ‚è±Ô∏è Estimated time: {len(unique_locations) * 4 / MAX_WORKERS / 60:.1f} minutes")
    
except Exception as e:
    print(f"‚ùå Error loading data: {e}")
    raise

# ==============================================================================
# 11C. OPTIMIZED DOWNLOAD FUNCTIONS
# ==============================================================================

def create_folders():
    """Ensure the composite output directory exists and report its name."""
    target_dir = COMPOSITE_FOLDER
    os.makedirs(target_dir, exist_ok=True)  # no error if it is already there
    print("üìÅ Created composite folder: {}".format(target_dir))

def load_progress():
    """
    Load download progress from PROGRESS_FILE.

    Returns:
        dict: the saved progress, or an empty dict when the file is missing,
        unreadable, not valid JSON, or does not contain a JSON object.
    """
    if not os.path.exists(PROGRESS_FILE):
        return {}

    try:
        with open(PROGRESS_FILE, 'r') as f:
            progress = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError. Only I/O and parse failures
        # are handled (never KeyboardInterrupt), and the reset is reported so
        # a corrupted file does not silently restart a paid download run.
        print(f"WARNING: could not read {PROGRESS_FILE} ({e}); starting with empty progress")
        return {}

    # Callers use .get(), so anything other than a JSON object is unusable.
    return progress if isinstance(progress, dict) else {}

def save_progress(progress_data):
    """
    Save download progress to PROGRESS_FILE.

    The data is written to a temporary sibling file and then moved over the
    real file with os.replace, so an interruption in the middle of a write
    leaves the previous progress file intact instead of a truncated one.

    Args:
        progress_data: JSON-serialisable progress dictionary.
    """
    tmp_path = f"{PROGRESS_FILE}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(progress_data, f, indent=2)
    os.replace(tmp_path, PROGRESS_FILE)

def log_error(message):
    """Append `message`, prefixed with a local timestamp, to ERROR_LOG_FILE."""
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    entry = "[" + stamp + "] " + f"{message}" + "\n"
    with open(ERROR_LOG_FILE, 'a') as log_file:
        log_file.write(entry)

def get_street_view_url_optimized(latitude, longitude, heading):
    """
    Build the Street View request URL for one location and heading.

    The image size, pitch, field of view and API key come from the
    module-level configuration constants.
    """
    query_pairs = [
        ('location', f"{latitude},{longitude}"),
        ('size', IMAGE_SIZE),
        ('heading', heading),
        ('pitch', PITCH),
        ('fov', FIELD_OF_VIEW),
        ('key', GOOGLE_API_KEY),
    ]
    return BASE_URL + "?" + urlencode(query_pairs)

def download_single_image_optimized(latitude, longitude, direction, heading):
    """
    Download one Street View image and return it cropped and resized.

    Args:
        latitude, longitude: location passed to the Street View request.
        direction: label ('north', 'east', ...) echoed back in the result.
        heading: compass heading sent with the request.

    Returns:
        dict: {'status': 'success', 'image': <PIL image>, 'direction': ...}
        on success, otherwise {'status': 'error', 'message': ..., 'direction': ...}.
        Errors are reported through the result; nothing is raised.
    """
    from io import BytesIO

    url = get_street_view_url_optimized(latitude, longitude, heading)

    try:
        # Optimized request with reduced timeout
        response = requests.get(url, timeout=TIMEOUT)

        if response.status_code != 200:
            return {'status': 'error', 'message': f'HTTP {response.status_code}', 'direction': direction}

        payload = response.content
        if len(payload) <= 1000:
            # Report the real reason: the request succeeded but the body is
            # too small to be treated as a usable image. A bare "HTTP 200"
            # in the error log would be misleading.
            return {
                'status': 'error',
                'message': f'HTTP 200 but payload too small ({len(payload)} bytes)',
                'direction': direction,
            }

        # Process image in memory
        img = Image.open(BytesIO(payload))

        # Crop the bottom WATERMARK_CROP rows, then resize to FINAL_SIZE
        width, height = img.size
        cropped_img = img.crop((0, 0, width, height - WATERMARK_CROP))
        processed_img = cropped_img.resize(FINAL_SIZE, Image.Resampling.LANCZOS)

        return {'status': 'success', 'image': processed_img, 'direction': direction}

    except Exception as e:
        # Worker boundary: network and image-decoding failures become an
        # error result so one bad image never aborts the whole location.
        return {'status': 'error', 'message': str(e), 'direction': direction}

def download_location_parallel(location_data):
    """
    Fetch the four directional images of one location and build its composite.

    Args:
        location_data: (svi_point_id, latitude, longitude) tuple.

    Returns:
        dict with 'svi_point_id' and 'status':
        - 'skipped' when the composite file already exists,
        - 'success' (plus download counts and 'composite_created') when at
          least two directions were downloaded,
        - 'error' (plus 'message') otherwise; the error is also logged.
    """
    svi_point_id, latitude, longitude = location_data

    # Nothing to do when the composite is already on disk.
    composite_path = os.path.join(COMPOSITE_FOLDER, f"composite_{svi_point_id}.jpg")
    if os.path.exists(composite_path):
        return {
            'svi_point_id': svi_point_id,
            'status': 'skipped',
            'message': 'Composite already exists'
        }

    # Compass heading requested for each named direction.
    headings = {'north': 0, 'east': 90, 'south': 180, 'west': 270}

    fetched_images = {}
    outcomes = {}

    # One worker per direction so the four requests run concurrently.
    with ThreadPoolExecutor(max_workers=4) as pool:
        pending = {}
        for name, heading in headings.items():
            job = pool.submit(download_single_image_optimized, latitude, longitude, name, heading)
            pending[job] = name

        for job in as_completed(pending):
            name = pending[job]
            try:
                outcome = job.result()
                outcomes[name] = outcome

                if outcome['status'] == 'success':
                    fetched_images[name] = outcome['image']
            except Exception as e:
                outcomes[name] = {'status': 'error', 'message': str(e)}

    # Fewer than two usable images: log and report the failure.
    if len(fetched_images) < 2:
        error_msg = f"Insufficient images: only {len(fetched_images)}/4 downloaded"
        log_error(f"SVI {svi_point_id}: {error_msg}")
        return {
            'svi_point_id': svi_point_id,
            'status': 'error',
            'message': error_msg
        }

    composite_result = create_composite_optimized(svi_point_id, fetched_images)
    ok_count = sum(1 for outcome in outcomes.values() if outcome['status'] == 'success')

    return {
        'svi_point_id': svi_point_id,
        'status': 'success',
        'successful_downloads': ok_count,
        'total_downloads': 4,
        'composite_created': composite_result['status'] == 'success'
    }

def create_composite_optimized(svi_point_id, images_dict):
    """
    Paste the available directional images into a 2x2 grid and save it as JPEG.

    Layout: north top-left, east top-right, south bottom-left, west
    bottom-right. Cells whose direction is missing keep the grey background.

    Args:
        svi_point_id: identifier used in the output file name.
        images_dict: mapping of direction name to PIL image.

    Returns:
        dict: {'status': 'success', 'path': ...} or
        {'status': 'error', 'message': ...}; failures are also logged.
    """
    composite_filename = f"composite_{svi_point_id}.jpg"
    composite_path = os.path.join(COMPOSITE_FOLDER, composite_filename)

    try:
        # Create composite canvas
        composite = Image.new('RGB', COMPOSITE_SIZE, (128, 128, 128))

        # Cell offsets come from the tile size the images were resized to, so
        # the grid stays aligned if FINAL_SIZE is ever changed (with the
        # current 224x224 tiles these are the same offsets as before).
        tile_width, tile_height = FINAL_SIZE
        positions = {
            'north': (0, 0),
            'east': (tile_width, 0),
            'south': (0, tile_height),
            'west': (tile_width, tile_height)
        }

        # Place images
        for direction, position in positions.items():
            if direction in images_dict:
                composite.paste(images_dict[direction], position)

        # Save with optimized settings
        composite.save(composite_path, quality=90, optimize=True)

        return {'status': 'success', 'path': composite_path}

    except Exception as e:
        log_error(f"Composite creation failed for SVI {svi_point_id}: {e}")
        return {'status': 'error', 'message': str(e)}

# ==============================================================================
# 11D. MAIN OPTIMIZED DOWNLOAD PIPELINE
# ==============================================================================

def run_optimized_full_download():
    """
    Download every not-yet-completed SVI location in parallel batches.

    Reads the module-level `unique_locations`, `lat_col` and `lon_col`,
    resumes from the progress file, processes the remaining locations
    BATCH_SIZE at a time with MAX_WORKERS threads, saves progress after every
    batch and prints final statistics.

    A location is recorded as completed only when its composite exists
    (created now or already on disk). Locations that failed, or whose
    composite could not be written, are listed under 'failed' in the progress
    file and are attempted again on the next run.
    """
    print(f"\nüöÄ STARTING OPTIMIZED FULL DOWNLOAD...")

    # Create folders
    create_folders()

    # Load existing progress
    progress = load_progress()
    completed_svids = set(progress.get('completed', []))
    failed_svids = set(progress.get('failed', [])) - completed_svids

    # Filter out already completed locations
    remaining_locations = [
        (row['svi_point_id'], row[lat_col], row[lon_col])
        for _, row in unique_locations.iterrows()
        if row['svi_point_id'] not in completed_svids
    ]

    total_locations = len(unique_locations)
    remaining_count = len(remaining_locations)
    completed_count = total_locations - remaining_count

    print(f"\nüìä DOWNLOAD STATUS:")
    print(f"   Total locations: {total_locations:,}")
    print(f"   Already completed: {completed_count:,}")
    print(f"   Remaining: {remaining_count:,}")

    if remaining_count == 0:
        print("üéâ All locations already downloaded!")
        return

    # Statistics tracking (all cumulative across batches)
    start_time = time.time()
    successful_downloads = 0
    failed_downloads = 0
    skipped_downloads = 0
    composites_created = 0
    images_downloaded = 0

    print(f"\nüîÑ PROCESSING {remaining_count:,} LOCATIONS WITH {MAX_WORKERS} PARALLEL WORKERS...")

    total_batches = (remaining_count + BATCH_SIZE - 1) // BATCH_SIZE

    # Process in batches for better memory management
    for batch_start in range(0, remaining_count, BATCH_SIZE):
        batch_locations = remaining_locations[batch_start:batch_start + BATCH_SIZE]
        batch_num = batch_start // BATCH_SIZE + 1

        print(f"\nüì¶ BATCH {batch_num}/{total_batches} ({len(batch_locations)} locations)")

        # Process batch with parallel workers
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            # Submit all locations in batch
            future_to_location = {
                executor.submit(download_location_parallel, location_data): location_data[0]
                for location_data in batch_locations
            }

            # Process results as they complete
            for future in as_completed(future_to_location):
                svi_id = future_to_location[future]
                try:
                    result = future.result()
                    status = result['status']
                except Exception as e:
                    log_error(f"Batch processing error for SVI {svi_id}: {e}")
                    failed_downloads += 1
                    failed_svids.add(svi_id)
                    continue

                # Update statistics. Only locations whose composite exists are
                # marked completed; everything else stays eligible for retry.
                if status == 'success':
                    successful_downloads += 1
                    images_downloaded += result.get('successful_downloads', 0)
                    if result.get('composite_created', False):
                        composites_created += 1
                        completed_svids.add(svi_id)
                        failed_svids.discard(svi_id)
                    else:
                        failed_svids.add(svi_id)
                elif status == 'skipped':
                    skipped_downloads += 1
                    completed_svids.add(svi_id)
                    failed_svids.discard(svi_id)
                else:
                    failed_downloads += 1
                    failed_svids.add(svi_id)

                # Print progress (skipped locations counted over the whole
                # run, not just the current batch)
                total_processed = successful_downloads + failed_downloads + skipped_downloads
                if total_processed % 10 == 0:
                    elapsed = time.time() - start_time
                    rate = total_processed / elapsed if elapsed > 0 else 0
                    eta = (remaining_count - total_processed) / rate / 60 if rate > 0 else 0
                    print(f"   ‚úÖ Processed: {total_processed:,}/{remaining_count:,} ({total_processed/remaining_count*100:.1f}%) | Rate: {rate:.1f}/sec | ETA: {eta:.1f}min")

        # Save progress after each batch
        progress_data = {
            'completed': list(completed_svids),
            'failed': list(failed_svids),
            'last_updated': time.strftime("%Y-%m-%d %H:%M:%S"),
            'statistics': {
                'successful_downloads': successful_downloads,
                'failed_downloads': failed_downloads,
                'skipped_downloads': skipped_downloads,
                'composites_created': composites_created
            }
        }
        save_progress(progress_data)
        print(f"   üíæ Progress saved (batch {batch_num} complete)")

    # Final statistics
    total_time = time.time() - start_time
    total_processed = successful_downloads + failed_downloads + skipped_downloads
    attempted = successful_downloads + failed_downloads  # excludes skipped

    print(f"\nüéØ OPTIMIZED DOWNLOAD COMPLETE!")
    print("=" * 70)
    print(f"‚úÖ FINAL STATISTICS:")
    print(f"   Locations processed: {total_processed:,}")
    print(f"   Successful downloads: {successful_downloads:,}")
    print(f"   Failed downloads: {failed_downloads:,}")
    print(f"   Skipped (composite already existed): {skipped_downloads:,}")
    print(f"   Composites created: {composites_created:,}")
    print(f"   Success rate: {successful_downloads/attempted*100:.1f}%" if attempted > 0 else "   Success rate: N/A")
    print(f"   Total time: {total_time/60:.1f} minutes")
    print(f"   Average rate: {total_processed/total_time:.1f} locations/second" if total_time > 0 else "   Average rate: N/A")
    # Based on images actually downloaded for successful locations rather
    # than assuming four per location.
    print(f"   Actual cost: ${images_downloaded * 0.007:.2f}")
    print("=" * 70)
    print(f"üìÅ Composite images: {COMPOSITE_FOLDER}/")
    print(f"üìä Progress file: {PROGRESS_FILE}")
    print(f"üö® Error log: {ERROR_LOG_FILE}")

# ==============================================================================
# 11E. EXECUTE OPTIMIZED FULL DOWNLOAD
# ==============================================================================

print(f"\nüöÄ EXECUTING OPTIMIZED FULL-SCALE DOWNLOAD...")
print(f"‚ö° NO DELAYS | PARALLEL PROCESSING | NO DUPLICATES")

# Confirm before starting
response = input(f"\n‚ö†Ô∏è  About to download {len(unique_locations):,} unique locations ({len(unique_locations)*4:,} images).\nEstimated cost: ${len(unique_locations) * 4 * 0.007:.2f}\nContinue? (y/N): ")

if response.lower() == 'y':
    run_optimized_full_download()
else:
    print("‚ùå Download cancelled by user")


üì∏ OPTIMIZED FULL-SCALE STREET VIEW DOWNLOADER...
üí° PRODUCTION DOWNLOAD STRATEGY:
   ‚Ä¢ Download ALL unique SVI locations (no duplicates)
   ‚Ä¢ Parallel processing for speed
   ‚Ä¢ No sleep delays (Google API can handle it)
   ‚Ä¢ Progress tracking and resume capability
   ‚Ä¢ Efficient memory management
üîß OPTIMIZED CONFIGURATION:
   Composite folder: street_view_composites_FULL
   Max parallel workers: 8
   Batch size: 50
   No delays: True
   Timeout: 15s

üìÇ LOADING ALL UNIQUE LOCATIONS...


‚úÖ Loaded image mapping: 167,340 image-property pairs
üìç Using coordinates: svi_latitude, svi_longitude

üìä FULL PRODUCTION DOWNLOAD:
   üéØ Unique locations to download: 6,204
   üñºÔ∏è Total images needed: 24,816 (4 directions each)
   üí∞ Estimated cost: $173.71
   ‚è±Ô∏è Estimated time: 51.7 minutes

üöÄ EXECUTING OPTIMIZED FULL-SCALE DOWNLOAD...
‚ö° NO DELAYS | PARALLEL PROCESSING | NO DUPLICATES

üöÄ STARTING OPTIMIZED FULL DOWNLOAD...
üìÅ Created composite folder: street_view_composites_FULL

üìä DOWNLOAD STATUS:
   Total locations: 6,204
   Already completed: 0
   Remaining: 6,204

üîÑ PROCESSING 6,204 LOCATIONS WITH 8 PARALLEL WORKERS...

üì¶ BATCH 1/125 (50 locations)
   ‚úÖ Processed: 10/6,204 (0.2%) | Rate: 24.0/sec | ETA: 4.3min
   ‚úÖ Processed: 20/6,204 (0.3%) | Rate: 29.9/sec | ETA: 3.4min
   ‚úÖ Processed: 30/6,204 (0.5%) | Rate: 32.9/sec | ETA: 3.1min
   ‚úÖ Processed: 40/6,204 (0.6%) | Rate: 33.7/sec | ETA: 3.0min
   ‚úÖ Processed: 50/6,204 (0.8%) | Rat

In [8]:
# ==============================================================================
# 12. ANALYZE MISSING IMAGERY PERCENTAGE FROM EXISTING DOWNLOADS
# ==============================================================================

print(f"\nüìä ANALYZING MISSING IMAGERY FROM EXISTING DOWNLOADS...")
print("=" * 70)
print("üí° ANALYSIS STRATEGY:")
print("   ‚Ä¢ Check existing composite images")
print("   ‚Ä¢ Detect 'no imagery' placeholders")
print("   ‚Ä¢ Calculate missing imagery percentage")
print("   ‚Ä¢ Analyze patterns by location/crime density")
print("=" * 70)

import pandas as pd
import json
import os
from PIL import Image
import numpy as np

# ==============================================================================
# 12A. LOAD EXISTING DATA
# ==============================================================================

print("\nüìÇ LOADING EXISTING DATA...")

try:
    # Progress bookkeeping: fall back to an empty record when the file is absent
    # so the rest of the cell can still run.
    if not os.path.exists(PROGRESS_FILE):
        print("‚ùå No progress file found")
        progress_data = {'completed': [], 'statistics': {}}
    else:
        with open(PROGRESS_FILE, 'r') as progress_handle:
            progress_data = json.load(progress_handle)
        completed_total = len(progress_data.get('completed', []))
        print(f"‚úÖ Loaded progress: {completed_total:,} completed downloads")

    # Image-to-property mapping produced earlier in the pipeline.
    image_mapping = pd.read_csv('image_property_mapping_MASTER.csv')
    mapping_rows = len(image_mapping)
    print(f"‚úÖ Loaded image mapping: {mapping_rows:,} records")

    # De-duplicate the ids recorded as completed in the progress file.
    completed_svids = set(progress_data.get('completed', []))
    completed_unique = len(completed_svids)
    print(f"‚úÖ Found {completed_unique:,} completed SVI locations")

except Exception as e:
    # Report, then re-raise: later sections cannot run without these inputs.
    print(f"‚ùå Error loading data: {e}")
    raise

# ==============================================================================
# 12B. IMPROVED "NO IMAGERY" DETECTION FOR GOOGLE STREET VIEW
# ==============================================================================

def _quadrant_is_no_imagery(quadrant):
    """Return True when one composite quadrant looks like a 'no imagery' placeholder.

    Args:
        quadrant: NumPy array holding one quadrant, either multi-channel
            (rows, cols, channels) or single-channel (rows, cols).

    Returns:
        bool: True if the heuristics below flag the quadrant as a placeholder.
    """
    if quadrant.size == 0:
        # Degenerate composite (fewer than 2 px on a side): there are no pixels
        # to measure, so the quadrant is not flagged.
        return False

    if quadrant.ndim != 3:
        # Single-channel image: a light, nearly flat quadrant is a placeholder.
        return bool(np.mean(quadrant) > 180 and np.std(quadrant) < 15)

    # One row per pixel, one column per channel; shared by methods 1 and 2.
    pixels = quadrant.reshape(-1, quadrant.shape[2])

    # Method 1: every channel mean lies in the light gray/beige band (180-230).
    channel_means = pixels.mean(axis=0)
    is_light_background = bool(
        np.all(channel_means > 180) and np.all(channel_means < 230)
    )

    # Method 2: low average per-channel standard deviation (uniform background).
    is_low_variance = bool(pixels.std(axis=0).mean() < 25)

    # Method 3: 1-15% dark pixels (channel-averaged value < 100), the footprint
    # expected from dark placeholder text on a light background.
    gray = quadrant.mean(axis=2)
    dark_pixel_ratio = np.count_nonzero(gray < 100) / gray.size
    has_text_pattern = bool(0.01 < dark_pixel_ratio < 0.15)

    # Method 4: more than 30% of all channel values land in one of 50 bins.
    color_hist = np.histogram(quadrant.ravel(), bins=50)[0]
    is_uniform = bool(color_hist.max() / color_hist.sum() > 0.3)

    # Light background needs one supporting signal; a uniform histogram needs
    # low variance as confirmation.
    return (is_light_background and (is_low_variance or has_text_pattern)) or (
        is_uniform and is_low_variance
    )


def detect_no_imagery_composite_improved(image_path):
    """
    Enhanced detection for Google Street View 'no imagery' placeholders.

    The image is split into four equal quadrants which are labelled
    top-left='north', top-right='east', bottom-left='south',
    bottom-right='west' (NOTE(review): this labelling is what the slicing
    below encodes; confirm it matches how the composites were assembled).

    Args:
        image_path: Path of the composite image to inspect.

    Returns:
        dict with the same keys on success and on failure:
            total_directions (int), missing_directions (list of str),
            missing_count (int), missing_percentage (float),
            has_partial_imagery (bool), has_complete_imagery (bool),
            completely_missing (bool).
        When the image cannot be read or analysed, an additional 'error' key
        holds the exception text and all four directions are reported missing
        (worst-case assumption).
    """
    directions = ('north', 'east', 'south', 'west')
    error_message = None

    try:
        # The context manager releases the file handle once pixels are copied.
        with Image.open(image_path) as img:
            img_array = np.array(img)

        height, width = img_array.shape[:2]
        mid_h, mid_w = height // 2, width // 2

        quadrants = {
            'north': img_array[0:mid_h, 0:mid_w],
            'east': img_array[0:mid_h, mid_w:width],
            'south': img_array[mid_h:height, 0:mid_w],
            'west': img_array[mid_h:height, mid_w:width],
        }

        missing_directions = [
            direction
            for direction, quadrant in quadrants.items()
            if _quadrant_is_no_imagery(quadrant)
        ]
    except Exception as e:
        # Deliberately broad: this runs over thousands of files and a single
        # unreadable image must not abort the batch. Assume the worst case.
        error_message = str(e)
        missing_directions = list(directions)

    total_directions = len(directions)
    missing_count = len(missing_directions)

    # 'error' (when present) stays the first key, as in the legacy error result.
    result = {} if error_message is None else {'error': error_message}
    result.update({
        'total_directions': total_directions,
        'missing_directions': missing_directions,
        'missing_count': missing_count,
        'missing_percentage': missing_count / total_directions * 100,
        'has_partial_imagery': 0 < missing_count < total_directions,
        'has_complete_imagery': missing_count == 0,
        'completely_missing': missing_count == total_directions,
    })
    return result

print(f"\nüîç RE-ANALYZING WITH IMPROVED 'NO IMAGERY' DETECTION...")

# One result dict per completed SVI id; `analyzed_count` counts only the ids
# whose composite file was actually found and analysed.
analysis_results_improved = []
analyzed_count = 0

for svi_id in completed_svids:
    composite_path = os.path.join(COMPOSITE_FOLDER, f"composite_{svi_id}.jpg")

    if not os.path.exists(composite_path):
        # Recorded as fully missing so the id still appears in the saved report.
        analysis_results_improved.append({
            'svi_point_id': svi_id,
            'composite_exists': False,
            'missing_count': 4,
            'missing_percentage': 100,
            'error': 'Composite file not found'
        })
        continue

    analysis = detect_no_imagery_composite_improved(composite_path)
    analysis['svi_point_id'] = svi_id
    analysis['composite_exists'] = True
    analysis_results_improved.append(analysis)
    analyzed_count += 1

    # Report progress only right after the counter changes; checking it on
    # every iteration would print "0 composites" (or repeat the same milestone)
    # for each id whose composite file is missing.
    if analyzed_count % 1000 == 0:
        print(f"   Re-analyzed {analyzed_count:,} composites...")

print(f"‚úÖ Improved analysis complete: {analyzed_count:,} composites analyzed")
# ==============================================================================
# 12C. CALCULATE MISSING IMAGERY STATISTICS
# ==============================================================================

print(f"\nüìä MISSING IMAGERY STATISTICS:")

analysis_df = pd.DataFrame(analysis_results_improved)

# Keep only rows whose composite file was found on disk. Rows for files that
# exist but could not be analysed are still included here and carry the
# worst-case missing_count reported by the detector.
# With no results at all the frame has no columns, so guard the lookup instead
# of raising KeyError on 'composite_exists'.
if 'composite_exists' in analysis_df.columns:
    valid_analysis = analysis_df[analysis_df['composite_exists'] == True].copy()
else:
    valid_analysis = analysis_df.copy()

if len(valid_analysis) > 0:
    # Overall statistics (each composite covers 4 directions)
    total_composites = len(valid_analysis)
    total_directions_analyzed = total_composites * 4
    total_missing_directions = valid_analysis['missing_count'].sum()

    overall_missing_percentage = (total_missing_directions / total_directions_analyzed) * 100

    print(f"   üì∏ Total composites analyzed: {total_composites:,}")
    print(f"   üéØ Total directions analyzed: {total_directions_analyzed:,}")
    print(f"   ‚ùå Total missing directions: {total_missing_directions:,}")
    print(f"   üìâ Overall missing imagery: {overall_missing_percentage:.2f}%")
    print(f"   üìà Overall imagery success: {100-overall_missing_percentage:.2f}%")

    # Categorize composites by how many of their 4 directions are missing
    complete_imagery = (valid_analysis['missing_count'] == 0).sum()
    partial_imagery = ((valid_analysis['missing_count'] > 0) & (valid_analysis['missing_count'] < 4)).sum()
    no_imagery = (valid_analysis['missing_count'] == 4).sum()

    print(f"\nüéØ COMPOSITE CATEGORIES:")
    print(f"   ‚úÖ Complete imagery (4/4): {complete_imagery:,} ({complete_imagery/total_composites*100:.1f}%)")
    print(f"   üü° Partial imagery (1-3/4): {partial_imagery:,} ({partial_imagery/total_composites*100:.1f}%)")
    print(f"   ‚ùå No imagery (0/4): {no_imagery:,} ({no_imagery/total_composites*100:.1f}%)")

    # Distribution of missing directions, ordered by number missing
    missing_dist = valid_analysis['missing_count'].value_counts().sort_index()
    print(f"\nüìä MISSING DIRECTIONS DISTRIBUTION:")
    for missing_count, frequency in missing_dist.items():
        percentage = frequency / total_composites * 100
        print(f"   {missing_count}/4 missing: {frequency:,} composites ({percentage:.1f}%)")

# ==============================================================================
# 12D. ANALYZE BY CRIME DENSITY AND LOCATION
# ==============================================================================

print(f"\nüó∫Ô∏è MISSING IMAGERY BY LOCATION CHARACTERISTICS:")

# Merge with image mapping to get location characteristics
if len(valid_analysis) > 0:
    # Aggregate only the columns the mapping really has; asking .agg() for a
    # missing column raises KeyError before the per-column checks below can run.
    wanted_columns = ('svi_latitude', 'svi_longitude', 'svi_crime_density', 'cluster_size')
    location_columns = [col for col in wanted_columns if col in image_mapping.columns]

    if location_columns:
        # First non-null value per SVI point for each available column
        location_data = image_mapping.groupby('svi_point_id').agg(
            {col: 'first' for col in location_columns}
        ).reset_index()
    else:
        location_data = image_mapping[['svi_point_id']].drop_duplicates()

    # Left merge keeps every analysed composite even without mapping data
    detailed_analysis = valid_analysis.merge(location_data, on='svi_point_id', how='left')

    if 'svi_crime_density' in detailed_analysis.columns:
        # Analyze by crime density quartile
        crime_labels = ['Low', 'Medium-Low', 'Medium-High', 'High']
        crime_density = detailed_analysis['svi_crime_density'].fillna(0)
        try:
            crime_bins = pd.qcut(crime_density, q=4, labels=crime_labels)
        except ValueError:
            # qcut rejects duplicate quartile edges (e.g. many identical
            # densities after fillna(0)); ranking breaks the ties so four
            # bins can still be formed.
            crime_bins = pd.qcut(crime_density.rank(method='first'), q=4, labels=crime_labels)
        detailed_analysis['crime_category'] = crime_bins

        # observed=False is stated explicitly: it keeps empty categories in the
        # result and avoids the pandas FutureWarning about the changing default.
        crime_analysis = detailed_analysis.groupby('crime_category', observed=False).agg({
            'missing_count': ['mean', 'count'],
            'missing_percentage': 'mean'
        }).round(2)

        print(f"   üìà MISSING IMAGERY BY CRIME DENSITY:")
        for category in crime_labels:
            if category in crime_analysis.index:
                avg_missing = crime_analysis.loc[category, ('missing_percentage', 'mean')]
                count = crime_analysis.loc[category, ('missing_count', 'count')]
                print(f"   ‚Ä¢ {category} crime: {avg_missing:.1f}% missing imagery ({count} locations)")

    if 'cluster_size' in detailed_analysis.columns:
        # Analyze by cluster size (properties served); missing sizes count as 1
        cluster_bins = pd.cut(detailed_analysis['cluster_size'].fillna(1),
                              bins=[0, 5, 10, 25, float('inf')],
                              labels=['1-5 props', '6-10 props', '11-25 props', '25+ props'])
        detailed_analysis['cluster_category'] = cluster_bins

        cluster_analysis = detailed_analysis.groupby('cluster_category', observed=False).agg({
            'missing_percentage': 'mean',
            'svi_point_id': 'count'
        }).round(2)

        print(f"\nüè† MISSING IMAGERY BY PROPERTIES SERVED:")
        for category in cluster_analysis.index:
            avg_missing = cluster_analysis.loc[category, 'missing_percentage']
            count = cluster_analysis.loc[category, 'svi_point_id']
            print(f"   ‚Ä¢ {category}: {avg_missing:.1f}% missing imagery ({count} locations)")

# ==============================================================================
# 12E. CREATE DETAILED MISSING IMAGERY REPORT
# ==============================================================================

print("\nüíæ CREATING DETAILED MISSING IMAGERY REPORT...")

if analysis_results_improved:
    # Per-composite rows, including ids whose composite file was not found.
    analysis_df.to_csv('missing_imagery_analysis.csv', index=False)

    # One-row summary; the statistics exist only when at least one composite
    # was analysed, otherwise every figure is reported as zero.
    if len(valid_analysis) > 0:
        summary_stats = {
            'total_composites_analyzed': len(valid_analysis),
            'total_directions_analyzed': len(valid_analysis) * 4,
            'total_missing_directions': valid_analysis['missing_count'].sum(),
            'overall_missing_percentage': overall_missing_percentage,
            'complete_imagery_count': complete_imagery,
            'partial_imagery_count': partial_imagery,
            'no_imagery_count': no_imagery,
        }
    else:
        summary_stats = {
            'total_composites_analyzed': 0,
            'total_directions_analyzed': 0,
            'total_missing_directions': 0,
            'overall_missing_percentage': 0,
            'complete_imagery_count': 0,
            'partial_imagery_count': 0,
            'no_imagery_count': 0,
        }
    summary_stats['analysis_date'] = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

    summary_df = pd.DataFrame([summary_stats])
    summary_df.to_csv('missing_imagery_summary.csv', index=False)

    print("‚úÖ Reports saved:")
    print("   ‚Ä¢ missing_imagery_analysis.csv - Detailed per-composite analysis")
    print("   ‚Ä¢ missing_imagery_summary.csv - Summary statistics")

# ==============================================================================
# 12F. FINAL MISSING IMAGERY SUMMARY
# ==============================================================================

print("\nüéØ FINAL MISSING IMAGERY SUMMARY:")
print("=" * 70)

valid_total = len(valid_analysis)

if valid_total == 0:
    print("‚ùå No valid composites found for analysis")
else:
    # Shares of composites per category, relative to all analysed composites.
    success_percentage = 100 - overall_missing_percentage
    complete_share = complete_imagery / valid_total * 100
    partial_share = partial_imagery / valid_total * 100
    none_share = no_imagery / valid_total * 100

    print("üìä MISSING IMAGERY ANALYSIS RESULTS:")
    print(f"   üñºÔ∏è Composites analyzed: {valid_total:,}")
    print(f"   üìâ Overall missing imagery: {overall_missing_percentage:.2f}%")
    print(f"   üìà Overall imagery success: {success_percentage:.2f}%")
    print(f"   ‚úÖ Complete imagery: {complete_imagery:,} composites ({complete_share:.1f}%)")
    print(f"   üü° Partial imagery: {partial_imagery:,} composites ({partial_share:.1f}%)")
    print(f"   ‚ùå No imagery: {no_imagery:,} composites ({none_share:.1f}%)")

print("=" * 70)
print("üéâ MISSING IMAGERY ANALYSIS COMPLETE!")


📊 ANALYZING MISSING IMAGERY FROM EXISTING DOWNLOADS...
💡 ANALYSIS STRATEGY:
   • Check existing composite images
   • Detect 'no imagery' placeholders
   • Calculate missing imagery percentage
   • Analyze patterns by location/crime density

📂 LOADING EXISTING DATA...
✅ Loaded progress: 6,204 completed downloads
✅ Loaded image mapping: 167,340 records
✅ Found 6,204 completed SVI locations

🔍 RE-ANALYZING WITH IMPROVED 'NO IMAGERY' DETECTION...
   Re-analyzed 1,000 composites...
   Re-analyzed 2,000 composites...
   Re-analyzed 3,000 composites...
   Re-analyzed 4,000 composites...
   Re-analyzed 5,000 composites...
   Re-analyzed 6,000 composites...
✅ Improved analysis complete: 6,204 composites analyzed

📊 MISSING IMAGERY STATISTICS:
   📸 Total composites analyzed: 6,204
   🎯 Total directions analyzed: 24,816
   ❌ Total missing directions: 603
   📉 Overall missing imagery: 2.43%
   📈 Overall imagery success: 97.57%

üéØ COMPOSITE CATEGORI

  crime_analysis = detailed_analysis.groupby('crime_category').agg({
  cluster_analysis = detailed_analysis.groupby('cluster_category').agg({
