In [None]:
import subprocess
import os
import shutil
from pathlib import Path

def clear_conda_cache():
    """Clean Conda's caches, report package-directory sizes, and purge the pip cache.

    Steps, each best-effort (a failure is printed and the next step still runs):

    1. Run ``conda clean --all -y``.
    2. Ask ``conda info --base`` for the base prefix and print the size of each
       well-known package directory that exists on disk.
    3. Run ``pip cache purge`` through the interpreter executing this code.

    Returns:
        None. All results are printed.
    """
    import sys  # local import: only needed to address the running interpreter's pip

    def _run(command):
        """Run `command`; raise RuntimeError carrying stderr on a non-zero exit."""
        completed = subprocess.run(command, capture_output=True, text=True)
        if completed.returncode != 0:
            detail = (completed.stderr or '').strip()
            raise RuntimeError(detail or f"{command[0]} exited with status {completed.returncode}")
        return completed

    print("=" * 60)
    print("üßπ CONDA CACHE CLEANUP")
    print("=" * 60)

    # 1. Run conda clean --all
    print("\n1Ô∏è‚É£ Running 'conda clean --all'...")
    try:
        result = _run(['conda', 'clean', '--all', '-y'])
        print(result.stdout)
        # Only reached on exit status 0, so the success line is truthful.
        print("‚úÖ Conda clean completed")
    except Exception as e:
        print(f"‚ö†Ô∏è Error running conda clean: {e}")

    # 2. Find and show conda directories
    print("\n2Ô∏è‚É£ Locating Conda directories...")

    conda_base = ''
    try:
        conda_base = _run(['conda', 'info', '--base']).stdout.strip()
        print(f"   Conda base: {conda_base}")
    except Exception as e:
        print(f"‚ö†Ô∏è Error: {e}")

    try:
        cache_locations = []
        if conda_base:
            # Without a base prefix these joins would yield paths relative to
            # the current directory, so they are only added when conda answered.
            cache_locations.append(os.path.join(conda_base, 'pkgs'))
            cache_locations.append(os.path.join(conda_base, 'conda-meta'))
        cache_locations.extend([
            os.path.expanduser('~/.conda/pkgs'),
            os.path.expanduser('~/anaconda3/pkgs'),
            os.path.expanduser('~/miniconda3/pkgs'),
        ])

        print("\n3Ô∏è‚É£ Cache locations:")
        reported = set()
        for location in cache_locations:
            if not os.path.exists(location):
                continue
            # The base prefix is often ~/anaconda3 or ~/miniconda3; report each
            # physical directory once.
            resolved = os.path.realpath(location)
            if resolved in reported:
                continue
            reported.add(resolved)

            size = sum(f.stat().st_size for f in Path(location).rglob('*') if f.is_file())
            size_mb = size / (1024**2)
            print(f"   üìÅ {location} ({size_mb:.2f} MB)")

    except Exception as e:
        print(f"‚ö†Ô∏è Error: {e}")

    # 3. Clear pip cache
    print("\n4Ô∏è‚É£ Clearing pip cache...")
    try:
        # `python -m pip` targets the environment of the running interpreter,
        # which a bare `pip` found on PATH does not guarantee.
        _run([sys.executable, '-m', 'pip', 'cache', 'purge'])
        print("‚úÖ Pip cache cleared")
    except Exception as e:
        print(f"‚ö†Ô∏è Error: {e}")

    print("\n" + "=" * 60)
    print("‚úÖ CLEANUP COMPLETE!")
    print("=" * 60)

# Run the cleanup when this code is executed directly (script or notebook cell),
# but not when the module is imported.
if __name__ == "__main__":
    clear_conda_cache()


In [None]:
def execute_phase2(input_folder=None, input_file=None, output_folder="googlebot_final_only"):
    """
    Phase 2: Log file verification and processing
    IN-MEMORY 3-PHASE GOOGLEBOT EXTRACTION
    Only outputs final Phase 3 files with search-API filtering
    
    Parameters:
    -----------
    input_folder : str, optional
        Path to folder containing .log or .log.gz files
    input_file : str, optional
        Path to single .log or .log.gz file
    output_folder : str
        Directory where output CSV files will be saved
    
    Returns:
    --------
    dict
        {
            'status': 'success' or 'error',
            'output_folder': absolute path to output folder,
            'phase3_files': list of output CSV files,
            'final_records': total records after filtering,
            'search_api_filtered': count of search-API URLs filtered,
            'match_rate': percentage match with GSC,
            'rejection_stats': breakdown of verification stats
        }
    """
    
    import os
    import json
    import csv
    import gzip
    import socket
    import time
    import zipfile
    import ipaddress
    import requests
    import dns.resolver
    import dns.reversename
    from concurrent.futures import ThreadPoolExecutor, as_completed
    from pathlib import Path
    from tqdm import tqdm
    from datetime import datetime
    
    # ============================================================================
    # CONFIGURATION
    # ============================================================================
    
    MAX_DNS_WORKERS = 150
    DNS_TIMEOUT = 2
    IP_RANGES_CACHE_FILE = "google_ip_ranges_cache.json"
    CACHE_EXPIRY_HOURS = 24
    GOOGLE_DOMAINS = ['.googlebot.com', '.google.com', '.googleusercontent.com']
    MAX_RECORDS_PER_FILE = 500000
    
    # NEW: Lenient mode and FcrDNS fallback
    LENIENT_MODE = True  # Accept records with UA but missing/invalid IP
    ENABLE_FCRDNS_FALLBACK = True  # Verify non-CIDR IPs with FcrDNS
    
    GOOGLE_IP_RANGE_URLS = {
        'googlebot': 'https://developers.google.com/static/search/apis/ipranges/googlebot.json',
        'special_crawlers': 'https://developers.google.com/static/search/apis/ipranges/special-crawlers.json',
        'user_triggered_fetchers': 'https://developers.google.com/static/search/apis/ipranges/user-triggered-fetchers.json',
        'user_triggered_fetchers_google': 'https://developers.google.com/static/search/apis/ipranges/user-triggered-fetchers-google.json',
    }
    
    # ============================================================================
    # FETCH DYNAMIC IP RANGES
    # ============================================================================
    
    def fetch_google_ip_ranges():
        """Download every URL in GOOGLE_IP_RANGE_URLS and return the parsed CIDR networks.

        Each source is fetched independently; a source that fails is reported and
        skipped. Prefix strings that do not parse as a network are dropped silently.

        Returns:
            list of ipaddress.IPv4Network / IPv6Network, in download order
            (empty when nothing could be retrieved).
        """
        print("\nüåê Fetching Google IP ranges from official sources...")

        collected_prefixes = []

        for source_name, url in GOOGLE_IP_RANGE_URLS.items():
            try:
                print(f"   ‚Ä¢ Fetching {source_name}...")
                response = requests.get(url, timeout=10)
                response.raise_for_status()

                payload = response.json()

                if 'prefixes' in payload:
                    for entry in payload['prefixes']:
                        # An entry may carry an IPv4 prefix, an IPv6 prefix, or both.
                        for field in ('ipv4Prefix', 'ipv6Prefix'):
                            if field in entry:
                                collected_prefixes.append(entry[field])

                retrieved = len(payload.get('prefixes', []))
                print(f"     ‚úÖ Retrieved {retrieved} prefixes")

            except Exception as err:
                print(f"     ‚ö†Ô∏è Warning: Failed to fetch {source_name}: {err!s}")

        ip_networks = []
        per_version = {4: 0, 6: 0}

        for cidr in collected_prefixes:
            try:
                parsed = ipaddress.ip_network(cidr)
            except ValueError:
                continue
            ip_networks.append(parsed)
            per_version[4 if parsed.version == 4 else 6] += 1

        ipv4_count = per_version[4]
        ipv6_count = per_version[6]

        print(f"\n‚úÖ Total IP ranges loaded: {len(ip_networks)} CIDR blocks")
        print(f"   ‚Ä¢ IPv4 ranges: {ipv4_count}")
        print(f"   ‚Ä¢ IPv6 ranges: {ipv6_count}")
        return ip_networks
    
    def load_or_fetch_ip_ranges(cache_file=IP_RANGES_CACHE_FILE):
        """Return Google's IP networks from a fresh on-disk cache, else download them.

        Parameters:
            cache_file: path of the JSON cache holding ``timestamp`` (epoch
                seconds) and ``prefixes`` (list of CIDR strings).

        Returns:
            list of ipaddress networks. The list is empty only when the download
            produced nothing and no usable cache (fresh or expired) exists.

        A cache younger than CACHE_EXPIRY_HOURS is used as-is. An empty download
        is never written to the cache; if an expired cache is available it is
        reused instead, so one failed fetch cannot disable CIDR verification for
        the next CACHE_EXPIRY_HOURS hours.
        """
        stale_networks = []  # expired but parseable cache contents, kept as a fallback

        if os.path.exists(cache_file):
            try:
                with open(cache_file, 'r') as f:
                    cache_data = json.load(f)

                cached_networks = [ipaddress.ip_network(cidr) for cidr in cache_data['prefixes']]
                if not cached_networks:
                    # An empty cache carries no information; treat it as corrupt.
                    raise ValueError("cache holds no IP ranges")

                cache_time = cache_data.get('timestamp', 0)
                cache_age_hours = (time.time() - cache_time) / 3600

                if cache_age_hours < CACHE_EXPIRY_HOURS:
                    print(f"\n‚úÖ Using cached IP ranges (age: {cache_age_hours:.1f} hours)")

                    ipv4_count = sum(1 for n in cached_networks if n.version == 4)
                    ipv6_count = sum(1 for n in cached_networks if n.version == 6)
                    print(f"   ‚Ä¢ IPv4 ranges: {ipv4_count}")
                    print(f"   ‚Ä¢ IPv6 ranges: {ipv6_count}")
                    return cached_networks

                stale_networks = cached_networks
                print(f"\n‚è∞ Cache expired, fetching fresh data...")
            except Exception as e:
                print(f"\n‚ö†Ô∏è Cache error ({e}), fetching fresh data...")

        ip_networks = fetch_google_ip_ranges()

        if not ip_networks:
            # Every download failed: do not persist the empty result.
            if stale_networks:
                print(f"   Fetch returned no IP ranges; reusing expired cache ({len(stale_networks)} ranges)")
                return stale_networks
            return ip_networks

        try:
            cache_data = {
                'timestamp': time.time(),
                'prefixes': [str(network) for network in ip_networks]
            }
            with open(cache_file, 'w') as f:
                json.dump(cache_data, f, indent=2)
            print(f"üíæ IP ranges cached")
        except Exception as e:
            # Caching is an optimisation only; report the reason and carry on.
            print(f"‚ö†Ô∏è Failed to save cache: {e}")

        return ip_networks
    
    # ============================================================================
    # UTILITY FUNCTIONS
    # ============================================================================
    
    def extract_full_timestamp_from_filename(filename):
        """Reduce a log filename to its identifying stem.

        Strips a trailing ``.gz``, then a trailing ``.log``, then a leading
        ``nginx-``. Returns the remainder, or None when nothing is left.
        """
        stem = filename
        for suffix in ('.gz', '.log'):
            if stem.endswith(suffix):
                stem = stem[:len(stem) - len(suffix)]

        prefix = 'nginx-'
        if stem.startswith(prefix):
            stem = stem[len(prefix):]

        return stem or None
    
    def extract_first_ip(ip_string):
        """Return the first comma-separated entry of an X-Forwarded-For value.

        Surrounding whitespace and one pair of enclosing square brackets are
        removed. Returns None for a falsy input or an empty first entry. The
        result is not validated as an IP address.
        """
        if not ip_string:
            return None

        first_entry, _, _ = str(ip_string).partition(',')
        candidate = first_entry.strip()

        # Bracketed form, e.g. "[2001:db8::1]".
        if candidate.startswith('[') and candidate.endswith(']'):
            candidate = candidate[1:-1]

        return candidate or None
    
    def is_googlebot(user_agent_str):
        """Return True when the user-agent contains any known Google crawler token.

        The match is a case-insensitive substring test. It says nothing about
        whether the request really came from Google.

        Parameters:
            user_agent_str: user-agent value; any object is accepted and is
                converted with ``str()``.

        Returns:
            bool: False for a falsy value, for a value whose string conversion
            fails, or when no token matches.
        """
        if not user_agent_str:
            return False

        try:
            ua_lower = str(user_agent_str).lower()
        except Exception:
            # Only ordinary errors are swallowed; KeyboardInterrupt/SystemExit
            # must still propagate (a bare `except:` would hide them).
            return False

        # NOTE(review): the specific 'googlebot-*' entries are already covered by
        # the plain 'googlebot' substring, and 'duplichecker' does not look like
        # a Google crawler token - confirm whether it is intentional. The list is
        # kept as-is so matching results do not change.
        googlebot_patterns = (
            'googlebot', 'google-inspectiontool', 'googlebot-image', 'googlebot-news',
            'googlebot-video', 'adsbot-google', 'mediapartners-google', 'apis-google',
            'google favicon', 'feedfetcher-google', 'google-read-aloud', 'duplichecker',
            'google web preview', 'google-site-verification', 'google-smartphone'
        )

        return any(pattern in ua_lower for pattern in googlebot_patterns)
    
    def is_search_api_url(url):
        """Return True when the URL contains '/search-api/v1/search/' (case-insensitive)."""
        if url:
            return '/search-api/v1/search/' in str(url).lower()
        return False
    
    def create_zip_archive(file_list, output_folder, archive_name="googlebot_data"):
        """Bundle the given files into a timestamped, deflate-compressed ZIP.

        The archive is written to ``output_folder`` as
        ``<archive_name>_<YYYYmmdd_HHMMSS>.zip``; each member is stored under its
        base name only. Returns the path of the archive.
        """
        stamp = time.strftime('%Y%m%d_%H%M%S')
        zip_filename = os.path.join(output_folder, f"{archive_name}_{stamp}.zip")

        print(f"\nüì¶ Creating ZIP archive...")
        archive = zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED)
        with archive:
            for member_path in tqdm(file_list, desc="Archiving", unit=" files"):
                archive.write(member_path, arcname=os.path.basename(member_path))

        archive_label = os.path.basename(zip_filename)
        zip_size_mb = os.path.getsize(zip_filename) / (1024**2)
        print(f"‚úÖ ZIP created: {archive_label} ({zip_size_mb:.2f} MB)")
        return zip_filename
    
    # ============================================================================
    # VERIFICATION METHODS
    # ============================================================================
    
    def verify_ip_in_range(ip, ip_networks):
        """Return True when `ip` parses as an address inside any of `ip_networks`.

        A value that is not a valid IPv4/IPv6 address yields False.
        """
        try:
            candidate = ipaddress.ip_address(ip)
        except ValueError:
            return False

        return any(candidate in network for network in ip_networks)
    
    def verify_ip_method2_dns_fast(ip):
        """Forward-confirmed reverse DNS (FcrDNS) check for one IP, using dnspython.

        Steps:
          1. PTR lookup of `ip`; the first answer is taken as the hostname.
          2. The hostname must end with one of GOOGLE_DOMAINS.
          3. A and AAAA lookups of that hostname must include `ip`.

        Parameters:
            ip: textual IPv4 or IPv6 address.

        Returns:
            bool: True only when all three steps succeed. Any DNS failure,
            timeout, or malformed input yields False; nothing is raised.
        """
        try:
            # Parse once: the forward confirmation compares address objects, so
            # equivalent IPv6 spellings (upper-case hex, uncompressed zeros) match
            # the normalised text returned by the resolver.
            target_addr = ipaddress.ip_address(ip)

            rev_name = dns.reversename.from_address(ip)

            resolver = dns.resolver.Resolver()
            resolver.timeout = DNS_TIMEOUT
            resolver.lifetime = DNS_TIMEOUT

            try:
                reverse_answers = resolver.resolve(rev_name, 'PTR')
                hostname = str(reverse_answers[0]).rstrip('.')
                hostname_lower = hostname.lower()
            except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer, dns.resolver.Timeout, dns.exception.DNSException):
                return False

            if not any(hostname_lower.endswith(domain) for domain in GOOGLE_DOMAINS):
                return False

            try:
                forward_addrs = set()

                for record_type in ('A', 'AAAA'):
                    try:
                        forward_answers = resolver.resolve(hostname, record_type)
                    except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN):
                        # No record of this type; the other family may still confirm.
                        continue
                    for rdata in forward_answers:
                        try:
                            forward_addrs.add(ipaddress.ip_address(str(rdata)))
                        except ValueError:
                            continue

                return target_addr in forward_addrs

            except (dns.resolver.Timeout, dns.exception.DNSException):
                return False

        except Exception:
            # Includes ValueError for a malformed `ip`: treat as unverified.
            return False
    
    def verify_ips_parallel_dns(ips_to_verify, desc="DNS Verification"):
        """Run verify_ip_method2_dns_fast over many IPs on a thread pool.

        Parameters:
            ips_to_verify: sized collection of IP strings.
            desc: label shown on the progress bar.

        Returns:
            dict mapping each IP to True/False; an IP whose check raised is
            recorded as False.
        """
        outcomes = {}

        with ThreadPoolExecutor(max_workers=MAX_DNS_WORKERS) as pool:
            pending = {}
            for candidate in ips_to_verify:
                pending[pool.submit(verify_ip_method2_dns_fast, candidate)] = candidate

            progress = tqdm(total=len(ips_to_verify), desc=desc, unit=" IPs")
            with progress:
                for finished in as_completed(pending):
                    try:
                        passed = finished.result(timeout=DNS_TIMEOUT + 1)
                    except Exception:
                        passed = False
                    outcomes[pending[finished]] = passed
                    progress.update(1)

        return outcomes
    
    # ============================================================================
    # MAIN PROCESSING
    # ============================================================================
    
    print("=" * 80)
    print("üéØ IN-MEMORY 3-PHASE GOOGLEBOT EXTRACTION")
    print("=" * 80)
    print(f"   Lenient Mode: {'ENABLED' if LENIENT_MODE else 'DISABLED'}")
    print(f"   FcrDNS Fallback: {'ENABLED' if ENABLE_FCRDNS_FALLBACK else 'DISABLED'}")
    print(f"   Search-API Filter: ENABLED")
    
    # Load IP ranges
    print(f"\n{'=' * 80}")
    print(f"LOADING GOOGLE IP RANGES")
    print(f"{'=' * 80}")
    
    try:
        google_ip_networks = load_or_fetch_ip_ranges()
        print(f"‚úÖ Ready with {len(google_ip_networks)} official IP ranges")
    except Exception as e:
        print(f"‚ùå Failed to load IP ranges: {str(e)}")
        return {
            'status': 'error',
            'error': f'Failed to load IP ranges: {str(e)}'
        }
    
    # Get input files
    log_files = []
    if input_file:
        if os.path.exists(input_file):
            log_files = [input_file]
        else:
            print(f"‚ùå File not found: {input_file}")
            return {
                'status': 'error',
                'error': f'File not found: {input_file}'
            }
    elif input_folder:
        if os.path.exists(input_folder):
            log_files = sorted([os.path.join(input_folder, f) for f in os.listdir(input_folder) 
                        if f.endswith('.log') or f.endswith('.log.gz')])
            if not log_files:
                print(f"‚ùå No .log files found in: {input_folder}")
                return {
                    'status': 'error',
                    'error': f'No .log files found in: {input_folder}'
                }
        else:
            print(f"‚ùå Folder not found: {input_folder}")
            return {
                'status': 'error',
                'error': f'Folder not found: {input_folder}'
            }
    else:
        print("‚ùå Please provide input_folder or input_file")
        return {
            'status': 'error',
            'error': 'Please provide input_folder or input_file'
        }
    
    os.makedirs(output_folder, exist_ok=True)
    abs_output_folder = os.path.abspath(output_folder)
    
    print(f"\nüìÅ Input files: {len(log_files)}")
    
    print(f"\nüîç IN-MEMORY 3-PHASE STRATEGY:")
    print(f"   Phase 1: Filter by Googlebot UA ‚Üí Store in memory")
    print(f"   Phase 2: Check IPs in CIDR ‚Üí Store in memory")
    print(f"   Phase 3: FcrDNS verification ‚Üí Write final output (no intermediate files)")
    print(f"   Phase 4: Search-API filter ‚Üí Final clean dataset")
    
    # ========================================================================
    # PHASE 1: IN-MEMORY UA FILTERING
    # ========================================================================
    print(f"\n{'=' * 80}")
    print(f"PHASE 1: FILTER BY GOOGLEBOT UA (IN-MEMORY)")
    print(f"{'=' * 80}")
    
    googlebot_records = []
    unique_ips_phase1 = set()
    googlebot_variants = {}
    
    stats_phase1 = {
        'total_processed': 0,
        'has_googlebot_ua': 0,
    }
    
    start_phase1 = time.time()
    
    for file_idx, log_file in enumerate(log_files, 1):
        filename = os.path.basename(log_file)
        file_size_mb = os.path.getsize(log_file) / (1024**2)
        print(f"\nüìÇ [{file_idx}/{len(log_files)}] {filename} ({file_size_mb:.1f} MB)")
        
        file_accepted = 0
        
        try:
            if log_file.endswith('.gz'):
                file_handle = gzip.open(log_file, 'rt', encoding='utf-8', errors='ignore')
            else:
                file_handle = open(log_file, 'r', encoding='utf-8', errors='ignore')
            
            with file_handle as log_reader:
                pbar = tqdm(log_reader, desc="   Filtering UA", unit=" recs", ncols=100, mininterval=0.5)
                
                for line in pbar:
                    try:
                        if not line.strip():
                            continue
                        
                        log_entry = json.loads(line.strip())
                        
                        stats_phase1['total_processed'] += 1
                        
                        user_agent = log_entry.get('http_user_agent', '')
                        if user_agent:
                            user_agent = str(user_agent).strip()
                        
                        if not is_googlebot(user_agent):
                            continue
                        
                        stats_phase1['has_googlebot_ua'] += 1
                        
                        # Track variant
                        ua_lower = user_agent.lower()
                        for variant in ['googlebot-image', 'googlebot-news', 'googlebot-video', 
                                       'google-inspectiontool', 'adsbot-google', 'mediapartners-google']:
                            if variant in ua_lower:
                                key = variant.replace('-', ' ').title().replace(' ', '-')
                                googlebot_variants[key] = googlebot_variants.get(key, 0) + 1
                                break
                        else:
                            if 'googlebot' in ua_lower:
                                googlebot_variants['Googlebot (standard)'] = googlebot_variants.get('Googlebot (standard)', 0) + 1
                        
                        # Extract all required fields
                        ip = extract_first_ip(log_entry.get('http_x_forwarded_for', ''))
                        if ip:
                            unique_ips_phase1.add(ip)
                        
                        # Store record with all fields
                        record = {
                            'time_iso8601': str(log_entry.get('time_iso8601', '')).strip(),
                            'request_uri': str(log_entry.get('request_uri', '')).strip(),
                            'status': str(log_entry.get('status', '')).strip(),
                            'http_user_agent': user_agent,
                            'http_x_forwarded_for': ip if ip else '',
                            'geoip_country_code': str(log_entry.get('geoip_country_code', '')).strip(),
                            'upstream_response_time': str(log_entry.get('upstream_response_time', '')).strip(),
                            'bytes_sent': str(log_entry.get('bytes_sent', log_entry.get('body_bytes_sent', ''))).strip(),
                            'source_file': filename
                        }
                        
                        googlebot_records.append(record)
                        file_accepted += 1
                    
                    except Exception:
                        continue
                
                pbar.close()
            
            print(f"   ‚úÖ Accepted: {file_accepted:,} records with Googlebot UA")
        
        except Exception as e:
            print(f"   ‚ùå Error: {str(e)}")
            continue
    
    elapsed_phase1 = time.time() - start_phase1
    
    print(f"\n{'=' * 80}")
    print(f"‚úÖ PHASE 1 COMPLETE")
    print(f"{'=' * 80}")
    print(f"   Total processed: {stats_phase1['total_processed']:,}")
    print(f"   Googlebot UA filtered: {len(googlebot_records):,}")
    print(f"   Unique IPs: {len(unique_ips_phase1):,}")
    print(f"   Time: {elapsed_phase1:.1f}s ({elapsed_phase1/60:.1f} min)")
    
    # ========================================================================
    # PHASE 2: CIDR CHECK (IN-MEMORY)
    # ========================================================================
    print(f"\n{'=' * 80}")
    print(f"PHASE 2: CHECK IPS AGAINST CIDR RANGES (IN-MEMORY)")
    print(f"{'=' * 80}")
    
    print(f"\nüîç Checking {len(unique_ips_phase1):,} IPs against CIDR ranges...")
    verified_ips_cidr = {}
    for ip in tqdm(unique_ips_phase1, desc="Checking IPs", unit=" IPs"):
        verified_ips_cidr[ip] = verify_ip_in_range(ip, google_ip_networks)
    
    ips_in_range = sum(1 for v in verified_ips_cidr.values() if v)
    unique_ips_outside_cidr = {ip for ip, in_range in verified_ips_cidr.items() if not in_range}
    
    print(f"   ‚úÖ IPs in CIDR ranges: {ips_in_range:,}/{len(unique_ips_phase1):,}")
    print(f"   ‚ö†Ô∏è IPs outside ranges: {len(unique_ips_outside_cidr):,} (will check with FcrDNS in Phase 3)")
    
    elapsed_phase2 = time.time() - start_phase1 - elapsed_phase1
    
    print(f"\n{'=' * 80}")
    print(f"‚úÖ PHASE 2 COMPLETE")
    print(f"{'=' * 80}")
    print(f"   Time: {elapsed_phase2:.1f}s ({elapsed_phase2/60:.1f} min)")
    
    # ========================================================================
    # PHASE 3: FcrDNS + WRITE FINAL OUTPUT
    # ========================================================================
    print(f"\n{'=' * 80}")
    print(f"PHASE 3: FcrDNS VERIFICATION + WRITE FINAL OUTPUT")
    print(f"{'=' * 80}")
    
    dns_verified_ips = {}
    
    if ENABLE_FCRDNS_FALLBACK and len(unique_ips_outside_cidr) > 0:
        print(f"\nüåê Starting FcrDNS verification for {len(unique_ips_outside_cidr):,} IPs outside CIDR ranges...")
        print(f"   Using {MAX_DNS_WORKERS} workers")
        
        start_dns = time.time()
        
        dns_verified_ips = verify_ips_parallel_dns(unique_ips_outside_cidr, desc="FcrDNS Verification")
        
        dns_pass = sum(1 for v in dns_verified_ips.values() if v)
        elapsed_dns = time.time() - start_dns
        
        print(f"\n‚úÖ FcrDNS Verification Results:")
        print(f"   Passed: {dns_pass:,}/{len(unique_ips_outside_cidr):,}")
        print(f"   Time: {elapsed_dns:.1f}s ({elapsed_dns/60:.1f} min)")
        if len(unique_ips_outside_cidr) > 0:
            print(f"   Speed: {len(unique_ips_outside_cidr)/elapsed_dns:.0f} IPs/sec")
    
    print(f"\nüìù Writing final verified records (with search-API filtering)...")
    
    phase3_files = []
    total_phase3_before_filter = 0
    total_phase3_after_filter = 0
    search_api_filtered = 0
    
    rejection_stats = {
        'in_cidr': 0,
        'missing_ip_accepted': 0,
        'invalid_ip_accepted': 0,
        'fcrdns_passed': 0,
        'fcrdns_failed_rejected': 0
    }
    
    # Group records by source file
    records_by_file = {}
    for record in googlebot_records:
        source_file = record['source_file']
        if source_file not in records_by_file:
            records_by_file[source_file] = []
        records_by_file[source_file].append(record)
    
    start_write = time.time()
    
    # Column order of every output CSV.
    output_fields = [
        'time_iso8601', 'request_uri', 'status', 'http_user_agent',
        'http_x_forwarded_for', 'geoip_country_code',
        'upstream_response_time', 'bytes_sent'
    ]
    # Placeholder tokens treated the same as an absent client IP.
    missing_ip_tokens = ('none', 'null', '-', 'unknown')

    # Syntax-check results per distinct IP string, so each is parsed only once.
    ip_syntax_cache = {}

    def _is_parseable_ip(value):
        """Return True when `value` is a syntactically valid IPv4/IPv6 address."""
        if value not in ip_syntax_cache:
            try:
                ipaddress.ip_address(value)
                ip_syntax_cache[value] = True
            except ValueError:
                ip_syntax_cache[value] = False
        return ip_syntax_cache[value]

    for source_file, records in tqdm(sorted(records_by_file.items()), desc="Writing files", unit=" files"):
        file_timestamp = extract_full_timestamp_from_filename(source_file)

        if file_timestamp:
            base_filename = f"googlebot_{file_timestamp}"
        else:
            base_filename = "googlebot_file"

        current_file_index = 1
        current_file_records = 0
        current_writer = None
        current_csvfile = None

        def create_output_file(file_index):
            """Open output part `file_index` for the current source file, write the
            header, record its path in phase3_files, and return (file handle, writer)."""
            if file_index == 1:
                csv_filename = os.path.join(output_folder, f"{base_filename}.csv")
            else:
                csv_filename = os.path.join(output_folder, f"{base_filename}_part{file_index}.csv")

            csvfile = open(csv_filename, 'w', newline='', encoding='utf-8-sig')
            writer = csv.DictWriter(csvfile, fieldnames=output_fields, quoting=csv.QUOTE_ALL)
            writer.writeheader()
            phase3_files.append(csv_filename)
            return csvfile, writer

        current_csvfile, current_writer = create_output_file(current_file_index)

        # try/finally guarantees the open part is closed even if a row fails to
        # write, so no handle leaks and buffered rows are flushed.
        try:
            for record in records:
                ip = record['http_x_forwarded_for'].strip()

                # Determine if record should be accepted
                accept_record = False

                if not ip or ip.lower() in missing_ip_tokens:
                    if LENIENT_MODE:
                        accept_record = True
                        rejection_stats['missing_ip_accepted'] += 1
                elif not _is_parseable_ip(ip):
                    # Malformed address. It must be classified before the CIDR
                    # lookup: Phase 2 stores a (False) entry for every non-empty
                    # IP, so an "ip not in verified_ips_cidr" test never fires
                    # and lenient mode would never apply to malformed values.
                    if LENIENT_MODE:
                        accept_record = True
                        rejection_stats['invalid_ip_accepted'] += 1
                elif verified_ips_cidr.get(ip, False):
                    accept_record = True
                    rejection_stats['in_cidr'] += 1
                elif dns_verified_ips.get(ip, False):
                    accept_record = True
                    rejection_stats['fcrdns_passed'] += 1
                else:
                    rejection_stats['fcrdns_failed_rejected'] += 1

                if not accept_record:
                    continue

                total_phase3_before_filter += 1

                # Check for search-API URL
                if is_search_api_url(record['request_uri']):
                    search_api_filtered += 1
                    continue

                total_phase3_after_filter += 1

                # Roll over to a new part once the current one is full.
                if current_file_records >= MAX_RECORDS_PER_FILE:
                    current_csvfile.close()
                    current_file_index += 1
                    current_csvfile, current_writer = create_output_file(current_file_index)
                    current_file_records = 0

                # Write only the output columns ('source_file' is internal).
                current_writer.writerow({field: record[field] for field in output_fields})
                current_file_records += 1
        finally:
            if current_csvfile and not current_csvfile.closed:
                current_csvfile.close()
    
    elapsed_write = time.time() - start_write
    
    print(f"\n{'=' * 80}")
    print(f"‚úÖ PHASE 3 COMPLETE")
    print(f"{'=' * 80}")
    print(f"   Before search-API filter: {total_phase3_before_filter:,}")
    print(f"   Search-API URLs filtered: {search_api_filtered:,}")
    print(f"   After search-API filter: {total_phase3_after_filter:,}")
    print(f"   Breakdown:")
    print(f"   - CIDR verified: {rejection_stats['in_cidr']:,}")
    print(f"   - Missing IP (lenient): {rejection_stats['missing_ip_accepted']:,}")
    print(f"   - Invalid IP (lenient): {rejection_stats['invalid_ip_accepted']:,}")
    print(f"   - FcrDNS passed: {rejection_stats['fcrdns_passed']:,}")
    print(f"   - FcrDNS failed (rejected): {rejection_stats['fcrdns_failed_rejected']:,}")
    print(f"   Output files: {len(phase3_files)}")
    
    # ========================================================================
    # SUMMARY
    # ========================================================================
    total_time = time.time() - start_phase1
    
    gsc_expected = 14913024
    match_rate = (total_phase3_after_filter / gsc_expected * 100) if total_phase3_after_filter > 0 else 0
    
    print(f"\n{'=' * 80}")
    print(f"‚úÖ COMPLETE: IN-MEMORY 3-PHASE EXTRACTION")
    print(f"{'=' * 80}")
    
    print(f"\n‚è±Ô∏è  PERFORMANCE:")
    print(f"   Phase 1 (UA Filter): {elapsed_phase1:.1f}s ({elapsed_phase1/60:.1f} min)")
    print(f"   Phase 2 (CIDR Check): {elapsed_phase2:.1f}s ({elapsed_phase2/60:.1f} min)")
    print(f"   Phase 3 (FcrDNS + Write): {elapsed_write:.1f}s ({elapsed_write/60:.1f} min)")
    print(f"   TOTAL: {total_time:.1f}s ({total_time/60:.1f} min)")
    
    print(f"\nüìä RESULTS:")
    print(f"   Phase 1: {len(googlebot_records):,} (Googlebot UA)")
    print(f"   Phase 2: {len(unique_ips_phase1):,} unique IPs checked")
    print(f"   Phase 3 (before search-API): {total_phase3_before_filter:,}")
    print(f"   Phase 3 (after search-API): {total_phase3_after_filter:,} ‚úÖ")
    
    print(f"\nü§ñ GOOGLEBOT VARIANTS:")
    for variant, count in sorted(googlebot_variants.items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"   ‚Ä¢ {variant}: {count:,}")
    
    print(f"\nüéØ GSC COMPARISON:")
    print(f"   GSC: {gsc_expected:,}")
    print(f"   Output: {total_phase3_after_filter:,}")
    print(f"   Difference: {abs(gsc_expected - total_phase3_after_filter):,}")
    print(f"   Match: {match_rate:.1f}%")
    
    if match_rate >= 98:
        print(f"   üèÜ NEAR-PERFECT!")
    elif match_rate >= 95:
        print(f"   ‚úÖ EXCELLENT MATCH!")
    elif match_rate >= 90:
        print(f"   ‚úÖ VERY GOOD!")
    
    # Create ZIP
    create_zip = True
    if create_zip:
        zip_filename = create_zip_archive(phase3_files, output_folder, "googlebot_final")
    
    return {
        'status': 'success',
        'output_folder': abs_output_folder,
        'phase3_files': phase3_files,
        'final_records': total_phase3_after_filter,
        'search_api_filtered': search_api_filtered,
        'match_rate': match_rate,
        'rejection_stats': rejection_stats
    }


In [None]:
def execute_phase3_2(input_folder, output_folder, crawl_threshold=10,
                     base_url='https://www.alamy.com', max_records_per_file=500000):
    """
    Phase 3.2: Host Page Counting
    Identifies "host pages": URLs crawled at least 3 times whose consecutive
    crawls are never more than `crawl_threshold` whole days apart.
    
    Parameters:
    -----------
    input_folder : str
        Path to a folder of Phase 2 output files (.csv / .xlsx / .xls), or to a
        single such file. The data must contain the columns request_uri,
        time_iso8601, http_user_agent and status.
    output_folder : str
        Directory where host_pages.csv (or host_pages_partN.csv) will be saved.
        Created if it does not exist.
    crawl_threshold : int
        Maximum gap in days between crawls (default: 10)
    base_url : str
        Scheme + host prepended to every request_uri to build the full URL
        (default: 'https://www.alamy.com')
    max_records_per_file : int
        Maximum number of rows per output CSV; larger results are split into
        numbered part files (default: 500000)
    
    Returns:
    --------
    dict
        On success:
        {
            'status': 'success',
            'host_pages': count of host pages identified,
            'total_pages': total unique pages analyzed,
            'output_file': path of the first output file (or None),
            'output_files': list of all output file paths,
            'dataframe': host_pages DataFrame
        }
        On failure:
        {
            'status': 'error',
            'error': human-readable message
        }
    """
    
    import pandas as pd
    import numpy as np
    import os
    import matplotlib.pyplot as plt
    import ipywidgets as widgets
    from IPython.display import display
    from tqdm import tqdm
    import warnings
    warnings.filterwarnings('ignore')
    
    required_cols = ['request_uri', 'time_iso8601', 'http_user_agent', 'status']
    
    def fail(message):
        # Every failure path prints the message and returns the same error-dict shape.
        print(f"‚ùå {message}")
        return {'status': 'error', 'error': message}
    
    def read_table(path):
        # Excel by extension, everything else is treated as CSV (utf-8-sig tolerates a BOM).
        if path.endswith(('.xlsx', '.xls')):
            return pd.read_excel(path)
        return pd.read_csv(path, encoding='utf-8-sig', low_memory=False)
    
    def most_common(values, fallback):
        # Most frequent non-null value; `fallback` when the group has no non-null values.
        modes = values.mode()
        return modes.iloc[0] if len(modes) > 0 else fallback
    
    if max_records_per_file < 1:
        return fail('max_records_per_file must be at least 1')
    
    os.makedirs(output_folder, exist_ok=True)
    
    print("üìñ Loading crawl data...")
    dfs = []
    
    if os.path.isdir(input_folder):
        # Skip Office lock files (~$...) and hidden files; sort for a deterministic load order.
        files = sorted(f for f in os.listdir(input_folder) 
                       if f.endswith(('.csv', '.xlsx', '.xls')) 
                       and not f.startswith('~$')
                       and not f.startswith('.'))
        
        if not files:
            return fail(f'No CSV/Excel files found in folder: {input_folder}')
            
        print(f"üìÅ Found {len(files)} file(s) in folder")
        
        for file in tqdm(files, desc="Reading files"):
            try:
                dfs.append(read_table(os.path.join(input_folder, file)))
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole run.
                print(f"‚ö†Ô∏è Error reading {file}: {str(e)[:100]}")
                continue
        
        if not dfs:
            return fail('No data loaded from folder')
            
        print("   ‚îú‚îÄ Concatenating DataFrames...")
        df = pd.concat(dfs, ignore_index=True)
        print(f"‚úÖ Loaded {len(df):,} records from {len(dfs)} file(s)")
        
    else:
        if not os.path.exists(input_folder):
            return fail(f'File/folder not found: {input_folder}')
        
        try:
            df = read_table(input_folder)
        except Exception as e:
            return fail(f'Error reading {input_folder}: {str(e)[:100]}')
        print(f"‚úÖ Loaded {len(df):,} records from single file")
    
    # Validate the schema up front so a wrong input yields an error dict, not a KeyError.
    missing_cols = [c for c in required_cols if c not in df.columns]
    if missing_cols:
        return fail(f"Missing required column(s): {', '.join(missing_cols)}")
    if df.empty:
        return fail('Input contains no crawl records')
    
    # Memory optimization: keep only the columns the analysis needs
    print("‚ö° Optimizing memory...")
    progress_mem = widgets.IntProgress(value=0, min=0, max=100, description='Memory:', bar_style='info')
    display(progress_mem)
    
    progress_mem.value = 50
    df = df[required_cols].copy()
    progress_mem.value = 100
    
    # Vectorized URL cleaning
    print("‚ö° Processing URLs...")
    progress_url = widgets.IntProgress(value=0, min=0, max=100, description='URLs:', bar_style='info')
    display(progress_url)
    
    progress_url.value = 33
    # Coerce instead of raising so a handful of malformed rows cannot kill the run.
    df['timestamp'] = pd.to_datetime(df['time_iso8601'], errors='coerce')
    progress_url.value = 66
    # Drop the query string, then prefix the site origin.
    df['url'] = base_url + df['request_uri'].str.split('?').str[0].fillna('')
    progress_url.value = 100
    
    # Drop intermediate columns
    df = df[['url', 'timestamp', 'http_user_agent', 'status']].copy()
    
    unparsed = int(df['timestamp'].isna().sum())
    if unparsed:
        # Rows without a usable timestamp cannot take part in gap analysis.
        print(f"‚ö†Ô∏è Dropping {unparsed:,} record(s) with unparseable timestamps")
        df = df[df['timestamp'].notna()].copy()
    if df.empty:
        return fail('No records with a valid timestamp')
    
    min_date, max_date = df['timestamp'].min(), df['timestamp'].max()
    period_days = (max_date - min_date).days + 1
    total_pages = df['url'].nunique()
    
    print(f"üìÖ Period: {min_date.date()} to {max_date.date()} ({period_days} days)")
    print(f"üìä Total unique pages: {total_pages:,}")
    
    print("\n‚öôÔ∏è Calculating page statistics...")
    
    # Step 1: cheap per-URL aggregates over the full data set
    print("   ‚îú‚îÄ Step 1/6: Aggregating crawl counts (lightweight)...")
    progress_group1 = widgets.IntProgress(value=0, min=0, max=100, description='Counts:', bar_style='info')
    display(progress_group1)
    
    progress_group1.value = 30
    url_stats = df.groupby('url', sort=False).agg({
        'timestamp': ['count', 'min', 'max']
    }).reset_index()
    url_stats.columns = ['url', 'crawl_count', 'first_crawl', 'last_crawl']
    progress_group1.value = 100
    print(f"   ‚îÇ     ‚îî‚îÄ ‚úÖ Aggregated {len(url_stats):,} unique URLs")
    
    # Step 2: only URLs with >=3 crawls can qualify, so the expensive
    # per-URL work below runs on this reduced set.
    print("   ‚îú‚îÄ Step 2/6: Pre-filtering URLs (‚â•3 crawls)...")
    progress_filter = widgets.IntProgress(value=0, min=0, max=100, description='Filtering:', bar_style='info')
    display(progress_filter)
    
    progress_filter.value = 50
    candidate_urls = url_stats[url_stats['crawl_count'] >= 3]['url'].tolist()
    df_candidates = df[df['url'].isin(candidate_urls)].copy()
    progress_filter.value = 100
    print(f"   ‚îÇ     ‚îî‚îÄ ‚úÖ Filtered to {len(candidate_urls):,} candidates ({len(df_candidates):,} records)")
    
    # Step 3: one list of crawl timestamps per candidate URL
    print("   ‚îú‚îÄ Step 3/6: Grouping timestamps (filtered set)...")
    progress_group2 = widgets.IntProgress(value=0, min=0, max=100, description='Timestamps:', bar_style='info')
    display(progress_group2)
    
    progress_group2.value = 30
    timestamp_groups = df_candidates.groupby('url', sort=False)['timestamp'].apply(list).reset_index()
    timestamp_groups.columns = ['url', 'timestamps']
    progress_group2.value = 100
    print(f"   ‚îÇ     ‚îî‚îÄ ‚úÖ Created timestamp lists for {len(timestamp_groups):,} URLs")
    
    # Step 4: dominant user agent per candidate URL
    print("   ‚îú‚îÄ Step 4/6: Extracting user agents...")
    progress_ua = widgets.IntProgress(value=0, min=0, max=100, description='User Agents:', bar_style='info')
    display(progress_ua)
    
    progress_ua.value = 50
    user_agent_groups = df_candidates.groupby('url', sort=False)['http_user_agent'].apply(
        lambda x: most_common(x, x.iloc[0])
    ).reset_index()
    user_agent_groups.columns = ['url', 'user_agent']
    progress_ua.value = 100
    print(f"   ‚îÇ     ‚îî‚îÄ ‚úÖ Extracted user agents for {len(user_agent_groups):,} URLs")
    
    # Step 5: dominant status code per candidate URL (200 when none is recorded)
    print("   ‚îú‚îÄ Step 5/6: Extracting status codes...")
    progress_status = widgets.IntProgress(value=0, min=0, max=100, description='Status Codes:', bar_style='info')
    display(progress_status)
    
    progress_status.value = 50
    status_groups = df_candidates.groupby('url', sort=False)['status'].apply(
        lambda x: most_common(x, 200)
    ).reset_index()
    status_groups.columns = ['url', 'status_code']
    progress_status.value = 100
    print(f"   ‚îÇ     ‚îî‚îÄ ‚úÖ Extracted status codes for {len(status_groups):,} URLs")
    
    # Merge all stats; non-candidate URLs get NaN timestamps / user agent and status 200
    url_groups = url_stats.merge(timestamp_groups, on='url', how='left')
    url_groups = url_groups.merge(user_agent_groups, on='url', how='left')
    url_groups = url_groups.merge(status_groups, on='url', how='left')
    url_groups['status_code'] = url_groups['status_code'].fillna(200)
    
    url_groups['crawl_frequency'] = url_groups['crawl_count'] / period_days
    
    # Step 6: Host eligibility check
    print(f"   ‚îî‚îÄ Step 6/6: Checking host eligibility (‚â•3 crawls + max {crawl_threshold}-day gap)...")
    
    def check_host_eligibility_fast(timestamps, min_crawls=3, max_gap_days=crawl_threshold):
        # Non-candidates arrive here as NaN from the left merge, hence the list check.
        if timestamps is None or not isinstance(timestamps, list) or len(timestamps) < min_crawls:
            return False
        sorted_ts = sorted(timestamps)
        # Gaps are measured in whole days (timedelta.days truncates the fractional part).
        gaps = [(sorted_ts[i+1] - sorted_ts[i]).days for i in range(len(sorted_ts) - 1)]
        return all(gap <= max_gap_days for gap in gaps)
    
    tqdm.pandas(desc="   ‚îÇ     ‚îî‚îÄ Validating intervals")
    url_groups['meets_interval'] = url_groups['timestamps'].progress_apply(check_host_eligibility_fast)
    
    # Classification
    print("\n‚úÖ Classifying host pages...")
    url_groups['is_host'] = (url_groups['crawl_count'] >= 3) & (url_groups['meets_interval'])
    
    url_groups['authority_score'] = url_groups['crawl_count']
    
    host_pages = url_groups[url_groups['is_host']].copy()
    host_pages = host_pages.sort_values('authority_score', ascending=False)
    
    print(f"   ‚îî‚îÄ ‚úÖ Identified {len(host_pages):,} host pages ({len(host_pages)/total_pages*100:.2f}% of total)")
    
    # Analyze Googlebot types
    print("\nü§ñ Analyzing Googlebot types...")
    
    def extract_bot_type(user_agent):
        # Order matters: specific crawler names are tested before the generic 'googlebot'.
        if pd.isna(user_agent):
            return 'Unknown'
        
        ua_str = str(user_agent).lower()
        
        if 'smartphone' in ua_str and 'googlebot' in ua_str:
            return 'Googlebot-Smartphone'
        elif 'googlebot-image' in ua_str:
            return 'Googlebot-Image'
        elif 'googlebot-video' in ua_str:
            return 'Googlebot-Video'
        elif 'googlebot-news' in ua_str:
            return 'Googlebot-News'
        elif 'adsbot' in ua_str:
            return 'AdsBot-Google'
        elif 'mediapartners' in ua_str:
            return 'Mediapartners-Google'
        elif 'mobile' in ua_str and 'googlebot' in ua_str:
            return 'Googlebot-Mobile-Legacy' 
        elif 'googlebot' in ua_str:
            return 'Googlebot-Desktop'
        elif 'google' in ua_str:
            return 'Other-Google-Bot'
        else:
            return 'Non-Google'
    
    df['bot_type'] = df['http_user_agent'].apply(extract_bot_type)
    bot_stats = df.groupby('bot_type')['url'].nunique().reset_index()
    bot_stats.columns = ['bot_type', 'unique_urls']
    bot_stats = bot_stats.sort_values('unique_urls', ascending=False)
    
    print(f"   ‚îî‚îÄ ‚úÖ Analyzed {len(bot_stats)} bot types")
    
    # Visualizations
    print("\nüìä Generating visualizations...")
    progress_viz = widgets.IntProgress(value=0, min=0, max=100, description='Visuals:', bar_style='warning')
    display(progress_viz)
    
    progress_viz.value = 20
    df['date'] = df['timestamp'].dt.date
    host_urls = set(host_pages['url'].tolist())
    df['is_host_crawl'] = df['url'].isin(host_urls)
    
    progress_viz.value = 40
    # Both series count UNIQUE pages per day so the percentage below stays within 0-100.
    # (Summing the boolean flag would count crawl hits, not pages.)
    pages_per_day = df.groupby('date')['url'].nunique()
    host_per_day = df[df['is_host_crawl']].groupby('date')['url'].nunique()
    daily_stats = pd.DataFrame({
        'date': pages_per_day.index,
        'total_pages_crawled': pages_per_day.values,
        'host_pages_crawled': host_per_day.reindex(pages_per_day.index, fill_value=0).values
    })
    progress_viz.value = 60
    
    # Figures are rendered into Output widgets now and displayed after the summary.
    output1 = widgets.Output()
    
    with output1:
        fig, ax = plt.subplots(2, 1, figsize=(14, 10))
        
        ax[0].plot(daily_stats['date'], daily_stats['total_pages_crawled'], 
                   label='Total Pages Crawled', linewidth=2, color='steelblue', alpha=0.7)
        ax[0].plot(daily_stats['date'], daily_stats['host_pages_crawled'], 
                   label='Host Pages Crawled', linewidth=2, color='orangered')
        ax[0].set_xlabel('Date', fontsize=12)
        ax[0].set_ylabel('Page Count', fontsize=12)
        ax[0].set_title('Host Pages vs Total Pages Crawled Over Time', fontsize=14, fontweight='bold')
        ax[0].legend(loc='upper left', fontsize=10)
        ax[0].grid(True, alpha=0.3)
        plt.setp(ax[0].xaxis.get_majorticklabels(), rotation=45)
        
        daily_stats['host_pct'] = (daily_stats['host_pages_crawled'] / daily_stats['total_pages_crawled']) * 100
        ax[1].fill_between(daily_stats['date'], daily_stats['host_pct'], 
                           color='mediumseagreen', alpha=0.5, label='Host Page %')
        ax[1].plot(daily_stats['date'], daily_stats['host_pct'], 
                   color='darkgreen', linewidth=2)
        ax[1].axhline(y=daily_stats['host_pct'].mean(), color='red', 
                      linestyle='--', label=f'Avg: {daily_stats["host_pct"].mean():.1f}%')
        ax[1].set_xlabel('Date', fontsize=12)
        ax[1].set_ylabel('Host Page %', fontsize=12)
        ax[1].set_title('Host Page Crawl Percentage Over Time', fontsize=14, fontweight='bold')
        ax[1].legend(loc='upper left', fontsize=10)
        ax[1].grid(True, alpha=0.3)
        plt.setp(ax[1].xaxis.get_majorticklabels(), rotation=45)
        
        plt.tight_layout()
        plt.show()
        plt.close(fig)  # release figure memory once rendered
    
    output2 = widgets.Output()
    
    with output2:
        fig, ax = plt.subplots(1, 2, figsize=(16, 6))
        
        top_10 = host_pages.head(10)
        colors_gradient = plt.cm.Reds(np.linspace(0.4, 0.9, len(top_10)))
        
        ax[0].barh(range(len(top_10)), top_10['crawl_count'].values, color=colors_gradient)
        ax[0].set_yticks(range(len(top_10)))
        ax[0].set_yticklabels([f"{i+1}. {url.split('/')[-1][:30]}..." for i, url in enumerate(top_10['url'].values)], fontsize=9)
        ax[0].set_xlabel('Crawl Count', fontsize=12, fontweight='bold')
        ax[0].set_title('Top 10 Host Pages by Crawl Count', fontsize=14, fontweight='bold')
        ax[0].invert_yaxis()
        ax[0].grid(axis='x', alpha=0.3)
        
        for i, v in enumerate(top_10['crawl_count'].values):
            ax[0].text(v + max(top_10['crawl_count'])*0.01, i, f'{v:.0f}', 
                      va='center', fontsize=9, fontweight='bold')
        
        colors_bot = plt.cm.Blues(np.linspace(0.4, 0.9, len(bot_stats)))
        
        ax[1].bar(range(len(bot_stats)), bot_stats['unique_urls'].values, color=colors_bot, edgecolor='black', linewidth=1)
        ax[1].set_xticks(range(len(bot_stats)))
        ax[1].set_xticklabels(bot_stats['bot_type'].values, rotation=45, ha='right', fontsize=9)
        ax[1].set_ylabel('Unique URLs Crawled', fontsize=12, fontweight='bold')
        ax[1].set_title('Googlebot Types - URL Coverage', fontsize=14, fontweight='bold')
        ax[1].grid(axis='y', alpha=0.3)
        
        for i, v in enumerate(bot_stats['unique_urls'].values):
            ax[1].text(i, v + max(bot_stats['unique_urls'])*0.01, f'{v:,}', 
                      ha='center', va='bottom', fontsize=9, fontweight='bold')
        
        plt.tight_layout()
        plt.show()
        plt.close(fig)
    
    progress_viz.value = 80
    
    output3 = widgets.Output()
    
    with output3:
        fig, ax = plt.subplots(1, 1, figsize=(10, 6))
        
        categories = ['Host Pages', 'Regular Pages']
        counts = [len(host_pages), total_pages - len(host_pages)]
        colors = ['#ff6b6b', '#4ecdc4']
        
        ax.bar(categories, counts, color=colors, edgecolor='black', linewidth=1.5, width=0.6)
        ax.set_ylabel('Page Count', fontsize=12, fontweight='bold')
        ax.set_title('Host Pages vs Regular Pages Distribution', fontsize=14, fontweight='bold')
        ax.text(0, counts[0] + max(counts)*0.02, f'{counts[0]:,}', 
                   ha='center', fontsize=12, fontweight='bold')
        ax.text(1, counts[1] + max(counts)*0.02, f'{counts[1]:,}', 
                   ha='center', fontsize=12, fontweight='bold')
        ax.grid(axis='y', alpha=0.3)
        
        plt.tight_layout()
        plt.show()
        plt.close(fig)
    
    progress_viz.value = 100
    print("   ‚îî‚îÄ ‚úÖ Visualizations complete")
    
    # Save output
    print("\nüíæ Saving output files...")
    progress_save = widgets.IntProgress(value=0, min=0, max=100, description='Saving:', bar_style='success')
    display(progress_save)
    
    output_cols = ['url', 'crawl_count', 'crawl_frequency', 'user_agent', 'status_code']
    
    output_files = []
    total_rows = len(host_pages)
    
    if total_rows <= max_records_per_file:
        progress_save.value = 50
        output_path = os.path.join(output_folder, 'host_pages.csv')
        host_pages[output_cols].to_csv(output_path, index=False, encoding='utf-8-sig')
        output_files.append(output_path)
        progress_save.value = 100
        print(f"   ‚îî‚îÄ Saved: {output_path}")
    else:
        # Ceiling division: an exact multiple of the limit must not produce an empty last part.
        num_parts = -(-total_rows // max_records_per_file)
        print(f"   ‚îú‚îÄ Splitting into {num_parts} parts...")
        for i in tqdm(range(num_parts), desc="   ‚îÇ  ‚îî‚îÄ Writing files"):
            start_idx = i * max_records_per_file
            end_idx = min(start_idx + max_records_per_file, total_rows)
            part_df = host_pages.iloc[start_idx:end_idx]
            output_path = os.path.join(output_folder, f'host_pages_part{i+1}.csv')
            part_df[output_cols].to_csv(output_path, index=False, encoding='utf-8-sig')
            output_files.append(output_path)
            progress_save.value = int((i+1) / num_parts * 100)
        print(f"   ‚îî‚îÄ Saved {num_parts} files")
    
    print("\n" + "="*80)
    print("üìä PHASE 3.2 COMPLETE: HOST PAGE DETECTION")
    print("="*80)
    print(f"‚úÖ Host Page Criteria: ‚â•3 crawls + max {crawl_threshold}-day gap")
    print(f"‚úÖ Output: url, crawl_count, crawl_frequency, user_agent, status_code")
    print(f"‚úÖ Analyzed {len(bot_stats)} different bot types")
    print("="*80)
    
    display(output1)
    display(output2)
    display(output3)
    
    return {
        'status': 'success',
        'host_pages': len(host_pages),
        'total_pages': total_pages,
        'output_file': output_files[0] if output_files else None,
        'output_files': output_files,
        'dataframe': host_pages
    }


In [None]:
def execute_phase3_4(input_folder, output_folder, sitemap_urls, stale_threshold=30, 
                     percentile=10.0, workers=10, log_batch_size=5, test_mode=False,
                     base_url='https://www.alamy.com', max_records_per_file=500000):
    """
    Phase 3.4: Stale Page Detection (Party Bouncer)
    Compares sitemap URLs against crawl logs and reports two kinds of pages:
    "Orphan" (listed in a sitemap but never seen in the logs) and
    "Low Activity" (seen in the logs, but in the bottom `percentile` by crawl count).
    
    Parameters:
    -----------
    input_folder : str
        Path to folder containing Phase 2 output (crawl log CSV files with the
        columns request_uri, http_user_agent, time_iso8601, status)
    output_folder : str
        Directory where stale_pages.csv (or stale_pages_partN.csv) will be saved.
        Created if it does not exist.
    sitemap_urls : str or list
        Sitemap index URL(s) - can be single URL or comma/newline separated list
    stale_threshold : int
        Days since last modification to consider stale (default: 30).
        NOTE: currently only echoed in the run header; priority scoring uses
        fixed 90 / 180 day bands.
    percentile : float
        Bottom percentile for low activity detection, 0-100 (default: 10.0)
    workers : int
        Number of parallel workers for sitemap parsing (default: 10)
    log_batch_size : int
        Number of log files to process per batch (default: 5)
    test_mode : bool
        If True, limits to 10 gz files per index for testing (default: False)
    base_url : str
        Scheme + host prepended to every request_uri to build the full URL
        (default: 'https://www.alamy.com')
    max_records_per_file : int
        Maximum number of rows per output CSV; larger results are split into
        numbered part files (default: 500000)
    
    Returns:
    --------
    dict
        On success:
        {
            'status': 'success',
            'orphans': count of orphan pages,
            'low_activity': count of low activity pages,
            'total_stale': total stale pages,
            'output_files': list of output file paths,
            'dataframe': stale_pages DataFrame
        }
        On failure:
        {
            'status': 'error',
            'error': human-readable message
        }
    """
    
    import pandas as pd
    import requests
    import xml.etree.ElementTree as ET
    import gzip
    import os
    import concurrent.futures
    from tqdm.notebook import tqdm
    import warnings
    import gc
    import csv
    warnings.filterwarnings('ignore')
    
    sitemap_ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    gzip_magic = b'\x1f\x8b'
    failed_sitemaps = []  # URLs of child sitemaps that could not be fetched or parsed
    
    def decode_sitemap_payload(payload):
        # Decompress only when the bytes really are gzip; the HTTP layer may
        # already have decoded the body.
        if payload[:2] == gzip_magic:
            return gzip.decompress(payload)
        return payload
    
    # Worker function for parallel sitemap parsing
    def parse_gz_sitemap_worker(gz_url):
        # Returns a list of {'url', 'last_modified'} dicts; [] on any failure
        # (failures are recorded in failed_sitemaps rather than raised).
        try:
            # No stream=True: the whole body is needed anyway, and a streamed
            # response that is returned from early would never be released.
            response = requests.get(gz_url, timeout=60)
            if response.status_code != 200:
                failed_sitemaps.append(gz_url)
                return []
            root = ET.fromstring(decode_sitemap_payload(response.content))
            urls_data = []
            for url_elem in root.findall('sm:url', sitemap_ns):
                loc = url_elem.find('sm:loc', sitemap_ns)
                # Strip whitespace around <loc> text so it compares equal to log URLs.
                page_url = (loc.text or '').strip() if loc is not None else ''
                if not page_url:
                    continue
                lastmod = url_elem.find('sm:lastmod', sitemap_ns)
                has_lastmod = lastmod is not None and lastmod.text
                urls_data.append({
                    'url': page_url,
                    'last_modified': lastmod.text.strip() if has_lastmod else None
                })
            return urls_data
        except Exception:
            failed_sitemaps.append(gz_url)
            return []
    
    # Reject parameter values that would otherwise fail deep inside the run.
    if workers < 1 or log_batch_size < 1 or max_records_per_file < 1:
        return {
            'status': 'error',
            'error': 'workers, log_batch_size and max_records_per_file must be at least 1'
        }
    if not 0 <= percentile <= 100:
        return {
            'status': 'error',
            'error': 'percentile must be between 0 and 100'
        }
    
    os.makedirs(output_folder, exist_ok=True)
    
    # Parse sitemap URLs
    if isinstance(sitemap_urls, str):
        sitemap_list = [url.strip() for url in sitemap_urls.replace(',', '\n').split('\n') if url.strip()]
    else:
        sitemap_list = [url.strip() for url in (sitemap_urls or []) if isinstance(url, str) and url.strip()]
    
    if not sitemap_list:
        return {
            'status': 'error',
            'error': 'No sitemap URLs provided'
        }

    print("="*80)
    print(f"üï∫ STALE PAGE DETECTION: THE PARTY BOUNCER (Parallel: {workers} workers)")
    print("="*80)
    print(f"üìÅ Log folder: {input_folder}")
    print(f"üó∫Ô∏è  Sitemap indexes: {len(sitemap_list)}")
    print(f"üìä Low activity threshold: Bottom {percentile}%")
    print(f"‚è±Ô∏è  Stale threshold: {stale_threshold} days")
    
    # STEP 1: Build guest list from logs
    print("\nüìñ Step 1: Building the Guest List (Processing Logs)...")
    
    # isdir (not exists): a file path here would make os.listdir raise.
    if not os.path.isdir(input_folder):
        return {
            'status': 'error',
            'error': f'Folder not found: {input_folder}'
        }

    log_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.csv'))
    
    if not log_files:
        return {
            'status': 'error',
            'error': 'No CSV files found in input folder'
        }

    def most_common(values, fallback):
        # Most frequent non-null value; `fallback` when the group has no non-null values.
        modes = values.mode()
        return modes.iloc[0] if len(modes) > 0 else fallback

    log_columns = ['request_uri', 'http_user_agent', 'time_iso8601', 'status']
    all_log_stats = []
    first_seen = None  # earliest / latest timestamp across ALL batches
    last_seen = None
    
    for i in tqdm(range(0, len(log_files), log_batch_size), desc="   Processing Logs"):
        batch_files = log_files[i:i+log_batch_size]
        dfs = []
        for file in batch_files:
            try:
                dfs.append(pd.read_csv(os.path.join(input_folder, file), 
                                       usecols=log_columns, 
                                       encoding='utf-8-sig', low_memory=False))
            except Exception as e:
                # Best effort: report and skip unreadable files instead of hiding them.
                print(f"‚ö†Ô∏è Error reading {file}: {str(e)[:100]}")
                continue
            
        if not dfs:
            continue
        
        batch_df = pd.concat(dfs)
        batch_df['url'] = base_url + batch_df['request_uri'].str.split('?').str[0].fillna('')
        # utc=True keeps every batch comparable even if offsets differ between files.
        batch_df['timestamp'] = pd.to_datetime(batch_df['time_iso8601'], errors='coerce', utc=True)
        
        batch_min = batch_df['timestamp'].min()
        batch_max = batch_df['timestamp'].max()
        if pd.notnull(batch_min):
            first_seen = batch_min if first_seen is None else min(first_seen, batch_min)
            last_seen = batch_max if last_seen is None else max(last_seen, batch_max)
        
        batch_stats = batch_df.groupby('url').agg({
            'request_uri': 'count',
            'http_user_agent': lambda x: most_common(x, x.iloc[0]),
            'status': lambda x: most_common(x, 200)
        }).reset_index()
        batch_stats.columns = ['url', 'crawl_count', 'user_agent', 'status_code']
        
        all_log_stats.append(batch_stats)
        del batch_df, dfs
        gc.collect()

    if not all_log_stats:
        return {
            'status': 'error',
            'error': 'No log data processed'
        }

    print("   ‚îú‚îÄ Finalizing Guest List...")
    full_log_stats = pd.concat(all_log_stats).groupby('url').agg({
        'crawl_count': 'sum',
        'user_agent': 'first',
        'status_code': 'first'
    }).reset_index()
    
    # Crawl counts are summed over the whole log set, so the frequency divisor
    # must be the whole observed period as well (not the span of one batch).
    period_days = (last_seen - first_seen).days + 1 if first_seen is not None else 1
    full_log_stats['days_active'] = period_days
    
    full_log_stats['crawl_frequency'] = full_log_stats['crawl_count'] / full_log_stats['days_active']
    full_log_stats['authority_score'] = (full_log_stats['crawl_count'] * 0.7 + 
                                         full_log_stats['crawl_frequency'] * full_log_stats['days_active'] * 0.3)
    
    threshold = full_log_stats['crawl_count'].quantile(percentile / 100.0)
    print(f"   ‚îú‚îÄ Bottom {percentile}% Threshold: <= {threshold} crawls")
    
    invited_guests = set(full_log_stats['url'])
    wallflowers_df = full_log_stats[full_log_stats['crawl_count'] <= threshold]
    wallflowers = set(wallflowers_df['url'])
    log_stat_cols = ['url', 'crawl_count', 'authority_score', 'crawl_frequency', 'user_agent', 'status_code']
    
    print(f"‚úÖ Guest List Ready: {len(invited_guests):,} total, {len(wallflowers):,} low activity")

    # STEP 2: Parse sitemaps
    print("\nüì• Step 2: Checking Sitemaps at the Gate (Parallel)...")
    
    def parse_sitemap_index(index_url):
        # Returns the child sitemap URLs listed in a sitemap index; [] on failure.
        try:
            response = requests.get(index_url, timeout=60)
            response.raise_for_status()
            root = ET.fromstring(decode_sitemap_payload(response.content))
            return [loc.text.strip() for loc in root.findall('sm:sitemap/sm:loc', sitemap_ns)
                    if loc.text and loc.text.strip()]
        except Exception as e:
            print(f"   ‚îÇ  ‚îî‚îÄ ‚ö†Ô∏è Could not read sitemap index: {str(e)[:100]}")
            return []

    stale_party_list = []
    total_processed_urls = 0
    
    def check_at_gate(batch_df):
        # Classify one batch of sitemap URLs; returns only the orphan and
        # low-activity rows, enriched with crawl statistics.
        is_invited = batch_df['url'].isin(invited_guests)
        is_wallflower = batch_df['url'].isin(wallflowers)
        
        orphans = batch_df[~is_invited].copy()
        orphans['page_type'] = 'Orphan'
        orphans['crawl_count'] = 0
        orphans['authority_score'] = 0.0
        orphans['crawl_frequency'] = 0.0
        orphans['user_agent'] = 'Not Crawled'
        orphans['status_code'] = 0
        
        low_act = batch_df[is_wallflower].copy()
        low_act['page_type'] = 'Low Activity'
        
        if not low_act.empty:
            low_act = low_act.merge(full_log_stats[log_stat_cols], on='url', how='left')
            low_act['status_code'] = low_act['status_code'].fillna(200)
        
        return pd.concat([orphans, low_act])
    
    def flush_batch(records):
        # Turn buffered sitemap records into a frame, classify it, keep the stale rows.
        nonlocal total_processed_urls
        batch_df = pd.DataFrame(records)
        total_processed_urls += len(batch_df)
        processed_batch = check_at_gate(batch_df)
        if not processed_batch.empty:
            stale_party_list.append(processed_batch)
        del batch_df
        gc.collect()

    max_gz = 10 if test_mode else None
    
    for idx_num, index_url in enumerate(sitemap_list, 1):
        print(f"\n   ‚îú‚îÄ Index {idx_num}/{len(sitemap_list)}: {index_url.split('/')[-1]}")
        gz_urls = parse_sitemap_index(index_url)
        if not gz_urls: 
            continue
        if max_gz: 
            gz_urls = gz_urls[:max_gz]
            
        print(f"   ‚îÇ  ‚îú‚îÄ Spawning {workers} workers for {len(gz_urls)} files...")
        failures_before = len(failed_sitemaps)
        
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            futures = {executor.submit(parse_gz_sitemap_worker, url): url for url in gz_urls}
            batch_results = []
            
            for future in tqdm(concurrent.futures.as_completed(futures), total=len(gz_urls), desc="   ‚îÇ  ‚îî‚îÄ Parsing", leave=False):
                result = future.result()
                if result: 
                    batch_results.extend(result)
                
                # Flush periodically so the in-memory buffer stays bounded.
                if len(batch_results) > 200000:
                    flush_batch(batch_results)
                    batch_results = []
            
            if batch_results:
                flush_batch(batch_results)
        
        failed_here = len(failed_sitemaps) - failures_before
        if failed_here:
            print(f"   ‚îÇ  ‚îî‚îÄ ‚ö†Ô∏è {failed_here} sitemap file(s) could not be fetched or parsed")

    # STEP 3: Save output
    print("\nüíæ Step 3: Consolidating and Saving...")
    
    if not stale_party_list:
        return {
            'status': 'error',
            'error': 'No stale pages found'
        }
        
    final_df = pd.concat(stale_party_list, ignore_index=True)
    
    # Normalise to naive UTC on both sides: sitemap <lastmod> values may carry
    # timezone offsets, and subtracting tz-aware from tz-naive datetimes raises.
    current_date = pd.Timestamp.now(tz='UTC').tz_localize(None)
    final_df['last_modified'] = pd.to_datetime(
        final_df['last_modified'], errors='coerce', utc=True
    ).dt.tz_convert(None)
    final_df['days_since_modified'] = (current_date - final_df['last_modified']).dt.days
    
    # Priority bands by age; rows without a parseable lastmod keep priority 0.
    final_df['priority_score'] = 0
    final_df.loc[final_df['days_since_modified'] > 180, 'priority_score'] = 100
    final_df.loc[(final_df['days_since_modified'] > 90) & (final_df['days_since_modified'] <= 180), 'priority_score'] = 70
    final_df.loc[final_df['days_since_modified'] <= 90, 'priority_score'] = 40
    
    final_df = final_df.sort_values(['page_type', 'priority_score'], ascending=[False, False])
    
    output_cols = ['url', 'crawl_count', 'authority_score', 'crawl_frequency', 'user_agent', 
                   'status_code', 'page_type', 'priority_score', 'days_since_modified', 'last_modified']
    
    final_cols = [c for c in output_cols if c in final_df.columns]
    final_df = final_df[final_cols]
    
    output_files = []
    total_rows = len(final_df)
    
    if total_rows <= max_records_per_file:
        output_path = os.path.join(output_folder, 'stale_pages.csv')
        final_df.to_csv(output_path, index=False, encoding='utf-8-sig', quoting=csv.QUOTE_ALL)
        output_files.append(output_path)
        print(f"   ‚îî‚îÄ Saved: {output_path}")
    else:
        # Ceiling division: an exact multiple of the limit must not produce an empty last part.
        num_parts = -(-total_rows // max_records_per_file)
        for i in range(num_parts):
            start_idx = i * max_records_per_file
            end_idx = min(start_idx + max_records_per_file, total_rows)
            part_df = final_df.iloc[start_idx:end_idx]
            output_path = os.path.join(output_folder, f'stale_pages_part{i+1}.csv')
            part_df.to_csv(output_path, index=False, encoding='utf-8-sig', quoting=csv.QUOTE_ALL)
            output_files.append(output_path)
        print(f"   ‚îî‚îÄ Saved {num_parts} files")

    orphan_count = len(final_df[final_df['page_type']=='Orphan'])
    low_activity_count = len(final_df[final_df['page_type']=='Low Activity'])
    
    print("\n" + "="*80)
    print("üìä FINAL STATS")
    print("="*80)
    print(f"‚úÖ Total Analyzed: {total_processed_urls:,}")
    print(f"‚úÖ Stale Found: {len(final_df):,}")
    print(f"   ‚Ä¢ Orphans: {orphan_count:,}")
    print(f"   ‚Ä¢ Low Activity: {low_activity_count:,}")
    print(f"üìÅ Output Location: {os.path.abspath(output_folder)}")
    print("="*80)
    
    return {
        'status': 'success',
        'orphans': orphan_count,
        'low_activity': low_activity_count,
        'total_stale': len(final_df),
        'output_files': output_files,
        'dataframe': final_df
    }


In [None]:
def execute_phase4_1(input_folder, output_folder, model_name='all-MiniLM-L6-v2', batch_size=32):
    """
    Phase 4.1: Generate Semantic Embeddings
    Creates vector embeddings for host and stale pages for similarity computation.

    The text that gets embedded is derived from each page's URL path only
    (separators turned into spaces); page content is not fetched.

    Parameters:
    -----------
    input_folder : str
        Path to folder containing host_pages.csv and either stale_pages.csv or
        stale_pages_part*.csv from Phase 3. Both tables must have a 'url' column.
    output_folder : str
        Directory where embeddings will be saved (created if missing)
    model_name : str
        SentenceTransformer model name (default: 'all-MiniLM-L6-v2')
    batch_size : int
        Batch size for encoding (default: 32)

    Returns:
    --------
    dict
        On success:
        {
            'status': 'success',
            'host_count': number of host embeddings,
            'stale_count': number of stale embeddings (after URL de-duplication),
            'output_folder': absolute path to output folder,
            'output_files': list of generated files,
            'embedding_dimensions': embedding vector size reported by the model
        }
        On failure:
        {
            'status': 'error',
            'error': human-readable description
        }

    Raises:
    -------
    ImportError
        If the sentence_transformers package is not installed. The import is
        deferred until the input files have been validated, so input problems
        are reported first.
    """

    import pandas as pd
    import numpy as np
    from urllib.parse import urlparse
    import json
    import os
    from datetime import datetime
    import glob

    print("=" * 60)
    print("Phase 4.1: Semantic Embedding Generation")
    print("=" * 60)

    # Create output directory
    os.makedirs(output_folder, exist_ok=True)

    # ========================================
    # Step 1: Load Phase 3 Data
    # ========================================
    print("\n[1/5] Loading Phase 3 data...")

    # Load host pages
    host_pages_path = os.path.join(input_folder, 'host_pages.csv')

    if not os.path.exists(host_pages_path):
        return {
            'status': 'error',
            'error': f'Host pages file not found: {host_pages_path}'
        }

    try:
        # 'utf-8-sig' strips a leading BOM if present and reads BOM-less UTF-8 unchanged.
        host_pages = pd.read_csv(host_pages_path, encoding='utf-8-sig')
        print(f"‚úì Loaded {len(host_pages):,} host pages from {host_pages_path}")
    except Exception as e:
        return {
            'status': 'error',
            'error': f'Failed to load host_pages.csv: {str(e)}'
        }

    # Everything downstream keys off the 'url' column; fail with a clear
    # message instead of an uncaught KeyError.
    if 'url' not in host_pages.columns:
        return {
            'status': 'error',
            'error': f"host_pages.csv has no 'url' column: {host_pages_path}"
        }

    # Load stale pages (handle single file or multiple parts)
    stale_pages_files = []

    # Check for single file
    single_file = os.path.join(input_folder, 'stale_pages.csv')
    if os.path.exists(single_file):
        stale_pages_files.append(single_file)
    else:
        # Check for multiple parts (stale_pages_part*.csv)
        pattern = os.path.join(input_folder, 'stale_pages_part*.csv')
        stale_pages_files = sorted(glob.glob(pattern))

    if not stale_pages_files:
        return {
            'status': 'error',
            'error': f'No stale pages files found in {input_folder}'
        }

    # Load and concatenate all stale page files; unreadable parts are skipped
    # (best effort) and only reported on stdout.
    print(f"  Found {len(stale_pages_files)} stale page file(s):")
    stale_pages_list = []
    for file in stale_pages_files:
        try:
            df = pd.read_csv(file, encoding='utf-8-sig')
            stale_pages_list.append(df)
            print(f"    - {os.path.basename(file)}: {len(df):,} rows")
        except Exception as e:
            print(f"    ‚ö†Ô∏è Error reading {os.path.basename(file)}: {str(e)}")
            continue

    if not stale_pages_list:
        return {
            'status': 'error',
            'error': 'Failed to load any stale pages files'
        }

    stale_pages = pd.concat(stale_pages_list, ignore_index=True)
    print(f"‚úì Loaded total {len(stale_pages):,} stale pages (combined)")

    if 'url' not in stale_pages.columns:
        return {
            'status': 'error',
            'error': f"Stale pages data has no 'url' column (files in {input_folder})"
        }

    # Remove duplicates if any
    original_count = len(stale_pages)
    stale_pages = stale_pages.drop_duplicates(subset=['url'], keep='first')
    if len(stale_pages) < original_count:
        print(f"  ‚Ñπ Removed {original_count - len(stale_pages):,} duplicate URLs")

    print(f"\n‚úì Configuration complete")
    print(f"  Input folder: {input_folder}")
    print(f"  Output folder: {output_folder}")
    print(f"  Model: {model_name}")
    print(f"  Batch size: {batch_size}")

    # ========================================
    # Step 2: Extract Text from URLs
    # ========================================
    print("\n[2/5] Extracting text from URLs...")

    def extract_text_from_url(url):
        """
        Extract meaningful keywords from URL path for embedding.

        Returns 'unknown page' for missing URLs, URLs with an empty path,
        and URLs that cannot be parsed.
        """
        # A missing value would otherwise be stringified and embedded as "nan".
        if pd.isna(url):
            return 'unknown page'
        try:
            parsed = urlparse(str(url))
            path = parsed.path.strip('/')

            # Remove common file extensions
            path = path.replace('.html', '').replace('.php', '').replace('.htm', '')

            # Replace separators with spaces
            text = path.replace('/', ' ').replace('-', ' ').replace('_', ' ')

            # Clean whitespace
            text = ' '.join(text.split())

            return text if text else 'unknown page'
        except Exception:
            # Best effort: a malformed URL must not abort the whole run.
            return 'unknown page'

    # Apply to both datasets
    host_pages['text'] = host_pages['url'].apply(extract_text_from_url)
    stale_pages['text'] = stale_pages['url'].apply(extract_text_from_url)

    print(f"‚úì Extracted text for {len(host_pages):,} host pages")
    print(f"‚úì Extracted text for {len(stale_pages):,} stale pages")

    print("\nText extraction examples:")
    for i in range(min(3, len(host_pages))):
        # str() guards against non-string cells (e.g. NaN), which are not sliceable.
        print(f"  URL:  {str(host_pages.iloc[i]['url'])[:60]}...")
        print(f"  Text: {host_pages.iloc[i]['text']}")
        print()

    # ========================================
    # Step 3: Load Embedding Model
    # ========================================
    print("\n[3/5] Loading embedding model...")

    print(f"  Model: {model_name}")

    # Imported only now so that input problems above are reported without
    # requiring (or paying the import cost of) the embedding library.
    from sentence_transformers import SentenceTransformer

    try:
        model = SentenceTransformer(model_name)
        print(f"‚úì Model loaded")
        print(f"  Embedding dimensions: {model.get_sentence_embedding_dimension()}")
    except Exception as e:
        return {
            'status': 'error',
            'error': f'Failed to load model {model_name}: {str(e)}'
        }

    # ========================================
    # Step 4: Generate Embeddings
    # ========================================
    print("\n[4/5] Generating embeddings...")

    # Generate host embeddings
    print(f"\n  ‚Üí Computing embeddings for {len(host_pages):,} host pages...")
    try:
        host_embeddings = model.encode(
            host_pages['text'].tolist(),
            batch_size=batch_size,
            show_progress_bar=True,
            convert_to_numpy=True,
            normalize_embeddings=True
        )
        print(f"  ‚úì Host embeddings shape: {host_embeddings.shape}")
    except Exception as e:
        return {
            'status': 'error',
            'error': f'Failed to generate host embeddings: {str(e)}'
        }

    # Generate stale embeddings
    print(f"\n  ‚Üí Computing embeddings for {len(stale_pages):,} stale pages...")
    try:
        stale_embeddings = model.encode(
            stale_pages['text'].tolist(),
            batch_size=batch_size,
            show_progress_bar=True,
            convert_to_numpy=True,
            normalize_embeddings=True
        )
        print(f"  ‚úì Stale embeddings shape: {stale_embeddings.shape}")
    except Exception as e:
        return {
            'status': 'error',
            'error': f'Failed to generate stale embeddings: {str(e)}'
        }

    # ========================================
    # Step 5: Save Embeddings & Metadata
    # ========================================
    print("\n[5/5] Saving embeddings to disk...")

    output_files = []

    try:
        # Save embeddings as NumPy arrays
        host_emb_path = os.path.join(output_folder, 'host_embeddings.npy')
        stale_emb_path = os.path.join(output_folder, 'stale_embeddings.npy')

        np.save(host_emb_path, host_embeddings)
        np.save(stale_emb_path, stale_embeddings)

        output_files.extend([host_emb_path, stale_emb_path])

        print(f"‚úì Saved host_embeddings.npy ({host_embeddings.nbytes / 1024**2:.2f} MB)")
        print(f"‚úì Saved stale_embeddings.npy ({stale_embeddings.nbytes / 1024**2:.2f} MB)")

        # Save the combined stale_pages for Phase 4.2.
        # Row order here matches the row order of stale_embeddings.npy.
        stale_combined_path = os.path.join(output_folder, 'stale_pages_combined.csv')
        stale_pages.to_csv(stale_combined_path, index=False)
        output_files.append(stale_combined_path)
        print(f"‚úì Saved stale_pages_combined.csv (for Phase 4.2)")

        # Save metadata for reproducibility
        metadata = {
            "model_name": model_name,
            "embedding_dimensions": int(model.get_sentence_embedding_dimension()),
            "host_pages_count": int(len(host_pages)),
            "stale_pages_count": int(len(stale_pages)),
            "stale_pages_source_files": [os.path.basename(f) for f in stale_pages_files],
            "batch_size": batch_size,
            "normalized": True,
            "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "input_files": {
                "host_pages": host_pages_path,
                "stale_pages": stale_pages_files
            },
            "output_files": {
                "host_embeddings": host_emb_path,
                "stale_embeddings": stale_emb_path,
                "stale_pages_combined": stale_combined_path
            }
        }

        metadata_path = os.path.join(output_folder, 'embedding_metadata.json')
        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)

        output_files.append(metadata_path)
        print("‚úì Saved embedding_metadata.json")

    except Exception as e:
        return {
            'status': 'error',
            'error': f'Failed to save embeddings: {str(e)}'
        }

    # ========================================
    # Step 6: Verification & Summary
    # ========================================
    print("\n" + "=" * 60)
    print("Phase 4.1 Complete - Embedding Generation Summary")
    print("=" * 60)

    print("\nGenerated Files:")
    for filepath in output_files:
        size_mb = os.path.getsize(filepath) / 1024**2
        print(f"  ‚úì {os.path.basename(filepath)} ({size_mb:.2f} MB)")

    print("\nSanity Checks:")
    print(f"  Host embeddings shape: {host_embeddings.shape}")
    print(f"  Stale embeddings shape: {stale_embeddings.shape}")
    # Indexing row 0 is only valid when at least one host page was embedded.
    if len(host_embeddings) > 0:
        print(f"  Sample embedding (first 5 dims): {host_embeddings[0][:5]}")
        print(f"  Embedding norm (should be ~1.0): {np.linalg.norm(host_embeddings[0]):.4f}")

    print("\n‚úÖ Ready for Phase 4.2 (Similarity Matrix Computation)")
    print("=" * 60)

    return {
        'status': 'success',
        'host_count': len(host_pages),
        'stale_count': len(stale_pages),
        'output_folder': os.path.abspath(output_folder),
        'output_files': output_files,
        'embedding_dimensions': int(model.get_sentence_embedding_dimension())
    }


In [None]:
def execute_phase4_2(input_folder, output_folder, top_k=10, batch_size=1000):
    """
    Phase 4.2: Compute Top-K Similarity Matrix
    Finds the most similar host pages for each stale page.

    Similarity is the dot product of stale and host embedding rows, which
    equals cosine similarity only if the stored embeddings are L2-normalized.
    If there are fewer host pages than top_k, every host page is returned for
    each stale page (so fewer than top_k rows per stale page).

    Parameters:
    -----------
    input_folder : str
        Path to folder containing host_embeddings.npy and stale_embeddings.npy from Phase 4.1
    output_folder : str
        Directory where similarity results will be saved (can be same as input_folder)
    top_k : int
        Number of top similar host pages to return per stale page (default: 10)
    batch_size : int
        Number of stale pages scored per matrix multiplication, to bound memory (default: 1000)

    Returns:
    --------
    dict
        On success:
        {
            'status': 'success',
            'similarity_pairs': total number of similarity pairs,
            'output_file': path to top_k_similarities.csv,
            'avg_similarity': average similarity score,
            'similarity_range': dict with min, max, mean, median
        }
        On failure:
        {
            'status': 'error',
            'error': human-readable description
        }
    """

    import pandas as pd
    import numpy as np
    import json
    import os
    import gc

    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is cosmetic; fall back to plain iteration.
        def tqdm(iterable, **kwargs):
            return iterable

    print("=" * 60)
    print("Phase 4.2: Similarity Matrix Computation")
    print("=" * 60)

    os.makedirs(output_folder, exist_ok=True)

    print(f"\n‚úì Configuration:")
    print(f"  Input folder: {input_folder}")
    print(f"  Output folder: {output_folder}")
    print(f"  Top-K: {top_k}")
    print(f"  Batch size: {batch_size}")

    # Non-positive values would cause a ZeroDivisionError (batch_size) or
    # meaningless slices (top_k) further down.
    if top_k < 1:
        return {
            'status': 'error',
            'error': f'top_k must be >= 1, got {top_k}'
        }
    if batch_size < 1:
        return {
            'status': 'error',
            'error': f'batch_size must be >= 1, got {batch_size}'
        }

    # ========================================
    # Step 1: Load Embeddings & Metadata
    # ========================================
    print("\n[1/4] Loading embeddings...")

    host_emb_path = os.path.join(input_folder, 'host_embeddings.npy')
    stale_emb_path = os.path.join(input_folder, 'stale_embeddings.npy')
    metadata_path = os.path.join(input_folder, 'embedding_metadata.json')

    # Check if files exist
    if not os.path.exists(host_emb_path):
        return {
            'status': 'error',
            'error': f'Host embeddings not found: {host_emb_path}'
        }

    if not os.path.exists(stale_emb_path):
        return {
            'status': 'error',
            'error': f'Stale embeddings not found: {stale_emb_path}'
        }

    try:
        host_embeddings = np.load(host_emb_path)
        stale_embeddings = np.load(stale_emb_path)

        print(f"‚úì Host embeddings: {host_embeddings.shape}")
        print(f"‚úì Stale embeddings: {stale_embeddings.shape}")
    except Exception as e:
        return {
            'status': 'error',
            'error': f'Failed to load embeddings: {str(e)}'
        }

    # Load metadata if exists (informational only; failure is not fatal)
    if os.path.exists(metadata_path):
        try:
            with open(metadata_path, 'r') as f:
                metadata = json.load(f)
            print(f"‚úì Model used: {metadata['model_name']}")
            print(f"‚úì Embedding dimensions: {metadata['embedding_dimensions']}")
        except Exception:
            print(f"‚ö†Ô∏è Could not load metadata")
            metadata = {}
    else:
        metadata = {}

    # ========================================
    # Step 2: Compute Cosine Similarities (Batched)
    # ========================================
    print(f"\n[2/4] Computing cosine similarities (batched)...")
    print(f"  Processing {len(stale_embeddings):,} stale pages in batches of {batch_size}")

    # Storage for top-K results (three parallel lists, one entry per pair)
    all_stale_indices = []
    all_host_indices = []
    all_similarities = []

    num_stale = len(stale_embeddings)
    num_batches = (num_stale + batch_size - 1) // batch_size  # ceiling division

    print(f"  Total batches: {num_batches}")

    try:
        for batch_idx in tqdm(range(num_batches), desc="  Computing similarities"):
            # Get batch of stale embeddings
            start_idx = batch_idx * batch_size
            end_idx = min(start_idx + batch_size, num_stale)

            stale_batch = stale_embeddings[start_idx:end_idx]

            # Compute similarity: stale_batch @ host_embeddings.T
            # Shape: (rows in batch, num_hosts)
            similarities = np.dot(stale_batch, host_embeddings.T)

            # For each stale page, get top-K host indices
            for i, sim_row in enumerate(similarities):
                stale_idx = start_idx + i

                if len(sim_row) <= top_k:
                    # Not more hosts than top_k: rank all of them, best first.
                    top_k_indices = np.argsort(sim_row)[::-1]
                else:
                    # argpartition selects the top_k largest without a full
                    # sort; only those top_k are then ordered, best first.
                    top_k_indices = np.argpartition(sim_row, -top_k)[-top_k:]
                    top_k_indices = top_k_indices[np.argsort(sim_row[top_k_indices])[::-1]]

                top_k_scores = sim_row[top_k_indices]

                # Store results. The stale index is repeated once per match
                # actually found (which is fewer than top_k when there are
                # fewer hosts than top_k) so the three lists stay aligned.
                all_stale_indices.extend([stale_idx] * len(top_k_indices))
                all_host_indices.extend(top_k_indices)
                all_similarities.extend(top_k_scores)

            # Memory cleanup every 10 batches
            if (batch_idx + 1) % 10 == 0:
                gc.collect()

        print(f"‚úì Computed {len(all_similarities):,} similarity pairs")

    except Exception as e:
        return {
            'status': 'error',
            'error': f'Failed to compute similarities: {str(e)}'
        }

    # ========================================
    # Step 3: Create Results DataFrame
    # ========================================
    print("\n[3/4] Creating results dataframe...")

    try:
        results_df = pd.DataFrame({
            'stale_idx': all_stale_indices,
            'host_idx': all_host_indices,
            'similarity_score': all_similarities
        })

        print(f"‚úì Results shape: {results_df.shape}")
        print(f"‚úì Similarity score range: [{results_df['similarity_score'].min():.4f}, {results_df['similarity_score'].max():.4f}]")

        # Basic statistics
        print("\nSimilarity Score Distribution:")
        print(results_df['similarity_score'].describe())

    except Exception as e:
        return {
            'status': 'error',
            'error': f'Failed to create results dataframe: {str(e)}'
        }

    # ========================================
    # Step 4: Save Results
    # ========================================
    print("\n[4/4] Saving results...")

    try:
        output_file = os.path.join(output_folder, 'top_k_similarities.csv')
        results_df.to_csv(output_file, index=False)

        file_size_mb = os.path.getsize(output_file) / 1024**2
        print(f"‚úì Saved: {output_file} ({file_size_mb:.2f} MB)")

        # Summary statistics, computed once and reused for the metadata file
        # and the return value.
        scores = results_df['similarity_score']
        similarity_range = {
            "min": float(scores.min()),
            "max": float(scores.max()),
            "mean": float(scores.mean()),
            "median": float(scores.median())
        }

        # Save similarity computation metadata
        similarity_metadata = {
            "top_k": top_k,
            "batch_size": batch_size,
            "num_stale_pages": int(num_stale),
            "num_host_pages": int(len(host_embeddings)),
            "total_pairs": int(len(results_df)),
            "similarity_range": similarity_range,
            "input_files": {
                "host_embeddings": host_emb_path,
                "stale_embeddings": stale_emb_path
            },
            "output_file": output_file
        }

        metadata_file = os.path.join(output_folder, 'similarity_metadata.json')
        with open(metadata_file, 'w') as f:
            json.dump(similarity_metadata, f, indent=2)

        print(f"‚úì Saved: {metadata_file}")

        # Sample output
        print("\nSample results (first 5 rows):")
        print(results_df.head())

    except Exception as e:
        return {
            'status': 'error',
            'error': f'Failed to save results: {str(e)}'
        }

    print("\n" + "=" * 60)
    print("Phase 4.2 Complete - Similarity Matrix Summary")
    print("=" * 60)
    print(f"‚úÖ Total similarity pairs: {len(results_df):,}")
    print(f"‚úÖ Top-{top_k} matches per stale page")
    print(f"‚úÖ Average similarity: {similarity_range['mean']:.4f}")
    print(f"‚úÖ Output: {output_file}")
    print("\n‚úÖ Ready for Phase 4.3 (Scoring & Ranking)")
    print("=" * 60)

    return {
        'status': 'success',
        'similarity_pairs': len(results_df),
        'output_file': output_file,
        'avg_similarity': similarity_range['mean'],
        # Copy so the caller cannot mutate the dict that was serialized above.
        'similarity_range': dict(similarity_range)
    }


In [None]:
def execute_phase4_3(phase3_folder, phase4_folder, output_folder, 
                     weight_relevance=0.55, weight_authority=0.25, 
                     weight_crawl=0.10, weight_urgency=0.10,
                     max_rows_per_file=500000):
    """
    Phase 4.3: Scoring, Ranking & Link Recommendations
    Generates final link recommendations with scoring and placement zones
    
    Parameters:
    -----------
    phase3_folder : str
        Path to Phase 3 output folder (contains host_pages.csv)
    phase4_folder : str
        Path to Phase 4.1/4.2 output folder (contains embeddings and similarities)
    output_folder : str
        Directory where Phase 4.3 deliverables will be saved
    weight_relevance : float
        Weight for relevance score (default: 0.55)
    weight_authority : float
        Weight for authority score (default: 0.25)
    weight_crawl : float
        Weight for crawl frequency (default: 0.10)
    weight_urgency : float
        Weight for urgency score (default: 0.10)
    max_rows_per_file : int
        Maximum rows per CSV file for splitting (default: 500000)
    
    Returns:
    --------
    dict
        {
            'status': 'success' or 'error',
            'total_recommendations': total link recommendations,
            'avg_score': average final score,
            'deliverables': list of generated files,
            'output_folder': path to output folder
        }
    """
    
    import pandas as pd
    import numpy as np
    import json
    import os
    from datetime import datetime
    from tqdm import tqdm
    import warnings
    warnings.filterwarnings('ignore')
    
    print("=" * 80)
    print("Phase 4.3: Scoring, Ranking & Link Recommendations")
    print("=" * 80)
    
    # Create output folder
    os.makedirs(output_folder, exist_ok=True)
    
    # Validate weights
    total_weight = weight_relevance + weight_authority + weight_crawl + weight_urgency
    if abs(total_weight - 1.0) > 0.01:
        print(f"‚ö†Ô∏è  Warning: Weights sum to {total_weight:.2f}, normalizing to 1.0...")
        norm_factor = 1.0 / total_weight
        weight_relevance *= norm_factor
        weight_authority *= norm_factor
        weight_crawl *= norm_factor
        weight_urgency *= norm_factor
    
    print(f"\n‚úì Configuration:")
    print(f"  Phase 3 folder: {phase3_folder}")
    print(f"  Phase 4 folder: {phase4_folder}")
    print(f"  Output folder: {output_folder}")
    print(f"  Weights: R={weight_relevance:.2f}, A={weight_authority:.2f}, C={weight_crawl:.2f}, U={weight_urgency:.2f}")
    print(f"  Max rows per file: {max_rows_per_file:,}")
    
    # Check required files
    required_files = {
        'similarities': os.path.join(phase4_folder, 'top_k_similarities.csv'),
        'stale_pages': os.path.join(phase4_folder, 'stale_pages_combined.csv'),
        'host_pages': os.path.join(phase3_folder, 'host_pages.csv')
    }
    
    for name, path in required_files.items():
        if not os.path.exists(path):
            return {
                'status': 'error',
                'error': f'{name} not found: {path}'
            }
    
    # ========================================
    # Step 1: Load Data
    # ========================================
    print("\n[1/6] Loading data...")
    
    try:
        similarities = pd.read_csv(required_files['similarities'])
        print(f"‚úì Loaded {len(similarities):,} similarity pairs")
        
        host_pages = pd.read_csv(required_files['host_pages'])
        print(f"‚úì Loaded {len(host_pages):,} host pages")
        
        stale_pages = pd.read_csv(required_files['stale_pages'])
        print(f"‚úì Loaded {len(stale_pages):,} stale pages")
        
        # Add index columns
        host_pages['host_idx'] = range(len(host_pages))
        stale_pages['stale_idx'] = range(len(stale_pages))
        
    except Exception as e:
        return {
            'status': 'error',
            'error': f'Failed to load data: {str(e)}'
        }
    
    # ========================================
    # Step 2: Normalize Scores
    # ========================================
    print("\n[2/6] Normalizing authority and crawl metrics...")
    
    try:
        host_pages['authority_norm'] = host_pages['crawl_count'] / host_pages['crawl_count'].max()
        host_pages['crawl_norm'] = host_pages['crawl_frequency'] / host_pages['crawl_frequency'].max()
        
        print(f"‚úì Authority range: [{host_pages['authority_norm'].min():.4f}, {host_pages['authority_norm'].max():.4f}]")
        print(f"‚úì Crawl frequency range: [{host_pages['crawl_norm'].min():.4f}, {host_pages['crawl_norm'].max():.4f}]")
    except Exception as e:
        return {
            'status': 'error',
            'error': f'Failed to normalize scores: {str(e)}'
        }
    
    # ========================================
    # Step 3: Calculate Urgency Score
    # ========================================
    print("\n[3/6] Calculating urgency scores...")
    
    def calculate_urgency(row):
        """Calculate urgency (0-1) based on status_code, page_type, days_since_modified"""
        urgency = 0.0
        
        status = row.get('status_code', 0)
        if status in [404, 410]:
            urgency += 0.4
        elif status in [301, 302, 500, 502, 503]:
            urgency += 0.3
        elif status == 200:
            urgency += 0.05
        elif status == 0:
            urgency += 0.35
        
        if row.get('page_type') == 'Orphan':
            urgency += 0.4
        elif row.get('crawl_count', 0) <= 1:
            urgency += 0.2
        
        days = row.get('days_since_modified', 0)
        if days > 730:
            urgency += 0.2
        elif days > 365:
            urgency += 0.15
        elif days > 180:
            urgency += 0.1
        else:
            urgency += 0.05
        
        return min(urgency, 1.0)
    
    try:
        stale_pages['urgency_score'] = stale_pages.apply(calculate_urgency, axis=1)
        print(f"‚úì Urgency range: [{stale_pages['urgency_score'].min():.4f}, {stale_pages['urgency_score'].max():.4f}]")
    except Exception as e:
        return {
            'status': 'error',
            'error': f'Failed to calculate urgency: {str(e)}'
        }
    
    # ========================================
    # Step 4: Merge & Score
    # ========================================
    print("\n[4/6] Computing final scores...")
    
    try:
        print("  ‚îú‚îÄ Merging with host pages...")
        results = similarities.merge(
            host_pages[['host_idx', 'url', 'authority_norm', 'crawl_norm', 'status_code']],
            left_on='host_idx',
            right_on='host_idx',
            how='left'
        )
        results = results.rename(columns={
            'url': 'host_url',
            'status_code': 'host_status_code'
        })
        
        print("  ‚îú‚îÄ Merging with stale pages...")
        results = results.merge(
            stale_pages[['stale_idx', 'url', 'urgency_score', 'page_type', 'status_code', 'days_since_modified']],
            left_on='stale_idx',
            right_on='stale_idx',
            how='left'
        )
        results = results.rename(columns={
            'url': 'stale_url',
            'status_code': 'stale_status_code'
        })
        
        print("  ‚îî‚îÄ Calculating scores...")
        
        results['relevance_score'] = results['similarity_score']
        results['authority_score'] = results['authority_norm']
        results['crawl_weight_score'] = results['crawl_norm']
        
        results['final_score'] = (
            results['relevance_score'] * weight_relevance +
            results['authority_score'] * weight_authority +
            results['crawl_weight_score'] * weight_crawl +
            results['urgency_score'] * weight_urgency
        ) * 100
        
        results['priority_score'] = (
            results['relevance_score'] * 0.70 +
            results['authority_score'] * 0.30
        ) * 100
        
        print(f"‚úì Final score range: [{results['final_score'].min():.2f}, {results['final_score'].max():.2f}]")
        
    except Exception as e:
        return {
            'status': 'error',
            'error': f'Failed to compute scores: {str(e)}'
        }
    
    # ========================================
    # Step 5: Assign Placement Zones
    # ========================================
    print("\n[5/6] Assigning placement zones...")
    
    def assign_placement_zone(relevance):
        if relevance >= 0.8:
            return 'content_body'
        elif relevance >= 0.6:
            return 'sidebar'
        else:
            return 'footer'
    
    results['placement_zone'] = results['relevance_score'].apply(assign_placement_zone)
    
    print("  Placement zone distribution:")
    print(results['placement_zone'].value_counts())
    
    # ========================================
    # Step 6: Generate Anchor Text
    # ========================================
    print("\n[6/6] Generating anchor text...")
    
    def extract_anchor_text(url):
        try:
            from urllib.parse import urlparse
            path = urlparse(url).path.strip('/')
            path = path.replace('.html', '').replace('.php', '')
            text = path.replace('/', ' ').replace('-', ' ').replace('_', ' ')
            text = ' '.join(text.split())
            return text[:100] if text else 'related content'
        except:
            return 'related content'
    
    results['anchor_text'] = results['stale_url'].apply(extract_anchor_text)
    
    # ========================================
    # Save Deliverables
    # ========================================
    deliverables = []
    
    # DELIVERABLE 1: Stale Page Reasons
    print("\nüíæ Saving Deliverable 1: stale_page_reasons.csv...")
    
    try:
        def categorize_reason(row):
            if row['status_code'] in [404, 410]:
                return 'GHOST'
            elif row['page_type'] == 'Orphan':
                return 'ORPHAN'
            elif row['days_since_modified'] > 90:
                return 'DECAY'
            else:
                return 'LOW_ACTIVITY'
        
        stale_reasons = stale_pages.copy()
        stale_reasons['reason_category'] = stale_reasons.apply(categorize_reason, axis=1)
        stale_reasons['days_since_crawl'] = stale_reasons['days_since_modified']
        stale_reasons['action_taken'] = 'Linked'
        
        link_counts = results.groupby('stale_idx').size().reset_index(name='recommended_links_count')
        stale_reasons = stale_reasons.merge(link_counts, on='stale_idx', how='left')
        stale_reasons['recommended_links_count'] = stale_reasons['recommended_links_count'].fillna(0).astype(int)
        
        stale_reasons = stale_reasons.rename(columns={'url': 'stale_url', 'status_code': 'stale_status_code'})
        
        stale_reasons_output = stale_reasons[['stale_url', 'stale_status_code', 'days_since_crawl', 
                                              'days_since_modified', 'reason_category', 'action_taken', 
                                              'recommended_links_count']]
        
        if len(stale_reasons_output) <= max_rows_per_file:
            output_path = os.path.join(output_folder, 'stale_page_reasons.csv')
            stale_reasons_output.to_csv(output_path, index=False)
            deliverables.append('stale_page_reasons.csv')
            print(f"‚úì Saved: stale_page_reasons.csv ({len(stale_reasons_output):,} rows)")
        else:
            num_parts = (len(stale_reasons_output) + max_rows_per_file - 1) // max_rows_per_file
            for i in range(num_parts):
                start_idx = i * max_rows_per_file
                end_idx = min(start_idx + max_rows_per_file, len(stale_reasons_output))
                part_df = stale_reasons_output.iloc[start_idx:end_idx]
                output_path = os.path.join(output_folder, f'stale_page_reasons_part{i+1}.csv')
                part_df.to_csv(output_path, index=False)
                deliverables.append(f'stale_page_reasons_part{i+1}.csv')
            print(f"‚úì Saved {num_parts} files: stale_page_reasons_part*.csv")
    
    except Exception as e:
        print(f"‚ö†Ô∏è Warning: Failed to save stale_page_reasons: {str(e)}")
    
    # DELIVERABLE 2: Master Link Graph
    print("\nüíæ Saving Deliverable 2: master_link_graph (CSV + Parquet)...")
    
    try:
        master_cols = [
            'host_url', 'stale_url', 'anchor_text', 'placement_zone',
            'final_score', 'priority_score', 'relevance_score', 'authority_score',
            'stale_status_code', 'host_status_code'
        ]
        master_graph = results[master_cols].sort_values('final_score', ascending=False)
        
        if len(master_graph) <= max_rows_per_file:
            csv_path = os.path.join(output_folder, 'master_link_graph.csv')
            master_graph.to_csv(csv_path, index=False)
            deliverables.append('master_link_graph.csv')
            print(f"‚úì Saved: master_link_graph.csv ({len(master_graph):,} rows)")
        else:
            num_parts = (len(master_graph) + max_rows_per_file - 1) // max_rows_per_file
            for i in range(num_parts):
                start_idx = i * max_rows_per_file
                end_idx = min(start_idx + max_rows_per_file, len(master_graph))
                part_df = master_graph.iloc[start_idx:end_idx]
                csv_path = os.path.join(output_folder, f'master_link_graph_part{i+1}.csv')
                part_df.to_csv(csv_path, index=False)
                deliverables.append(f'master_link_graph_part{i+1}.csv')
            print(f"‚úì Saved {num_parts} files: master_link_graph_part*.csv")
        
        parquet_path = os.path.join(output_folder, 'master_link_graph.parquet')
        master_graph.to_parquet(parquet_path, index=False, compression='snappy')
        parquet_size = os.path.getsize(parquet_path) / 1024**2
        deliverables.append('master_link_graph.parquet')
        print(f"‚úì Saved: master_link_graph.parquet ({parquet_size:.2f} MB)")
    
    except Exception as e:
        print(f"‚ö†Ô∏è Warning: Failed to save master_link_graph: {str(e)}")
    
    # DELIVERABLE 3: Link Injection Manifest
    print("\nüíæ Saving Deliverable 3: link_injection_manifest.json...")
    
    try:
        manifest_data = {
            "metadata": {
                "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "total_host_pages": int(len(host_pages)),
                "total_stale_pages": int(len(stale_pages)),
                "total_links": int(len(results)),
                "weights": {
                    "relevance": float(weight_relevance),
                    "authority": float(weight_authority),
                    "crawl": float(weight_crawl),
                    "urgency": float(weight_urgency)
                }
            },
            "host_pages": []
        }
        
        print("  Grouping links by host page...")
        grouped = results.groupby('host_url')
        
        stale_lookup = stale_pages.set_index('url')[['status_code', 'page_type', 'days_since_modified']].to_dict('index')
        
        for host_url, group in tqdm(grouped, desc="  Building manifest"):
            host_row = host_pages[host_pages['url'] == host_url].iloc[0]
            
            host_entry = {
                "host_url": host_url,
                "host_crawl_count": int(host_row['crawl_count']),
                "host_status_code": int(host_row['status_code']),
                "links": []
            }
            
            for _, link in group.nlargest(10, 'final_score').iterrows():
                stale_info = stale_lookup.get(link['stale_url'], {})
                
                status = stale_info.get('status_code', 0)
                page_type = stale_info.get('page_type', 'Unknown')
                days = stale_info.get('days_since_modified', 0)
                
                if status in [404, 410]:
                    reason = 'GHOST'
                elif page_type == 'Orphan':
                    reason = 'ORPHAN'
                elif days > 90:
                    reason = 'DECAY'
                else:
                    reason = 'LOW_ACTIVITY'
                
                host_entry["links"].append({
                    "target_url": link['stale_url'],
                    "anchor_text": link['anchor_text'],
                    "placement_zone": link['placement_zone'],
                    "final_score": round(float(link['final_score']), 2),
                    "relevance": round(float(link['relevance_score']), 4),
                    "stale_reason": reason,
                    "stale_status_code": int(link['stale_status_code'])
                })
            
            manifest_data["host_pages"].append(host_entry)
        
        manifest_path = os.path.join(output_folder, 'link_injection_manifest.json')
        with open(manifest_path, 'w') as f:
            json.dump(manifest_data, f, indent=2)
        
        manifest_size = os.path.getsize(manifest_path) / 1024**2
        deliverables.append('link_injection_manifest.json')
        print(f"‚úì Saved: link_injection_manifest.json ({manifest_size:.2f} MB)")
    
    except Exception as e:
        print(f"‚ö†Ô∏è Warning: Failed to save manifest: {str(e)}")
    
    # DELIVERABLE 4: Scoring Metadata
    print("\nüíæ Saving Deliverable 4: scoring_metadata.json...")
    
    try:
        metadata = {
            "weights": {
                "relevance": float(weight_relevance),
                "authority": float(weight_authority),
                "crawl": float(weight_crawl),
                "urgency": float(weight_urgency)
            },
            "urgency_components": {
                "status_code_weight": 0.4,
                "page_type_weight": 0.4,
                "crawl_decay_weight": 0.2
            },
            "statistics": {
                "total_recommendations": int(len(results)),
                "avg_final_score": float(results['final_score'].mean()),
                "high_confidence_links": int((results['final_score'] > 70).sum()),
                "placement_zones": {
                    "content_body": int((results['placement_zone'] == 'content_body').sum()),
                    "sidebar": int((results['placement_zone'] == 'sidebar').sum()),
                    "footer": int((results['placement_zone'] == 'footer').sum())
                }
            },
            "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        
        metadata_path = os.path.join(output_folder, 'scoring_metadata.json')
        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)
        
        deliverables.append('scoring_metadata.json')
        print(f"‚úì Saved: scoring_metadata.json")
    
    except Exception as e:
        print(f"‚ö†Ô∏è Warning: Failed to save metadata: {str(e)}")
    
    # Summary
    print("\n" + "=" * 80)
    print("Phase 4.3 Complete - All Deliverables Generated")
    print("=" * 80)
    print(f"\nüìä Summary Statistics:")
    print(f"  Total link recommendations: {len(results):,}")
    print(f"  Average final score: {results['final_score'].mean():.2f}")
    print(f"  High confidence links (>70): {(results['final_score'] > 70).sum():,}")
    print(f"\nüìÅ Output Location: {output_folder}")
    print("\n‚úÖ All deliverables ready for deployment!")
    print("=" * 80)
    
    return {
        'status': 'success',
        'total_recommendations': int(len(results)),
        'avg_score': float(results['final_score'].mean()),
        'high_confidence': int((results['final_score'] > 70).sum()),
        'deliverables': deliverables,
        'output_folder': os.path.abspath(output_folder)
    }


In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import pandas as pd
import numpy as np
import os

# ============================================
# Global State
# ============================================
# Mutable state shared by the phase handlers below: Phase 1 stores the log file
# names it finds, Phases 3.2 / 3.4 store the frames returned by their wrappers,
# and Phase 4.1 flips the embeddings flag.
global_state = dict(
    log_files=[],
    host_pages_df=None,
    stale_pages_df=None,
    embeddings_ready=False,
)

# One Output widget per phase, so every handler prints into its own area of
# the dashboard instead of the cell's shared output.
(
    phase1_output,
    phase2_output,
    phase3_2_output,
    phase3_4_output,
    phase4_1_output,
    phase4_2_output,
    phase4_3_output,
) = (widgets.Output() for _ in range(7))

# ============================================
# Configuration Widgets
# ============================================
def _folder_field(default_path, label):
    """Build a 550px-wide Text widget with the 140px description column."""
    return widgets.Text(
        value=default_path,
        description=label,
        style={'description_width': '140px'},
        layout=widgets.Layout(width='550px')
    )


# Heading shown at the top of the configuration column.
config_title = widgets.HTML(value="<h2>‚öôÔ∏è Project Configuration</h2>")

# Folder that Phase 1 scans for log files and Phase 2 reads from.
log_folder_path = _folder_field(
    r'D:\Task-3 complete\pre-process files phase 1',
    'Log Folder:',
)

# Default sitemap index URLs, one per line in the textarea.
_DEFAULT_SITEMAP_URLS = (
    'https://www.alamy.com/sitemaps/image_daily_index_s_100000001_110000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_10000001_20000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_110000001_120000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_120000001_130000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_130000001_140000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_140000001_150000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_150000001_160000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_160000001_170000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_170000001_180000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_180000001_190000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_200000001_210000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_210000001_220000000.xml',
)

sitemap_url = widgets.Textarea(
    value='\n'.join(_DEFAULT_SITEMAP_URLS),
    description='Sitemap URLs:',
    placeholder='Enter sitemap URLs (one per line or comma-separated)',
    style={'description_width': '140px'},
    layout=widgets.Layout(width='550px', height='120px')
)

# Output of Phase 2 and working folder (input and output) of Phases 3.2 / 3.4.
phase3_output_folder = _folder_field(
    r'D:\Task-3 complete\phase 3 host&stale pages output',
    'Phase 3 Output:',
)

# Working folder for Phases 4.1 and 4.2 (embeddings and similarities).
phase4_output_folder = _folder_field(
    r'D:\Task-3 complete\Phase 4 output_folder analysis',
    'Phase 4.1-4.2 Output:',
)

# Destination folder for the Phase 4.3 deliverables; kept separate from the
# Phase 4.1-4.2 working folder configured above.
phase4_3_output_folder = widgets.Text(
    # Backslashes are escaped like the other folder defaults. The earlier
    # single-backslash literal only worked because \T and \p are unrecognised
    # escape sequences, which newer Python versions warn about (SyntaxWarning)
    # and plan to reject. The resulting string value is unchanged.
    value='D:\\Task-3 complete\\phase4_final 2',
    description='Phase 4.3 Output:',
    style={'description_width': '140px'},
    layout=widgets.Layout(width='550px')
)

def _int_field(default, label):
    """Build an IntText with the 140px description column used by this form."""
    return widgets.IntText(
        value=default,
        description=label,
        style={'description_width': '140px'}
    )


def _weight_slider(default, label):
    """Build a 0.0-1.0 FloatSlider (step 0.05) for one scoring weight."""
    return widgets.FloatSlider(
        value=default,
        min=0.0,
        max=1.0,
        step=0.05,
        description=label,
        style={'description_width': '140px'}
    )


# --- Phase 3.2 ---
# Passed to execute_phase3_2 as `crawl_threshold`.
crawl_days_threshold = _int_field(10, 'Crawl Freq (days):')

# --- Phase 3.4 ---
# Passed to execute_phase3_4 as `stale_threshold`.
stale_days_threshold = _int_field(30, 'Stale Threshold:')

# Passed to execute_phase3_4 as `percentile`; selectable between 5 and 20.
low_activity_percentile = widgets.FloatSlider(
    value=10.0,
    min=5.0,
    max=20.0,
    step=1.0,
    description='Low Activity %:',
    style={'description_width': '140px'}
)

# --- Phase 4 ---
# Embedding model name handed to execute_phase4_1 as `model_name`.
model_choice = widgets.Dropdown(
    options=['all-MiniLM-L6-v2', 'all-mpnet-base-v2'],
    value='all-MiniLM-L6-v2',
    description='Model:',
    style={'description_width': '140px'}
)

batch_size_embeddings = _int_field(32, 'Embedding Batch:')
top_k = _int_field(10, 'Top-K:')
batch_size_similarity = _int_field(1000, 'Similarity Batch:')

# Scoring weights consumed by run_phase4_3, which rescales them when their sum
# is not within 0.01 of 1.0.
weight_relevance = _weight_slider(0.60, 'W Relevance:')
weight_authority = _weight_slider(0.25, 'W Authority:')
weight_urgency = _weight_slider(0.15, 'W Urgency:')

# ============================================
# Phase Buttons
# ============================================
def _phase_button(label, style, icon_name):
    """Build one 350x45 pipeline button with the given caption, colour style and icon."""
    return widgets.Button(
        description=label,
        button_style=style,
        icon=icon_name,
        layout=widgets.Layout(width='350px', height='45px')
    )


# NOTE(review): every button is created enabled, so the `disabled = False`
# assignments in the handlers do not currently gate anything - confirm whether
# the later phases were meant to start disabled.
btn_phase1 = _phase_button("Phase 1: Data Loading", 'info', 'database')
btn_phase2 = _phase_button("Phase 2: Log Verification", 'success', 'check')
btn_phase3_2 = _phase_button("Phase 3.2: Host Page Counting", 'warning', 'chart-bar')
btn_phase3_4 = _phase_button("Phase 3.4: Stale Page Finder", 'warning', 'exclamation-triangle')
btn_phase4_1 = _phase_button("Phase 4.1: Generate Embeddings", 'primary', 'brain')
btn_phase4_2 = _phase_button("Phase 4.2: Compute Similarities", 'primary', 'calculator')
btn_phase4_3 = _phase_button("Phase 4.3: Score & Rank Links", 'primary', 'trophy')

# ============================================
# Event Handlers - CALL YOUR FUNCTIONS
# ============================================

def run_phase1(b):
    """Click handler for Phase 1: check that the configured data sources exist.

    Lists the log folder, keeps the names ending in .csv, .log or .log.gz in
    ``global_state['log_files']``, echoes the first sitemap URL when the
    textarea content starts with ``http``, and enables the Phase 2 button.
    All messages are printed into ``phase1_output``.

    Args:
        b: The clicked button supplied by ``on_click``; not used.
    """
    log_suffixes = ('.csv', '.log', '.log.gz')

    with phase1_output:
        clear_output(wait=True)
        print("üîÑ Phase 1: Validating data sources...")

        try:
            folder = log_folder_path.value
            if os.path.exists(folder):
                matching = [name for name in os.listdir(folder) if name.endswith(log_suffixes)]
                global_state['log_files'] = matching
                print(f"‚úÖ Found {len(matching)} log files")

                sitemap_text = sitemap_url.value
                if sitemap_text.startswith('http'):
                    # Only the first whitespace-separated URL is shown.
                    print(f"‚úÖ Sitemap URL: {sitemap_text.split()[0]}")

                print("‚úÖ Phase 1 Complete")
                btn_phase2.disabled = False
            else:
                print(f"‚ùå Log folder not found: {folder}")

        except Exception as exc:
            print(f"‚ùå Error: {str(exc)}")


def run_phase2(b):
    """Click handler for Phase 2: verify and process the log files.

    Calls ``execute_phase2`` (expected to be defined elsewhere in the
    notebook) with the log folder as input and the Phase 3 folder as output,
    then prints a summary into ``phase2_output``. On success the Phase 3.2
    button is enabled.

    Args:
        b: The clicked button supplied by ``on_click``; not used.
    """
    with phase2_output:
        clear_output(wait=True)
        print("üîÑ Phase 2: Verifying and processing log files...")
        
        try:
            results = execute_phase2(
                input_folder=log_folder_path.value,
                output_folder=phase3_output_folder.value
            )
            
            if results and results.get('status') == 'success':
                print(f"\n‚úÖ Phase 2 Complete!")
                print(f"   Final records: {results.get('final_records', 0):,}")
                print(f"   Output: {results.get('output_folder', '')}")
                btn_phase3_2.disabled = False
            else:
                # `results` may be None or empty here; fall back to an empty
                # dict so the failure is reported instead of an AttributeError.
                error_msg = (results or {}).get('error', 'Unknown error')
                print(f"‚ùå Phase 2 failed: {error_msg}")
            
        except Exception as e:
            print(f"‚ùå Error: {str(e)}")


def run_phase3_2(b):
    """Click handler for Phase 3.2: count host pages.

    Calls ``execute_phase3_2`` (expected to be defined elsewhere in the
    notebook) with the Phase 3 folder as both input and output and the
    configured crawl threshold. On success the returned ``'dataframe'`` entry
    is stored in ``global_state['host_pages_df']`` and the Phase 3.4 button is
    enabled. Output goes to ``phase3_2_output``.

    Args:
        b: The clicked button supplied by ``on_click``; not used.
    """
    with phase3_2_output:
        clear_output(wait=True)
        print("üîÑ Phase 3.2: Counting host pages...")
        print(f"   Crawl frequency threshold: {crawl_days_threshold.value} days")
        
        try:
            results = execute_phase3_2(
                input_folder=phase3_output_folder.value,
                output_folder=phase3_output_folder.value,
                crawl_threshold=crawl_days_threshold.value
            )
            
            if results and results.get('status') == 'success':
                host_count = results.get('host_pages', 0)
                print(f"‚úÖ Phase 3.2 Complete!")
                print(f"   Host pages identified: {host_count:,}")
                print(f"   Output: {results.get('output_file', '')}")
                global_state['host_pages_df'] = results.get('dataframe')
                btn_phase3_4.disabled = False
            else:
                # `results` may be None or empty here; fall back to an empty
                # dict so the failure is reported instead of an AttributeError.
                error_msg = (results or {}).get('error', 'Unknown error')
                print(f"‚ùå Phase 3.2 failed: {error_msg}")
            
        except Exception as e:
            print(f"‚ùå Error: {str(e)}")
            

def run_phase3_4(b):
    """Click handler for Phase 3.4: find stale pages ("Party Bouncer").

    Echoes the effective settings, then calls ``execute_phase3_4`` (expected
    to be defined elsewhere in the notebook) with the Phase 3 folder as both
    input and output, the raw sitemap textarea content, the stale threshold
    and the low-activity percentile. ``workers``, ``log_batch_size`` and
    ``test_mode`` are fixed at 10, 5 and False. On success the returned
    ``'dataframe'`` entry is stored in ``global_state['stale_pages_df']`` and
    the Phase 4.1 button is enabled. Output goes to ``phase3_4_output``.

    Args:
        b: The clicked button supplied by ``on_click``; not used.
    """
    with phase3_4_output:
        clear_output(wait=True)
        print("üîÑ Phase 3.4: Detecting stale pages (Party Bouncer)...")
        print(f"   Input: {phase3_output_folder.value}")
        print(f"   Output: {phase3_output_folder.value}")
        # Counts whitespace-separated entries in the textarea.
        print(f"   Sitemap URLs: {len(sitemap_url.value.strip().split())} indexes")
        print(f"   Stale threshold: {stale_days_threshold.value} days")
        print(f"   Low activity percentile: {low_activity_percentile.value}%")
        
        try:
            results = execute_phase3_4(
                input_folder=phase3_output_folder.value,
                output_folder=phase3_output_folder.value,
                sitemap_urls=sitemap_url.value,
                stale_threshold=stale_days_threshold.value,
                percentile=low_activity_percentile.value,
                workers=10,
                log_batch_size=5,
                test_mode=False
            )
            
            if results and results.get('status') == 'success':
                print(f"\nüìä Stale Page Summary:")
                print(f"   Orphans: {results.get('orphans', 0):,}")
                print(f"   Low Activity: {results.get('low_activity', 0):,}")
                print(f"   Total Stale Pages: {results.get('total_stale', 0):,}")
                print(f"   Output files: {len(results.get('output_files', []))}")
                print(f"\n‚úÖ Phase 3.4 Complete!")
                
                global_state['stale_pages_df'] = results.get('dataframe')
                btn_phase4_1.disabled = False
            else:
                # `results` may be None or empty here; fall back to an empty
                # dict so the failure is reported instead of an AttributeError.
                error_msg = (results or {}).get('error', 'Unknown error')
                print(f"‚ùå Phase 3.4 failed: {error_msg}")
            
        except Exception as e:
            print(f"‚ùå Error: {str(e)}")
            import traceback
            traceback.print_exc()


def run_phase4_1(b):
    """Click handler for Phase 4.1: generate embeddings.

    Calls ``execute_phase4_1`` (expected to be defined elsewhere in the
    notebook) with the Phase 3 folder as input, the Phase 4 folder as output,
    and the selected model name and batch size. On success
    ``global_state['embeddings_ready']`` is set to True and the Phase 4.2
    button is enabled. Output goes to ``phase4_1_output``.

    Args:
        b: The clicked button supplied by ``on_click``; not used.
    """
    with phase4_1_output:
        clear_output(wait=True)
        print("üîÑ Phase 4.1: Generating embeddings...")
        print(f"   Input: {phase3_output_folder.value}")
        print(f"   Output: {phase4_output_folder.value}")
        print(f"   Model: {model_choice.value}")
        print(f"   Batch size: {batch_size_embeddings.value}")
        
        try:
            results = execute_phase4_1(
                input_folder=phase3_output_folder.value,
                output_folder=phase4_output_folder.value,
                model_name=model_choice.value,
                batch_size=batch_size_embeddings.value
            )
            
            if results and results.get('status') == 'success':
                print(f"\n‚úÖ Phase 4.1 Complete!")
                print(f"   Host embeddings: {results.get('host_count', 0):,}")
                print(f"   Stale embeddings: {results.get('stale_count', 0):,}")
                print(f"   Embedding dimensions: {results.get('embedding_dimensions', 0)}")
                print(f"   Output: {results.get('output_folder', '')}")
                
                global_state['embeddings_ready'] = True
                btn_phase4_2.disabled = False
            else:
                # `results` may be None or empty here; fall back to an empty
                # dict so the failure is reported instead of an AttributeError.
                error_msg = (results or {}).get('error', 'Unknown error')
                print(f"‚ùå Phase 4.1 failed: {error_msg}")
            
        except Exception as e:
            print(f"‚ùå Error: {str(e)}")
            import traceback
            traceback.print_exc()


def run_phase4_2(b):
    """Click handler for Phase 4.2: compute similarities.

    Calls ``execute_phase4_2`` (expected to be defined elsewhere in the
    notebook) with the Phase 4 folder as both input and output, plus the
    configured Top-K and batch size. On success a summary is printed and the
    Phase 4.3 button is enabled. Output goes to ``phase4_2_output``.

    Args:
        b: The clicked button supplied by ``on_click``; not used.
    """
    with phase4_2_output:
        clear_output(wait=True)
        print("üîÑ Phase 4.2: Computing similarities...")
        print(f"   Input: {phase4_output_folder.value}")
        print(f"   Output: {phase4_output_folder.value}")
        print(f"   Top-K: {top_k.value}")
        print(f"   Batch size: {batch_size_similarity.value}")
        
        try:
            results = execute_phase4_2(
                input_folder=phase4_output_folder.value,
                output_folder=phase4_output_folder.value,
                top_k=top_k.value,
                batch_size=batch_size_similarity.value
            )
            
            if results and results.get('status') == 'success':
                print(f"\n‚úÖ Phase 4.2 Complete!")
                print(f"   Similarity pairs: {results.get('similarity_pairs', 0):,}")
                print(f"   Average similarity: {results.get('avg_similarity', 0):.4f}")
                
                sim_range = results.get('similarity_range', {})
                print(f"   Similarity range: [{sim_range.get('min', 0):.4f}, {sim_range.get('max', 0):.4f}]")
                print(f"   Output: {results.get('output_file', '')}")
                
                btn_phase4_3.disabled = False
            else:
                # `results` may be None or empty here; fall back to an empty
                # dict so the failure is reported instead of an AttributeError.
                error_msg = (results or {}).get('error', 'Unknown error')
                print(f"‚ùå Phase 4.2 failed: {error_msg}")
            
        except Exception as e:
            print(f"‚ùå Error: {str(e)}")
            import traceback
            traceback.print_exc()


def run_phase4_3(b):
    """Click handler for Phase 4.3: score, rank and write the deliverables.

    Reads the three weight sliders, rescales them to sum to 1.0 when their
    sum is more than 0.01 away from 1.0, and derives the crawl weight as the
    remainder. It then calls ``execute_phase4_3`` (expected to be defined
    elsewhere in the notebook) with the Phase 3 folder, the Phase 4 folder and
    the separate Phase 4.3 output folder, and prints the returned summary into
    ``phase4_3_output``.

    If all three sliders are zero the weights cannot be normalised; an error
    is printed and the phase is not run.

    Args:
        b: The clicked button supplied by ``on_click``; not used.
    """
    with phase4_3_output:
        clear_output(wait=True)
        print("üîÑ Phase 4.3: Scoring and ranking...")
        print(f"   Input: Phase 3 ({phase3_output_folder.value})")
        print(f"          Phase 4 ({phase4_output_folder.value})")
        print(f"   Output: {phase4_3_output_folder.value}")
        
        # Validate and normalize weights
        total = weight_relevance.value + weight_authority.value + weight_urgency.value
        if total <= 0:
            # Normalising would divide by zero; stop before calling the phase.
            print("‚ùå Error: Relevance, authority and urgency weights must sum to a positive value")
            return
        
        if abs(total - 1.0) > 0.01:
            print(f"‚ö†Ô∏è  Warning: Weights sum to {total:.2f}, normalizing...")
            norm_factor = 1.0 / total
            w_rel = weight_relevance.value * norm_factor
            w_auth = weight_authority.value * norm_factor
            w_urg = weight_urgency.value * norm_factor
        else:
            w_rel = weight_relevance.value
            w_auth = weight_authority.value
            w_urg = weight_urgency.value
        
        # The crawl weight is whatever is left after the three explicit
        # weights (at most about 0.01 given the tolerance above). Clamp at 0
        # so floating-point residue never yields a negative weight or "-0.00".
        w_crawl = max(0.0, 1.0 - (w_rel + w_auth + w_urg))
        
        print(f"   Weights: Relevance={w_rel:.2f}, Authority={w_auth:.2f}, Crawl={w_crawl:.2f}, Urgency={w_urg:.2f}")
        
        try:
            # Call Phase 4.3 wrapper function
            results = execute_phase4_3(
                phase3_folder=phase3_output_folder.value,
                phase4_folder=phase4_output_folder.value,
                output_folder=phase4_3_output_folder.value,  # deliverables go to their own folder
                weight_relevance=w_rel,
                weight_authority=w_auth,
                weight_crawl=w_crawl,
                weight_urgency=w_urg
            )
            
            if results and results.get('status') == 'success':
                print(f"\nüìä Deliverables Generated:")
                for deliverable in results.get('deliverables', []):
                    print(f"   ‚úì {deliverable}")
                
                print(f"\n‚úÖ Phase 4.3 Complete!")
                print(f"   Total recommendations: {results.get('total_recommendations', 0):,}")
                print(f"   Average score: {results.get('avg_score', 0):.2f}")
                print(f"   High confidence (>70): {results.get('high_confidence', 0):,}")
                print(f"   Output: {results.get('output_folder', '')}")
            else:
                # `results` may be None or empty here; fall back to an empty
                # dict so the failure is reported instead of an AttributeError.
                error_msg = (results or {}).get('error', 'Unknown error')
                print(f"‚ùå Phase 4.3 failed: {error_msg}")
            
        except Exception as e:
            print(f"‚ùå Error: {str(e)}")
            import traceback
            traceback.print_exc()


# ============================================
# Attach Event Handlers
# ============================================
# Register each phase handler on its button, in pipeline order.
for _button, _handler in (
    (btn_phase1, run_phase1),
    (btn_phase2, run_phase2),
    (btn_phase3_2, run_phase3_2),
    (btn_phase3_4, run_phase3_4),
    (btn_phase4_1, run_phase4_1),
    (btn_phase4_2, run_phase4_2),
    (btn_phase4_3, run_phase4_3),
):
    _button.on_click(_handler)

# ============================================
# Dashboard Layout
# ============================================
# Left column: every configuration widget, grouped under small headings.
_config_children = [
    config_title,
    widgets.HTML("<h4>üìÇ Data Sources</h4>"),
    log_folder_path,
    sitemap_url,
    widgets.HTML("<h4>üìÅ Output Folders</h4>"),
    phase3_output_folder,
    phase4_output_folder,
    phase4_3_output_folder,
    widgets.HTML("<h4>Phase 3.2 Config</h4>"),
    crawl_days_threshold,
    widgets.HTML("<h4>Phase 3.4 Config</h4>"),
    stale_days_threshold,
    low_activity_percentile,
    widgets.HTML("<h4>Phase 4 Config</h4>"),
    model_choice,
    batch_size_embeddings,
    top_k,
    batch_size_similarity,
    weight_relevance,
    weight_authority,
    weight_urgency,
]
config_section = widgets.VBox(
    _config_children,
    layout=widgets.Layout(padding='15px', width='600px'),
)

# Right column: each phase button followed directly by its output area.
_phase_children = [
    widgets.HTML("<h1>üîó Internal Linking Analysis Pipeline</h1>"),

    widgets.HTML("<h3>Phase 1-2: Data Preparation</h3>"),
    btn_phase1, phase1_output,
    btn_phase2, phase2_output,

    widgets.HTML("<h3>Phase 3: Host & Stale Page Identification</h3>"),
    btn_phase3_2, phase3_2_output,
    btn_phase3_4, phase3_4_output,

    widgets.HTML("<h3>Phase 4: Link Strategy Generation</h3>"),
    btn_phase4_1, phase4_1_output,
    btn_phase4_2, phase4_2_output,
    btn_phase4_3, phase4_3_output,
]
phases_section = widgets.VBox(
    _phase_children,
    layout=widgets.Layout(padding='15px', width='750px'),
)

# Configuration and pipeline side by side.
dashboard = widgets.HBox([config_section, phases_section])
display(dashboard)
