In [1]:
import subprocess
import os
import shutil
from pathlib import Path

def clear_conda_cache():
    """Clean the Conda and pip caches and report the size of Conda's directories.

    Steps, each of which is best-effort (a failure is printed and the next
    step still runs):

    1. ``conda clean --all -y``
    2. ``conda info --base`` to locate the installation, then print the size
       of each existing candidate directory (nothing is deleted here).
    3. ``pip cache purge`` using the pip of the interpreter running this code.

    A success line is only printed when the corresponding command exits with
    status 0; otherwise the command's stderr (or its exit code) is reported.

    Returns:
        None. All results are printed.
    """
    # Function-scope import: only needed to address the running interpreter's pip.
    import sys

    def _failure_detail(completed):
        """Return the stderr text of a finished process, or its exit code if stderr is empty."""
        detail = (completed.stderr or "").strip()
        return detail if detail else f"exit code {completed.returncode}"

    print("=" * 60)
    print("üßπ CONDA CACHE CLEANUP")
    print("=" * 60)

    # 1. Run conda clean --all
    print("\n1Ô∏è‚É£ Running 'conda clean --all'...")
    try:
        result = subprocess.run(['conda', 'clean', '--all', '-y'],
                                capture_output=True, text=True)
        print(result.stdout)
        if result.returncode == 0:
            print("‚úÖ Conda clean completed")
        else:
            # subprocess.run does not raise on a non-zero exit, so check it explicitly.
            print(f"‚ö†Ô∏è Error running conda clean: {_failure_detail(result)}")
    except Exception as e:
        print(f"‚ö†Ô∏è Error running conda clean: {e}")

    # 2. Find and show conda directories
    print("\n2Ô∏è‚É£ Locating Conda directories...")

    try:
        # Get conda info
        result = subprocess.run(['conda', 'info', '--base'],
                                capture_output=True, text=True)
        conda_base = result.stdout.strip() if result.returncode == 0 else ''

        candidates = []
        if conda_base:
            print(f"   Conda base: {conda_base}")
            candidates.append(os.path.join(conda_base, 'pkgs'))
            candidates.append(os.path.join(conda_base, 'conda-meta'))
        else:
            # Without a base path, joining would yield the relative paths
            # 'pkgs' / 'conda-meta' in the current directory, so skip them.
            print(f"‚ö†Ô∏è Error: could not determine Conda base ({_failure_detail(result)})")

        # Common cache locations
        candidates.append(os.path.expanduser('~/.conda/pkgs'))
        candidates.append(os.path.expanduser('~/anaconda3/pkgs'))
        candidates.append(os.path.expanduser('~/miniconda3/pkgs'))

        # Drop duplicates (e.g. the base is ~/anaconda3) while keeping order.
        cache_locations = list(dict.fromkeys(os.path.normpath(p) for p in candidates))

        print("\n3Ô∏è‚É£ Cache locations:")
        for location in cache_locations:
            if os.path.exists(location):
                size = sum(f.stat().st_size for f in Path(location).rglob('*') if f.is_file())
                size_mb = size / (1024**2)
                print(f"   üìÅ {location} ({size_mb:.2f} MB)")

    except Exception as e:
        print(f"‚ö†Ô∏è Error: {e}")

    # 3. Clear pip cache
    print("\n4Ô∏è‚É£ Clearing pip cache...")
    try:
        # `python -m pip` guarantees the pip that belongs to this interpreter,
        # rather than whichever `pip` happens to be first on PATH.
        result = subprocess.run([sys.executable, '-m', 'pip', 'cache', 'purge'],
                                capture_output=True, text=True)
        if result.returncode == 0:
            print("‚úÖ Pip cache cleared")
        else:
            print(f"‚ö†Ô∏è Error: {_failure_detail(result)}")
    except Exception as e:
        print(f"‚ö†Ô∏è Error: {e}")

    print("\n" + "=" * 60)
    print("‚úÖ CLEANUP COMPLETE!")
    print("=" * 60)

# Run the cleanup only when this code is executed directly (script or notebook
# cell), not when it is imported as a module.
if __name__ == "__main__":
    clear_conda_cache()


üßπ CONDA CACHE CLEANUP

1Ô∏è‚É£ Running 'conda clean --all'...
There are no unused tarball(s) to remove.
There are no index cache(s) to remove.
There are no unused package(s) to remove.
There are no tempfile(s) to remove.
There are no logfile(s) to remove.

‚úÖ Conda clean completed

2Ô∏è‚É£ Locating Conda directories...
   Conda base: D:\anaconda

3Ô∏è‚É£ Cache locations:
   üìÅ D:\anaconda\pkgs (4639.96 MB)
   üìÅ D:\anaconda\conda-meta (11.18 MB)

4Ô∏è‚É£ Clearing pip cache...
‚úÖ Pip cache cleared

‚úÖ CLEANUP COMPLETE!


In [2]:
# Delete the cached Google IP ranges so the next run fetches a fresh copy.
import os

try:
    os.remove('google_ip_ranges_cache.json')
except FileNotFoundError:
    # No cache file yet: nothing to delete. Attempting the removal directly
    # avoids the check-then-remove race of an os.path.exists() guard.
    pass


In [3]:
# Install required libraries in Jupyter
!pip install requests dnspython tqdm




In [None]:
import os
import json
import csv
import gzip
import socket
import time
import zipfile
import ipaddress
import requests
import dns.resolver
import dns.reversename
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from tqdm import tqdm
from datetime import datetime

# ============================================================================
# CONFIGURATION
# ============================================================================

MAX_DNS_WORKERS = 150        # thread-pool size used for parallel FcrDNS lookups
DNS_TIMEOUT = 2              # assigned to the resolver's timeout and lifetime
IP_RANGES_CACHE_FILE = "google_ip_ranges_cache.json"
CACHE_EXPIRY_HOURS = 24      # cached IP ranges older than this are re-fetched
GOOGLE_DOMAINS = [           # accepted reverse-DNS hostname suffixes
    '.googlebot.com',
    '.google.com',
    '.googleusercontent.com',
]
MAX_RECORDS_PER_FILE = 500_000

# Lenient mode and FcrDNS fallback
LENIENT_MODE = True  # Accept records with UA but missing/invalid IP
ENABLE_FCRDNS_FALLBACK = True  # Verify non-CIDR IPs with FcrDNS

# Source name -> URL of the JSON document listing that source's IP prefixes.
GOOGLE_IP_RANGE_URLS = dict(
    googlebot='https://developers.google.com/static/search/apis/ipranges/googlebot.json',
    special_crawlers='https://developers.google.com/static/search/apis/ipranges/special-crawlers.json',
    user_triggered_fetchers='https://developers.google.com/static/search/apis/ipranges/user-triggered-fetchers.json',
    user_triggered_fetchers_google='https://developers.google.com/static/search/apis/ipranges/user-triggered-fetchers-google.json',
)

# ============================================================================
# FETCH DYNAMIC IP RANGES
# ============================================================================

def fetch_google_ip_ranges():
    """Download Google's published crawler IP prefixes and parse them.

    Every URL in GOOGLE_IP_RANGE_URLS is requested in turn. A source that
    fails (network error, HTTP error status, malformed JSON) is reported and
    skipped. Prefix strings that cannot be parsed as a network are dropped
    silently.

    Returns:
        list: ipaddress network objects (IPv4 and IPv6); may be empty if
        every source failed.
    """
    print("\nüåê Fetching Google IP ranges from official sources...")

    collected = []
    for source_name, url in GOOGLE_IP_RANGE_URLS.items():
        try:
            print(f"   ‚Ä¢ Fetching {source_name}...")
            reply = requests.get(url, timeout=10)
            reply.raise_for_status()
            payload = reply.json()

            if 'prefixes' in payload:
                for entry in payload['prefixes']:
                    # An entry may carry an IPv4 prefix, an IPv6 prefix, or both.
                    for key in ('ipv4Prefix', 'ipv6Prefix'):
                        if key in entry:
                            collected.append(entry[key])

            print(f"     ‚úÖ Retrieved {len(payload.get('prefixes', []))} prefixes")
        except Exception as e:
            print(f"     ‚ö†Ô∏è Warning: Failed to fetch {source_name}: {str(e)}")

    ip_networks = []
    for cidr in collected:
        try:
            ip_networks.append(ipaddress.ip_network(cidr))
        except ValueError:
            # Not a valid CIDR string: skip it.
            pass

    ipv4_count = sum(1 for net in ip_networks if net.version == 4)
    ipv6_count = len(ip_networks) - ipv4_count

    print(f"\n‚úÖ Total IP ranges loaded: {len(ip_networks)} CIDR blocks")
    print(f"   ‚Ä¢ IPv4 ranges: {ipv4_count}")
    print(f"   ‚Ä¢ IPv6 ranges: {ipv6_count}")
    return ip_networks

def load_or_fetch_ip_ranges(cache_file=IP_RANGES_CACHE_FILE):
    """Load IP ranges from cache or fetch from Google if expired.

    The cache is a JSON file with a ``timestamp`` (seconds since the epoch)
    and a ``prefixes`` list of CIDR strings. It is used only when it is
    younger than CACHE_EXPIRY_HOURS *and* contains at least one range;
    otherwise fetch_google_ip_ranges() is called.

    An empty fetch result (every source failed) is never written to the
    cache, so a transient outage cannot pin an empty range list for the
    whole expiry window.

    Args:
        cache_file: Path of the JSON cache file.

    Returns:
        list: ipaddress network objects; may be empty if the fetch failed.
    """
    if os.path.exists(cache_file):
        try:
            with open(cache_file, 'r') as f:
                cache_data = json.load(f)

            cache_time = cache_data.get('timestamp', 0)
            cache_age_hours = (time.time() - cache_time) / 3600

            if cache_age_hours < CACHE_EXPIRY_HOURS:
                # Parse first so the "using cached" message is only printed
                # for a cache that is actually usable.
                ip_networks = [ipaddress.ip_network(cidr) for cidr in cache_data['prefixes']]
                if not ip_networks:
                    raise ValueError("cache contains no IP ranges")

                print(f"\n‚úÖ Using cached IP ranges (age: {cache_age_hours:.1f} hours)")
                ipv4_count = sum(1 for n in ip_networks if n.version == 4)
                ipv6_count = sum(1 for n in ip_networks if n.version == 6)
                print(f"   ‚Ä¢ IPv4 ranges: {ipv4_count}")
                print(f"   ‚Ä¢ IPv6 ranges: {ipv6_count}")
                return ip_networks
            else:
                print(f"\n‚è∞ Cache expired, fetching fresh data...")
        except Exception as e:
            print(f"\n‚ö†Ô∏è Cache error, fetching fresh data... ({e})")

    ip_networks = fetch_google_ip_ranges()

    if not ip_networks:
        # Nothing was fetched: leave any existing cache file untouched.
        print(f"‚ö†Ô∏è No IP ranges fetched; cache not updated")
        return ip_networks

    try:
        cache_data = {
            'timestamp': time.time(),
            'prefixes': [str(network) for network in ip_networks]
        }
        with open(cache_file, 'w') as f:
            json.dump(cache_data, f, indent=2)
        print(f"üíæ IP ranges cached")
    except Exception as e:
        print(f"‚ö†Ô∏è Failed to save cache ({e})")

    return ip_networks

# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================

def extract_full_timestamp_from_filename(filename):
    """Reduce a log file name to its identifying middle part.

    Strips a trailing ``.gz``, then a trailing ``.log``, then a leading
    ``nginx-``. Returns what is left, or None if nothing is left.
    """
    stem = filename
    # Order matters: '.gz' has to come off before '.log' can be seen.
    for suffix in ('.gz', '.log'):
        if stem.endswith(suffix):
            stem = stem[:-len(suffix)]

    prefix = 'nginx-'
    if stem.startswith(prefix):
        stem = stem[len(prefix):]

    return stem or None

def extract_first_ip(ip_string):
    """Return the first entry of a comma-separated X-Forwarded-For value.

    The entry is whitespace-trimmed and a surrounding ``[...]`` pair is
    removed. Returns None for a falsy input or an empty first entry. The
    result is not validated as an IP address.
    """
    if not ip_string:
        return None

    first_hop, _, _ = str(ip_string).partition(',')
    candidate = first_hop.strip()

    # Unwrap a bracketed literal such as "[2001:db8::1]".
    if candidate[:1] == '[' and candidate[-1:] == ']':
        candidate = candidate[1:-1]

    return candidate or None

def is_googlebot(user_agent_str):
    """UA validation - check whether a user-agent contains a Googlebot pattern.

    The match is a case-insensitive substring test against a fixed pattern
    list. It only inspects the user-agent text and proves nothing about the
    client's identity.

    Args:
        user_agent_str: User-agent value; any object is accepted and
            converted with str().

    Returns:
        bool: True if any pattern occurs in the user-agent, otherwise False
        (also for falsy input or a value whose str() conversion fails).
    """
    if not user_agent_str:
        return False

    try:
        ua_lower = str(user_agent_str).lower()
    except Exception:
        # Only ordinary errors are treated as "not Googlebot". A bare
        # `except:` would also swallow KeyboardInterrupt / SystemExit and
        # make a long-running scan impossible to interrupt here.
        return False

    googlebot_patterns = (
        'googlebot', 'google-inspectiontool', 'googlebot-image', 'googlebot-news',
        'googlebot-video', 'adsbot-google', 'mediapartners-google', 'apis-google',
        'google favicon', 'feedfetcher-google', 'google-read-aloud', 'duplichecker',
        'google web preview', 'google-site-verification', 'google-smartphone'
    )

    return any(pattern in ua_lower for pattern in googlebot_patterns)

def is_search_api_url(url):
    """Tell whether a URL contains the ``/search-api/v1/search/`` path (case-insensitive)."""
    return bool(url) and '/search-api/v1/search/' in str(url).lower()

def create_zip_archive(file_list, output_folder, archive_name="googlebot_data"):
    """Bundle the given files into a timestamped, deflate-compressed ZIP.

    Each file is stored under its base name only (no directories). The
    archive is written to ``output_folder`` as
    ``<archive_name>_<YYYYmmdd_HHMMSS>.zip``.

    Returns:
        str: Path of the created archive.
    """
    stamp = time.strftime('%Y%m%d_%H%M%S')
    zip_filename = os.path.join(output_folder, f"{archive_name}_{stamp}.zip")

    print(f"\nüì¶ Creating ZIP archive...")
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as archive:
        for member_path in tqdm(file_list, desc="Archiving", unit=" files"):
            member_name = os.path.basename(member_path)
            archive.write(member_path, arcname=member_name)

    zip_size_mb = os.path.getsize(zip_filename) / (1024**2)
    print(f"‚úÖ ZIP created: {os.path.basename(zip_filename)} ({zip_size_mb:.2f} MB)")
    return zip_filename

# ============================================================================
# VERIFICATION METHODS
# ============================================================================

def verify_ip_in_range(ip, ip_networks):
    """Return True if ``ip`` lies inside any network of ``ip_networks``.

    A value that cannot be parsed as an IP address yields False.
    """
    try:
        address = ipaddress.ip_address(ip)
    except ValueError:
        return False
    return any(address in network for network in ip_networks)

def verify_ip_method2_dns_fast(ip):
    """FcrDNS (forward-confirmed reverse DNS) verification using dnspython.

    Steps:
      1. Reverse (PTR) lookup of ``ip``; the first PTR hostname must end with
         one of GOOGLE_DOMAINS.
      2. Forward lookup of that hostname, using the record type that matches
         the address family (A for IPv4, AAAA for IPv6).
      3. The original address must be among the forward results.

    Addresses are compared as parsed ``ipaddress`` objects rather than as
    strings, so an IPv6 address logged in a non-canonical spelling
    (upper-case, uncompressed zeros) still matches the DNS answer.

    Args:
        ip: IPv4 or IPv6 address as text.

    Returns:
        bool: True only if every step succeeds. Any failure - invalid
        address, NXDOMAIN, empty answer, timeout or other error - yields
        False; this function never raises an ordinary exception.
    """
    try:
        # Raises ValueError for anything that is not an IP address, which
        # also avoids issuing DNS queries for garbage input.
        target = ipaddress.ip_address(ip)

        rev_name = dns.reversename.from_address(str(target))

        resolver = dns.resolver.Resolver()
        resolver.timeout = DNS_TIMEOUT
        resolver.lifetime = DNS_TIMEOUT

        reverse_answers = resolver.resolve(rev_name, 'PTR')
        hostname = str(reverse_answers[0]).rstrip('.')
        hostname_lower = hostname.lower()

        # GOOGLE_DOMAINS entries start with '.', so this is a true
        # sub-domain test and not a bare suffix match.
        if not any(hostname_lower.endswith(domain) for domain in GOOGLE_DOMAINS):
            return False

        # Only the record type of the address's own family can confirm it;
        # querying the other family would just add latency and another
        # chance to time out.
        record_type = 'A' if target.version == 4 else 'AAAA'
        forward_answers = resolver.resolve(hostname, record_type)

        for rdata in forward_answers:
            try:
                if ipaddress.ip_address(str(rdata)) == target:
                    return True
            except ValueError:
                continue
        return False

    except Exception:
        # Covers dnspython's NXDOMAIN / NoAnswer / Timeout (all derive from
        # dns.exception.DNSException) as well as invalid input.
        return False

def verify_ips_parallel_dns(ips_to_verify, desc="DNS Verification"):
    """Run verify_ip_method2_dns_fast over many IPs on a thread pool.

    Uses up to MAX_DNS_WORKERS threads and shows a tqdm progress bar
    labelled ``desc``.

    Returns:
        dict: Maps every submitted IP to the lookup's result; an IP whose
        lookup raised is mapped to False.
    """
    outcomes = {}

    with ThreadPoolExecutor(max_workers=MAX_DNS_WORKERS) as pool:
        pending = {}
        for address in ips_to_verify:
            pending[pool.submit(verify_ip_method2_dns_fast, address)] = address

        with tqdm(total=len(ips_to_verify), desc=desc, unit=" IPs") as progress:
            for finished in as_completed(pending):
                address = pending[finished]
                try:
                    outcomes[address] = finished.result(timeout=DNS_TIMEOUT + 1)
                except Exception:
                    outcomes[address] = False
                finally:
                    progress.update(1)

    return outcomes

# ============================================================================
# MAIN PROCESSING - IN-MEMORY 3-PHASE (NO INTERMEDIATE FILES)
# ============================================================================

def process_log_files_final(input_folder=None, input_file=None, output_folder="googlebot_final_only", 
                            max_records_per_file=500000, create_zip=True, gsc_expected=14913024):
    """
    IN-MEMORY 3-PHASE GOOGLEBOT EXTRACTION
    Only outputs final Phase 3 files with search-API filtering

    Pipeline:
      Phase 1 - read each log file (one JSON object per line, optionally
                gzip-compressed) and keep, in memory, the records whose
                ``http_user_agent`` passes is_googlebot().
      Phase 2 - check every unique, parseable client IP against Google's
                published CIDR ranges. Values that are not IP addresses are
                left out so Phase 3 can treat them as "invalid IP".
      Phase 3 - verify the IPs outside those ranges with FcrDNS (when
                ENABLE_FCRDNS_FALLBACK is set), then write the accepted
                records to CSV, dropping search-API URLs.

    A record is accepted when its IP is in a CIDR range, passed FcrDNS, or -
    with LENIENT_MODE on - is missing / a placeholder / not a valid address.

    Args:
        input_folder: Folder scanned for ``*.log`` and ``*.log.gz`` files.
            Ignored when ``input_file`` is given.
        input_file: Single log file to process.
        output_folder: Folder for the CSV files and the ZIP (created if
            missing).
        max_records_per_file: Row limit per CSV; further rows go to
            ``<name>_partN.csv``.
        create_zip: Whether to bundle the CSV files into a ZIP archive.
        gsc_expected: Reference record count the final count is compared
            with in the summary (presumably a Google Search Console figure -
            confirm before relying on the default).

    Returns:
        dict with keys ``output_folder``, ``phase3_files``, ``final_records``,
        ``search_api_filtered``, ``match_rate`` and ``rejection_stats``; or
        None when the IP ranges cannot be loaded or no input is found.
    """
    
    print("=" * 80)
    print("üéØ IN-MEMORY 3-PHASE GOOGLEBOT EXTRACTION")
    print("=" * 80)
    print(f"   Lenient Mode: {'ENABLED' if LENIENT_MODE else 'DISABLED'}")
    print(f"   FcrDNS Fallback: {'ENABLED' if ENABLE_FCRDNS_FALLBACK else 'DISABLED'}")
    print(f"   Search-API Filter: ENABLED")
    
    # Load IP ranges
    print(f"\n{'=' * 80}")
    print(f"LOADING GOOGLE IP RANGES")
    print(f"{'=' * 80}")
    
    try:
        google_ip_networks = load_or_fetch_ip_ranges()
        print(f"‚úÖ Ready with {len(google_ip_networks)} official IP ranges")
    except Exception as e:
        print(f"‚ùå Failed to load IP ranges: {str(e)}")
        return None
    
    # Get input files
    log_files = []
    if input_file:
        if os.path.exists(input_file):
            log_files = [input_file]
        else:
            print(f"‚ùå File not found: {input_file}")
            return None
    elif input_folder:
        if os.path.exists(input_folder):
            log_files = sorted([os.path.join(input_folder, f) for f in os.listdir(input_folder) 
                        if f.endswith('.log') or f.endswith('.log.gz')])
            if not log_files:
                print(f"‚ùå No .log files found in: {input_folder}")
                return None
        else:
            print(f"‚ùå Folder not found: {input_folder}")
            return None
    else:
        print("‚ùå Please provide input_folder or input_file")
        return None
    
    os.makedirs(output_folder, exist_ok=True)
    abs_output_folder = os.path.abspath(output_folder)
    
    print(f"\nüìÅ Input files: {len(log_files)}")
    
    print(f"\nüîç IN-MEMORY 3-PHASE STRATEGY:")
    print(f"   Phase 1: Filter by Googlebot UA ‚Üí Store in memory")
    print(f"   Phase 2: Check IPs in CIDR ‚Üí Store in memory")
    print(f"   Phase 3: FcrDNS verification ‚Üí Write final output (no intermediate files)")
    print(f"   Phase 4: Search-API filter ‚Üí Final clean dataset")
    
    # ========================================================================
    # PHASE 1: IN-MEMORY UA FILTERING
    # ========================================================================
    print(f"\n{'=' * 80}")
    print(f"PHASE 1: FILTER BY GOOGLEBOT UA (IN-MEMORY)")
    print(f"{'=' * 80}")
    
    googlebot_records = []  # Store all googlebot records in memory
    unique_ips_phase1 = set()
    googlebot_variants = {}
    
    stats_phase1 = {
        'total_processed': 0,
        'has_googlebot_ua': 0,
    }
    
    start_phase1 = time.time()
    
    for file_idx, log_file in enumerate(log_files, 1):
        filename = os.path.basename(log_file)
        file_size_mb = os.path.getsize(log_file) / (1024**2)
        print(f"\nüìÇ [{file_idx}/{len(log_files)}] {filename} ({file_size_mb:.1f} MB)")
        
        file_accepted = 0
        
        try:
            if log_file.endswith('.gz'):
                file_handle = gzip.open(log_file, 'rt', encoding='utf-8', errors='ignore')
            else:
                file_handle = open(log_file, 'r', encoding='utf-8', errors='ignore')
            
            with file_handle as log_reader:
                pbar = tqdm(log_reader, desc="   Filtering UA", unit=" recs", ncols=100, mininterval=0.5)
                
                for line in pbar:
                    try:
                        if not line.strip():
                            continue
                        
                        log_entry = json.loads(line.strip())
                        
                        stats_phase1['total_processed'] += 1
                        
                        user_agent = log_entry.get('http_user_agent', '')
                        if user_agent:
                            user_agent = str(user_agent).strip()
                        
                        if not is_googlebot(user_agent):
                            continue
                        
                        stats_phase1['has_googlebot_ua'] += 1
                        
                        # Track variant: the first specific variant wins; the
                        # for/else falls back to the generic bucket.
                        ua_lower = user_agent.lower()
                        for variant in ['googlebot-image', 'googlebot-news', 'googlebot-video', 
                                       'google-inspectiontool', 'adsbot-google', 'mediapartners-google']:
                            if variant in ua_lower:
                                key = variant.replace('-', ' ').title().replace(' ', '-')
                                googlebot_variants[key] = googlebot_variants.get(key, 0) + 1
                                break
                        else:
                            if 'googlebot' in ua_lower:
                                googlebot_variants['Googlebot (standard)'] = googlebot_variants.get('Googlebot (standard)', 0) + 1
                        
                        # Extract all required fields
                        ip = extract_first_ip(log_entry.get('http_x_forwarded_for', ''))
                        if ip:
                            unique_ips_phase1.add(ip)
                        
                        # Store record with all fields
                        record = {
                            'time_iso8601': str(log_entry.get('time_iso8601', '')).strip(),
                            'request_uri': str(log_entry.get('request_uri', '')).strip(),
                            'status': str(log_entry.get('status', '')).strip(),
                            'http_user_agent': user_agent,
                            'http_x_forwarded_for': ip if ip else '',
                            'geoip_country_code': str(log_entry.get('geoip_country_code', '')).strip(),
                            'upstream_response_time': str(log_entry.get('upstream_response_time', '')).strip(),
                            'bytes_sent': str(log_entry.get('bytes_sent', log_entry.get('body_bytes_sent', ''))).strip(),
                            'source_file': filename
                        }
                        
                        googlebot_records.append(record)
                        file_accepted += 1
                    
                    except Exception:
                        # Deliberately best-effort: a malformed line is
                        # skipped so one bad record cannot abort the file.
                        continue
                
                pbar.close()
            
            print(f"   ‚úÖ Accepted: {file_accepted:,} records with Googlebot UA")
        
        except Exception as e:
            print(f"   ‚ùå Error: {str(e)}")
            continue
    
    elapsed_phase1 = time.time() - start_phase1
    
    print(f"\n{'=' * 80}")
    print(f"‚úÖ PHASE 1 COMPLETE")
    print(f"{'=' * 80}")
    print(f"   Total processed: {stats_phase1['total_processed']:,}")
    print(f"   Googlebot UA filtered: {len(googlebot_records):,}")
    print(f"   Unique IPs: {len(unique_ips_phase1):,}")
    print(f"   Time: {elapsed_phase1:.1f}s ({elapsed_phase1/60:.1f} min)")
    
    # ========================================================================
    # PHASE 2: CIDR CHECK (IN-MEMORY)
    # ========================================================================
    print(f"\n{'=' * 80}")
    print(f"PHASE 2: CHECK IPS AGAINST CIDR RANGES (IN-MEMORY)")
    print(f"{'=' * 80}")
    
    start_phase2 = time.time()
    
    print(f"\nüîç Checking {len(unique_ips_phase1):,} IPs against CIDR ranges...")
    verified_ips_cidr = {}
    unparseable_ips = set()
    for ip in tqdm(unique_ips_phase1, desc="Checking IPs", unit=" IPs"):
        try:
            ipaddress.ip_address(ip)
        except ValueError:
            # Not an IP address at all (placeholder or garbage). Keep it out
            # of verified_ips_cidr so Phase 3 reaches its "invalid IP" branch
            # instead of sending the value to DNS and rejecting the record.
            unparseable_ips.add(ip)
            continue
        verified_ips_cidr[ip] = verify_ip_in_range(ip, google_ip_networks)
    
    ips_in_range = sum(1 for v in verified_ips_cidr.values() if v)
    unique_ips_outside_cidr = {ip for ip, in_range in verified_ips_cidr.items() if not in_range}
    
    print(f"   ‚úÖ IPs in CIDR ranges: {ips_in_range:,}/{len(unique_ips_phase1):,}")
    print(f"   ‚ö†Ô∏è IPs outside ranges: {len(unique_ips_outside_cidr):,} (will check with FcrDNS in Phase 3)")
    print(f"   ‚ö†Ô∏è Unparseable IP values: {len(unparseable_ips):,} (not sent to DNS)")
    
    elapsed_phase2 = time.time() - start_phase2
    
    print(f"\n{'=' * 80}")
    print(f"‚úÖ PHASE 2 COMPLETE")
    print(f"{'=' * 80}")
    print(f"   Time: {elapsed_phase2:.1f}s ({elapsed_phase2/60:.1f} min)")
    
    # ========================================================================
    # PHASE 3: FcrDNS + WRITE FINAL OUTPUT
    # ========================================================================
    print(f"\n{'=' * 80}")
    print(f"PHASE 3: FcrDNS VERIFICATION + WRITE FINAL OUTPUT")
    print(f"{'=' * 80}")
    
    start_phase3 = time.time()
    dns_verified_ips = {}
    
    if ENABLE_FCRDNS_FALLBACK and len(unique_ips_outside_cidr) > 0:
        print(f"\nüåê Starting FcrDNS verification for {len(unique_ips_outside_cidr):,} IPs outside CIDR ranges...")
        print(f"   Using {MAX_DNS_WORKERS} workers")
        
        start_dns = time.time()
        
        dns_verified_ips = verify_ips_parallel_dns(unique_ips_outside_cidr, desc="FcrDNS Verification")
        
        dns_pass = sum(1 for v in dns_verified_ips.values() if v)
        elapsed_dns = time.time() - start_dns
        
        print(f"\n‚úÖ FcrDNS Verification Results:")
        print(f"   Passed: {dns_pass:,}/{len(unique_ips_outside_cidr):,}")
        print(f"   Time: {elapsed_dns:.1f}s ({elapsed_dns/60:.1f} min)")
        if elapsed_dns > 0:
            # Guarded: a coarse clock can report zero elapsed time.
            print(f"   Speed: {len(unique_ips_outside_cidr)/elapsed_dns:.0f} IPs/sec")
    
    print(f"\nüìù Writing final verified records (with search-API filtering)...")
    
    phase3_files = []
    total_phase3_before_filter = 0
    total_phase3_after_filter = 0
    search_api_filtered = 0
    
    rejection_stats = {
        'in_cidr': 0,
        'missing_ip_accepted': 0,
        'invalid_ip_accepted': 0,
        'fcrdns_passed': 0,
        'fcrdns_failed_rejected': 0
    }
    
    csv_fieldnames = [
        'time_iso8601', 'request_uri', 'status', 'http_user_agent',
        'http_x_forwarded_for', 'geoip_country_code', 
        'upstream_response_time', 'bytes_sent'
    ]
    
    def create_output_file(base_filename, file_index):
        """Open the CSV for one part, write its header and register its path."""
        if file_index == 1:
            csv_filename = os.path.join(output_folder, f"{base_filename}.csv")
        else:
            csv_filename = os.path.join(output_folder, f"{base_filename}_part{file_index}.csv")
        
        csvfile = open(csv_filename, 'w', newline='', encoding='utf-8-sig')
        writer = csv.DictWriter(csvfile, fieldnames=csv_fieldnames, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        phase3_files.append(csv_filename)
        return csvfile, writer
    
    # Group records by source file
    records_by_file = {}
    for record in googlebot_records:
        records_by_file.setdefault(record['source_file'], []).append(record)
    
    for source_file, records in tqdm(sorted(records_by_file.items()), desc="Writing files", unit=" files"):
        file_timestamp = extract_full_timestamp_from_filename(source_file)
        
        if file_timestamp:
            base_filename = f"googlebot_{file_timestamp}"
        else:
            base_filename = f"googlebot_file"
        
        current_file_index = 1
        current_file_records = 0
        current_csvfile, current_writer = create_output_file(base_filename, current_file_index)
        
        try:
            for record in records:
                ip = record['http_x_forwarded_for'].strip()
                
                # Determine if record should be accepted
                accept_record = False
                
                if not ip or ip.lower() in ['none', 'null', '-', 'unknown']:
                    if LENIENT_MODE:
                        accept_record = True
                        rejection_stats['missing_ip_accepted'] += 1
                elif ip in verified_ips_cidr:
                    if verified_ips_cidr[ip]:
                        accept_record = True
                        rejection_stats['in_cidr'] += 1
                    elif dns_verified_ips.get(ip, False):
                        accept_record = True
                        rejection_stats['fcrdns_passed'] += 1
                    else:
                        rejection_stats['fcrdns_failed_rejected'] += 1
                else:
                    # Value is present but is not a valid IP address.
                    if LENIENT_MODE:
                        accept_record = True
                        rejection_stats['invalid_ip_accepted'] += 1
                
                if not accept_record:
                    continue
                
                total_phase3_before_filter += 1
                
                # Check for search-API URL
                if is_search_api_url(record['request_uri']):
                    search_api_filtered += 1
                    continue  # Skip this record
                
                total_phase3_after_filter += 1
                
                if current_file_records >= max_records_per_file:
                    current_csvfile.close()
                    current_file_index += 1
                    current_csvfile, current_writer = create_output_file(base_filename, current_file_index)
                    current_file_records = 0
                
                # 'source_file' is internal bookkeeping and is not exported.
                current_writer.writerow({field: record[field] for field in csv_fieldnames})
                current_file_records += 1
        finally:
            # Close the handle even if a write fails part-way through.
            if not current_csvfile.closed:
                current_csvfile.close()
    
    # Covers both the FcrDNS lookups and the CSV writing, as the label says.
    elapsed_phase3 = time.time() - start_phase3
    
    print(f"\n{'=' * 80}")
    print(f"‚úÖ PHASE 3 COMPLETE")
    print(f"{'=' * 80}")
    print(f"   Before search-API filter: {total_phase3_before_filter:,}")
    print(f"   Search-API URLs filtered: {search_api_filtered:,}")
    print(f"   After search-API filter: {total_phase3_after_filter:,}")
    print(f"   Breakdown:")
    print(f"   - CIDR verified: {rejection_stats['in_cidr']:,}")
    print(f"   - Missing IP (lenient): {rejection_stats['missing_ip_accepted']:,}")
    print(f"   - Invalid IP (lenient): {rejection_stats['invalid_ip_accepted']:,}")
    print(f"   - FcrDNS passed: {rejection_stats['fcrdns_passed']:,}")
    print(f"   - FcrDNS failed (rejected): {rejection_stats['fcrdns_failed_rejected']:,}")
    print(f"   Output files: {len(phase3_files)}")
    
    # ========================================================================
    # SUMMARY
    # ========================================================================
    total_time = time.time() - start_phase1
    
    # Guard against a zero reference count supplied by the caller.
    if total_phase3_after_filter > 0 and gsc_expected:
        match_rate = total_phase3_after_filter / gsc_expected * 100
    else:
        match_rate = 0
    
    print(f"\n{'=' * 80}")
    print(f"‚úÖ COMPLETE: IN-MEMORY 3-PHASE EXTRACTION")
    print(f"{'=' * 80}")
    
    print(f"\n‚è±Ô∏è  PERFORMANCE:")
    print(f"   Phase 1 (UA Filter): {elapsed_phase1:.1f}s ({elapsed_phase1/60:.1f} min)")
    print(f"   Phase 2 (CIDR Check): {elapsed_phase2:.1f}s ({elapsed_phase2/60:.1f} min)")
    print(f"   Phase 3 (FcrDNS + Write): {elapsed_phase3:.1f}s ({elapsed_phase3/60:.1f} min)")
    print(f"   TOTAL: {total_time:.1f}s ({total_time/60:.1f} min)")
    
    print(f"\nüìä RESULTS:")
    print(f"   Phase 1: {len(googlebot_records):,} (Googlebot UA)")
    print(f"   Phase 2: {len(unique_ips_phase1):,} unique IPs checked")
    print(f"   Phase 3 (before search-API): {total_phase3_before_filter:,}")
    print(f"   Phase 3 (after search-API): {total_phase3_after_filter:,} ‚úÖ")
    
    print(f"\nü§ñ GOOGLEBOT VARIANTS:")
    for variant, count in sorted(googlebot_variants.items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"   ‚Ä¢ {variant}: {count:,}")
    
    print(f"\nüéØ GSC COMPARISON:")
    print(f"   GSC: {gsc_expected:,}")
    print(f"   Output: {total_phase3_after_filter:,}")
    print(f"   Difference: {abs(gsc_expected - total_phase3_after_filter):,}")
    print(f"   Match: {match_rate:.1f}%")
    
    if match_rate >= 98:
        print(f"   üèÜ NEAR-PERFECT!")
    elif match_rate >= 95:
        print(f"   ‚úÖ EXCELLENT MATCH!")
    elif match_rate >= 90:
        print(f"   ‚úÖ VERY GOOD!")
    
    # Create ZIP
    if create_zip:
        create_zip_archive(phase3_files, output_folder, "googlebot_final")
    
    return {
        'output_folder': abs_output_folder,
        'phase3_files': phase3_files,
        'final_records': total_phase3_after_filter,
        'search_api_filtered': search_api_filtered,
        'match_rate': match_rate,
        'rejection_stats': rejection_stats
    }

# ============================================================================
# USAGE
# ============================================================================

# Banner
print("\n".join([
    "=" * 80,
    "üéØ IN-MEMORY 3-PHASE GOOGLEBOT EXTRACTION",
    "   Phase 1-2: In-Memory | Phase 3: Final Output + Search-API Filter",
    "=" * 80,
]))

# Ask for a folder first; an empty answer switches to single-file mode.
INPUT_FOLDER = input("\nüìÇ Enter folder path (or empty for file): ").strip()

if INPUT_FOLDER:
    results = process_log_files_final(
        input_folder=INPUT_FOLDER,
        output_folder="googlebot_final_only",
        max_records_per_file=500000,
        create_zip=True
    )
else:
    INPUT_FILE = input("üìÑ Enter file path: ").strip()
    results = process_log_files_final(
        input_file=INPUT_FILE,
        output_folder="googlebot_final_only",
        max_records_per_file=500000,
        create_zip=True
    )

# `results` is None when the run aborted early; report only on success.
if results:
    print(f"\nüéâ DONE!")
    print(f"üìÇ {results['output_folder']}")
    print(f"üìã Final: {results['final_records']:,} records")
    print(f"üö´ Search-API filtered: {results['search_api_filtered']:,} records")
    #print(f"üéØ Match: {results['match_rate']:.1f}%")
