In [3]:
import os
import shutil
import subprocess
from pathlib import Path

def clean_conda_pip_cache():
    """Purge the conda and pip caches and delete Jupyter checkpoint folders.

    Runs three best-effort steps and reports progress on stdout:

    1. ``conda clean -a -y`` via the ``conda`` executable on PATH.
    2. ``pip cache purge`` via the ``pip`` executable on PATH, falling back to
       ``<current interpreter> -m pip cache purge`` when ``pip`` is not found.
    3. Recursive removal of every ``.ipynb_checkpoints`` directory below the
       current working directory, with a running total of the bytes freed.

    A failure in one step is printed and does not prevent the later steps
    from running. Returns ``None``.
    """
    # Local import: this cell's import block does not provide `sys`, and the
    # pip fallback needs the path of the interpreter running this kernel.
    import sys

    print("üßπ STARTING CLEANUP: Conda & Pip Caches")
    print("="*60)

    # 1. CONDA CLEANUP
    # 'conda clean --all' is the standard way; it is run as a subprocess.
    print("\nüì¶ Running Conda Cleanup...")
    try:
        # -a: all (index cache, lock files, unused cache packages, tarballs)
        # -y: yes (do not ask for confirmation)
        result = subprocess.run(["conda", "clean", "-a", "-y"], capture_output=True, text=True)
        if result.returncode == 0:
            print("‚úÖ Conda clean successful.")
            # Only the first five lines of conda's output are shown as a summary.
            print("\n".join(result.stdout.split('\n')[:5]) + "...")
        else:
            print(f"‚ö†Ô∏è Conda clean warning:\n{result.stderr}")
    except Exception as e:
        # Covers a missing conda executable as well as any other launch failure.
        print(f"‚ùå Error running conda clean: {e}")

    # 2. PIP CLEANUP
    # Pip cache is usually at ~/.cache/pip or %LocalAppData%\pip\Cache
    print("\nüêç Running Pip Cleanup...")

    def _report_pip(result):
        """Print the outcome of a finished `pip cache purge` process."""
        if result.returncode == 0:
            print(f"‚úÖ Pip cache purge successful: {result.stdout.strip()}")
        else:
            print(f"‚ö†Ô∏è Pip purge warning:\n{result.stderr}")

    try:
        _report_pip(subprocess.run(["pip", "cache", "purge"], capture_output=True, text=True))
    except FileNotFoundError:
        print("‚ÑπÔ∏è 'pip' command not found in path. Trying module execution...")
        # Fallback: run pip as a module of the interpreter executing this code.
        # A bare "python" may be absent from PATH or belong to another
        # environment, so sys.executable is used instead.
        try:
            _report_pip(
                subprocess.run(
                    [sys.executable, "-m", "pip", "cache", "purge"],
                    capture_output=True,
                    text=True,
                )
            )
        except Exception as e:
            print(f"‚ö†Ô∏è Pip purge warning:\n{e}")

    # 3. JUPYTER CHECKPOINTS CLEANUP (Optional but recommended)
    print("\nüìì Cleaning Jupyter Checkpoints (.ipynb_checkpoints)...")
    cwd = Path(os.getcwd())
    count = 0
    size_cleared = 0

    # Collect the matches before deleting anything: removing directories while
    # rglob() is still walking the tree can make the traversal fail part-way.
    checkpoint_dirs = list(cwd.rglob(".ipynb_checkpoints"))

    for p in checkpoint_dirs:
        # A match nested inside an already-removed folder no longer exists.
        if not p.is_dir():
            continue
        try:
            # Calculate size before deleting
            current_size = sum(f.stat().st_size for f in p.rglob('*') if f.is_file())
            shutil.rmtree(p)
            count += 1
            size_cleared += current_size
        except Exception as e:
            print(f"   ‚ö†Ô∏è Could not remove {p}: {e}")

    mb_cleared = size_cleared / (1024 * 1024)
    print(f"‚úÖ Removed {count} checkpoint folders (~{mb_cleared:.2f} MB)")

    print("\n" + "="*60)
    print("‚ú® CLEANUP COMPLETE")

# Run the cleanup as soon as this cell executes: it purges the conda and pip
# caches and deletes .ipynb_checkpoints folders under the current working directory.
clean_conda_pip_cache()


üßπ STARTING CLEANUP: Conda & Pip Caches

üì¶ Running Conda Cleanup...
‚úÖ Conda clean successful.
There are no unused tarball(s) to remove.
There are no index cache(s) to remove.
There are no unused package(s) to remove.
There are no tempfile(s) to remove.
There are no logfile(s) to remove....

üêç Running Pip Cleanup...
‚úÖ Pip cache purge successful: Files removed: 0 (0 bytes)

üìì Cleaning Jupyter Checkpoints (.ipynb_checkpoints)...
‚úÖ Removed 2 checkpoint folders (~61.39 MB)

‚ú® CLEANUP COMPLETE


In [2]:
import pandas as pd
import numpy as np
import requests
import xml.etree.ElementTree as ET
import os
import concurrent.futures
import ipywidgets as widgets
from IPython.display import display, clear_output
from tqdm.notebook import tqdm
import warnings
import gc
import csv
import sys
import zlib
from io import BytesIO

warnings.filterwarnings('ignore')

# ============================================================================
# 1. AUTO-CREATE WORKER FILE (Fixes ModuleNotFoundError)
# ============================================================================
# The worker function is written to its own module file and imported from
# there, rather than being defined in the notebook itself, so that it is
# importable by name.
worker_code = """
import requests
import zlib
from lxml import etree
from io import BytesIO

def parse_gz_sitemap_worker(gz_url):
    # Download one gzip-compressed sitemap and return a list of
    # {'url', 'last_modified', 'image_caption'} dicts.
    # Returns [] on a non-200 response or on any download/parse failure.
    urls_data = []
    try:
        with requests.get(gz_url, stream=True, timeout=60) as response:
            if response.status_code != 200:
                return []
            # 16 + MAX_WBITS tells zlib to expect a gzip header/trailer.
            d = zlib.decompressobj(16 + zlib.MAX_WBITS)
            def stream_generator():
                for chunk in response.iter_content(chunk_size=65536):
                    if chunk:
                        yield d.decompress(chunk)
                # Emit whatever the decompressor is still buffering.
                yield d.flush()
            context = etree.iterparse(
                BytesIO(b"".join(stream_generator())), 
                events=('end',), 
                tag='{http://www.sitemaps.org/schemas/sitemap/0.9}url'
            )
            ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9', 'image': 'http://www.google.com/schemas/sitemap-image/1.1'}
            for event, elem in context:
                try:
                    loc = elem.find('sm:loc', ns)
                    url_text = loc.text if loc is not None else None
                    if url_text:
                        lastmod = elem.find('sm:lastmod', ns)
                        image_elem = elem.find('image:image', ns)
                        image_caption = ""
                        if image_elem is not None:
                            cap_node = image_elem.find('image:caption', ns)
                            image_caption = cap_node.text if cap_node is not None else ""
                            if not image_caption:
                                title_node = image_elem.find('image:title', ns)
                                image_caption = title_node.text if title_node is not None else ""
                        # Last resort: derive a caption from the final URL segment.
                        if not image_caption:
                            image_caption = url_text.split('/')[-1].replace('-', ' ').replace('.html', '')
                        urls_data.append({'url': url_text, 'last_modified': lastmod.text if lastmod is not None else None, 'image_caption': image_caption})
                except Exception: pass
                finally:
                    # Release already-processed elements to keep memory bounded.
                    elem.clear()
                    while elem.getprevious() is not None:
                        del elem.getparent()[0]
            del context
            return urls_data
    except Exception: return []
"""

# Write the file to current directory
with open('sitemap_worker.py', 'w', encoding='utf-8') as f:
    f.write(worker_code)

# Ensure current dir is in path and import
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

try:
    import importlib

    # The file was just (re)written. Drop stale finder caches and, when the
    # module is already loaded in this kernel, reload it so a re-run of this
    # cell picks up the code on disk instead of the cached module object.
    importlib.invalidate_caches()
    if 'sitemap_worker' in sys.modules:
        importlib.reload(sys.modules['sitemap_worker'])
    from sitemap_worker import parse_gz_sitemap_worker
    print("‚úÖ Worker module loaded successfully.")
except ImportError as e:
    print(f"‚ùå Failed to load worker module: {e}")

# ============================================================================
# 2. MAIN LOGIC (MULTIPROCESSING)
# ============================================================================
def detect_stale_pages_bouncer(log_folder_path, sitemap_list_path, output_dir='stale_pages_output',
                               max_records_per_file=500000, max_gz_per_index=None,
                               log_batch_size=5, workers=None):
    """Cross-reference crawl logs with sitemap URLs and export the stale pages.

    A sitemap URL is reported when it is either an "Orphan" (never seen in
    the logs) or "Low Activity" (its total crawl count is at or below the
    10th percentile of all logged URLs).

    Args:
        log_folder_path: Folder containing ``.csv`` log files. Each file must
            provide the columns ``request_uri``, ``http_user_agent``,
            ``time_iso8601`` and ``status``.
        sitemap_list_path: Text file with one sitemap-index URL per line.
        output_dir: Folder for the result CSV files; created if missing.
        max_records_per_file: Maximum rows per output CSV. Larger results are
            split into ``stale_pages_part<N>.csv`` files.
        max_gz_per_index: If truthy, only the first N sitemap files of each
            index are processed.
        log_batch_size: Number of log files loaded and aggregated together.
        workers: Number of worker processes. ``None`` means CPU count minus
            two, with a minimum of one.

    Returns:
        The DataFrame of stale pages that was written to disk, or ``None``
        when an input is missing/empty or no stale page was found.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Auto-detect optimal workers if not specified. os.cpu_count() may return
    # None when the count cannot be determined, so fall back to 1.
    if workers is None:
        workers = max(1, (os.cpu_count() or 1) - 2)

    # Zero or negative values would crash range() / ProcessPoolExecutor /
    # the part-splitting arithmetic further down.
    workers = max(1, int(workers))
    log_batch_size = max(1, int(log_batch_size))
    max_records_per_file = max(1, int(max_records_per_file))

    # --- Check Sitemap List File ---
    if not os.path.exists(sitemap_list_path):
        print(f"‚ùå Sitemap list file not found: {sitemap_list_path}")
        return None

    try:
        with open(sitemap_list_path, 'r', encoding='utf-8') as f:
            sitemap_urls = [line.strip() for line in f if line.strip()]
    except Exception as e:
        print(f"‚ùå Error reading sitemap list file: {e}")
        return None

    if not sitemap_urls:
        print("‚ùå Sitemap list file is empty")
        return None

    print("="*80)
    print("üï∫ STALE PAGE DETECTION: THE PARTY BOUNCER (MULTIPROCESSING MODE)")
    print("="*80)
    print(f"üìÅ Log folder: {log_folder_path}")
    print(f"üìÑ Sitemap list: {sitemap_list_path}")
    print(f"‚öôÔ∏è  Workers: {workers} (ProcessPoolExecutor)")
    print(f"üó∫Ô∏è  Processing {len(sitemap_urls)} sitemap indexes from list")

    # --- Step 1: Processing Logs ---
    print("\nüìñ Step 1: Building the Guest List (Processing Logs)...")

    if not os.path.exists(log_folder_path):
        print(f"‚ùå Folder not found: {log_folder_path}")
        return None

    # Sorted so the batch composition (and the 'first' aggregations below) do
    # not depend on the directory listing order.
    log_files = sorted(f for f in os.listdir(log_folder_path) if f.endswith('.csv'))

    if not log_files:
        print("‚ùå No CSV files found.")
        return None

    def mode_or(series, fallback):
        """Return the most frequent value of `series`, or `fallback` if there is none."""
        modes = series.mode()
        return modes.iloc[0] if len(modes) > 0 else fallback

    all_log_stats = []

    for i in tqdm(range(0, len(log_files), log_batch_size), desc="   Processing Logs"):
        batch_files = log_files[i:i+log_batch_size]
        dfs = []
        for file in batch_files:
            try:
                df = pd.read_csv(os.path.join(log_folder_path, file),
                                 usecols=['request_uri', 'http_user_agent', 'time_iso8601', 'status'],
                                 encoding='utf-8-sig', low_memory=False)
                dfs.append(df)
            except Exception as e:
                # Keep going, but make the skipped file visible.
                print(f"   [warn] Skipping log file {file}: {e}")
                continue

        if dfs:
            batch_df = pd.concat(dfs)
            # Drop the query string so URL variants collapse onto one page.
            batch_df['url'] = 'https://www.alamy.com' + batch_df['request_uri'].str.split('?').str[0].fillna('')
            # utc=True keeps the column a single dtype even when the logged
            # timestamps carry different UTC offsets.
            batch_df['timestamp'] = pd.to_datetime(batch_df['time_iso8601'], errors='coerce', utc=True)

            min_date = batch_df['timestamp'].min()
            max_date = batch_df['timestamp'].max()
            days = (max_date - min_date).days + 1 if pd.notnull(max_date) else 1

            batch_stats = batch_df.groupby('url').agg({
                'request_uri': 'count',
                'http_user_agent': lambda x: mode_or(x, x.iloc[0]),
                'status': lambda x: mode_or(x, 200)
            }).reset_index()
            batch_stats.columns = ['url', 'crawl_count', 'user_agent', 'status_code']
            batch_stats['days_active'] = days

            all_log_stats.append(batch_stats)
            del batch_df, dfs
            gc.collect()

    if not all_log_stats:
        print("‚ùå No log data processed.")
        return None

    print("   ‚îú‚îÄ Finalizing Guest List...")
    # NOTE(review): 'days_active' is the largest time span seen within any
    # single batch, not the span of the whole log set - confirm that this is
    # the intended denominator for crawl_frequency.
    full_log_stats = pd.concat(all_log_stats).groupby('url').agg({
        'crawl_count': 'sum',
        'user_agent': 'first',
        'status_code': 'first',
        'days_active': 'max'
    }).reset_index()

    full_log_stats['crawl_frequency'] = full_log_stats['crawl_count'] / full_log_stats['days_active']
    # NOTE(review): crawl_frequency * days_active equals crawl_count, so this
    # score reduces to crawl_count; kept as-is to preserve the output.
    full_log_stats['authority_score'] = (full_log_stats['crawl_count'] * 0.7 +
                                         full_log_stats['crawl_frequency'] * full_log_stats['days_active'] * 0.3)

    threshold = full_log_stats['crawl_count'].quantile(0.10)
    print(f"   ‚îú‚îÄ Bottom 10% Threshold: <= {threshold} crawls")

    invited_guests = set(full_log_stats['url'])
    wallflowers_df = full_log_stats[full_log_stats['crawl_count'] <= threshold]
    wallflowers = set(wallflowers_df['url'])

    print(f"‚úÖ Guest List Ready: {len(invited_guests):,} total, {len(wallflowers):,} low activity")

    # --- Step 2: Sitemaps ---
    print("\nüì• Step 2: Checking Sitemaps (Multiprocessing)...")

    def parse_sitemap_index(index_url):
        """Return the sitemap file URLs listed in one sitemap index ([] on failure)."""
        try:
            response = requests.get(index_url, timeout=60)
            response.raise_for_status()
            root = ET.fromstring(response.content)
            ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
            return [loc.text for loc in root.findall('sm:sitemap/sm:loc', ns) if loc.text]
        except Exception as e:
            print(f"   [warn] Could not read sitemap index {index_url}: {e}")
            return []

    stale_party_list = []
    total_processed_urls = 0

    def check_at_gate(batch_df):
        """Return the orphan and low-activity rows of one batch of sitemap URLs."""
        batch_df['is_invited'] = batch_df['url'].isin(invited_guests)
        batch_df['is_wallflower'] = batch_df['url'].isin(wallflowers)

        # Orphans have no log data, so their metrics are filled with zeros.
        orphans = batch_df[~batch_df['is_invited']].copy()
        orphans['page_type'] = 'Orphan'
        orphans['crawl_count'] = 0
        orphans['authority_score'] = 0.0
        orphans['crawl_frequency'] = 0.0
        orphans['user_agent'] = 'Not Crawled'
        orphans['status_code'] = 0

        low_act = batch_df[batch_df['is_wallflower']].copy()
        low_act['page_type'] = 'Low Activity'

        if not low_act.empty:
            low_act = low_act.merge(
                full_log_stats[['url', 'crawl_count', 'authority_score', 'crawl_frequency', 'user_agent', 'status_code']],
                on='url', how='left'
            )
            low_act['status_code'] = low_act['status_code'].fillna(200)

        return pd.concat([orphans, low_act])

    def flush_batch(records):
        """Classify the accumulated sitemap records and keep the stale ones."""
        nonlocal total_processed_urls
        batch_df = pd.DataFrame(records)
        total_processed_urls += len(batch_df)
        processed_batch = check_at_gate(batch_df)
        if not processed_batch.empty:
            stale_party_list.append(processed_batch)
        del batch_df
        gc.collect()

    for idx_num, index_url in enumerate(sitemap_urls, 1):
        print(f"\n   ‚îú‚îÄ Index {idx_num}/{len(sitemap_urls)}: {index_url.split('/')[-1]}")
        gz_urls = parse_sitemap_index(index_url)
        if not gz_urls:
            continue
        if max_gz_per_index:
            gz_urls = gz_urls[:max_gz_per_index]

        print(f"   ‚îÇ  ‚îú‚îÄ Spawning {workers} PROCESSES for {len(gz_urls)} files...")

        failed_files = 0
        with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor:
            futures = {executor.submit(parse_gz_sitemap_worker, url): url for url in gz_urls}
            batch_results = []

            for future in tqdm(concurrent.futures.as_completed(futures), total=len(gz_urls), desc="   ‚îÇ  ‚îî‚îÄ Parsing", leave=False):
                try:
                    result = future.result()
                    if result:
                        batch_results.extend(result)
                except Exception:
                    # The file is skipped; the total is reported after the loop.
                    failed_files += 1

                # Classify in chunks so the raw record list never grows unbounded.
                if len(batch_results) > 200000:
                    flush_batch(batch_results)
                    batch_results = []

            if batch_results:
                flush_batch(batch_results)

        if failed_files:
            print(f"   [warn] {failed_files} sitemap file(s) raised an error and were skipped")

    # --- Step 3: Saving ---
    print("\nüíæ Step 3: Consolidating and Saving...")

    if not stale_party_list:
        print("‚ùå No stale pages found.")
        return None

    final_df = pd.concat(stale_party_list, ignore_index=True)

    # Sitemap <lastmod> values may or may not carry a UTC offset. Normalise
    # both sides to timezone-naive UTC; subtracting a naive "now" from
    # timezone-aware values would raise a TypeError.
    current_date = pd.Timestamp.now(tz='UTC').tz_localize(None)
    final_df['last_modified'] = (
        pd.to_datetime(final_df['last_modified'], errors='coerce', utc=True).dt.tz_localize(None)
    )
    final_df['days_since_modified'] = (current_date - final_df['last_modified']).dt.days

    # Older pages get a higher priority; rows without a usable date stay at 0.
    final_df['priority_score'] = 0
    final_df.loc[final_df['days_since_modified'] > 180, 'priority_score'] = 100
    final_df.loc[(final_df['days_since_modified'] > 90) & (final_df['days_since_modified'] <= 180), 'priority_score'] = 70
    final_df.loc[final_df['days_since_modified'] <= 90, 'priority_score'] = 40

    final_df = final_df.sort_values(['page_type', 'priority_score'], ascending=[False, False])

    output_cols = ['url', 'crawl_count', 'authority_score', 'crawl_frequency', 'user_agent',
                   'status_code', 'page_type', 'priority_score', 'days_since_modified', 'last_modified',
                   'image_caption']

    final_cols = [c for c in output_cols if c in final_df.columns]
    final_df = final_df[final_cols]

    total_rows = len(final_df)
    if total_rows <= max_records_per_file:
        output_path = os.path.join(output_dir, 'stale_pages.csv')
        final_df.to_csv(output_path, index=False, encoding='utf-8-sig', quoting=csv.QUOTE_ALL)
        print(f"   ‚îî‚îÄ Saved: {output_path}")
    else:
        # Ceiling division: an exact multiple of max_records_per_file must not
        # produce a trailing file that contains only the header row.
        num_parts = (total_rows + max_records_per_file - 1) // max_records_per_file
        for i in range(num_parts):
            start_idx = i * max_records_per_file
            part_df = final_df.iloc[start_idx:start_idx + max_records_per_file]
            output_path = os.path.join(output_dir, f'stale_pages_part{i+1}.csv')
            part_df.to_csv(output_path, index=False, encoding='utf-8-sig', quoting=csv.QUOTE_ALL)
        print(f"   ‚îî‚îÄ Saved {num_parts} files")

    print("\n" + "="*80)
    print("üìä FINAL STATS")
    print("="*80)
    print(f"‚úÖ Total Analyzed: {total_processed_urls:,}")
    print(f"‚úÖ Stale Found: {len(final_df):,}")
    print(f"   ‚Ä¢ Orphans: {len(final_df[final_df['page_type']=='Orphan']):,}")
    print(f"   ‚Ä¢ Low Activity: {len(final_df[final_df['page_type']=='Low Activity']):,}")
    print(f"üìÅ Output Location: {os.path.abspath(output_dir)}")
    print("="*80)

    return final_df

# ============================================================================
# 3. UI WIDGETS
# ============================================================================
# Folder containing the .csv log files.
log_folder_input = widgets.Text(
    value='', 
    placeholder='D:\\path\\to\\log\\files', 
    description='Log Folder:', 
    layout=widgets.Layout(width='600px')
)

# Text file listing one sitemap-index URL per line.
sitemap_file_input = widgets.Text(
    value='sitemap_list.txt',
    placeholder='Enter path to sitemap.txt file', 
    description='Sitemap List:', 
    layout=widgets.Layout(width='600px')
)

test_mode_checkbox = widgets.Checkbox(value=True, description='Test Mode')
log_batch_input = widgets.IntText(value=5, description='Log Batch:')
# Default worker count: CPU count minus two, at least one. os.cpu_count() may
# return None when the count cannot be determined, hence the `or 1`.
gz_batch_input = widgets.IntText(value=max(1, (os.cpu_count() or 1) - 2), description='Workers (CPU):')

run_button = widgets.Button(description='üöÄ Run Party Bouncer', button_style='success', icon='check', layout=widgets.Layout(width='300px'))
# Captures everything printed while the analysis runs.
output_area = widgets.Output()

def on_run_clicked(b):
    """Run the stale-page detection with the values currently in the widgets.

    Args:
        b: The clicked button, as passed by the widget callback; unused.

    Output of the run is rendered inside ``output_area``, which is cleared
    first. With "Test Mode" ticked, only the first 10 sitemap files of each
    index are processed.
    """
    with output_area:
        clear_output()
        max_gz = 10 if test_mode_checkbox.value else None
        # IntText accepts zero and negative numbers, which would crash the
        # batching loop and the process pool, so clamp both to at least 1.
        log_batch_size = max(1, log_batch_input.value)
        worker_count = max(1, gz_batch_input.value)
        detect_stale_pages_bouncer(
            log_folder_input.value, 
            sitemap_file_input.value,
            max_gz_per_index=max_gz, 
            log_batch_size=log_batch_size, 
            workers=worker_count
        )

# Hook the callback up to the button, then render the whole form as one
# vertical stack: title, inputs, options, run button and the output pane.
run_button.on_click(on_run_clicked)
display(
    widgets.VBox(
        children=[
            widgets.HTML(value="<h2>üï∫ Stale Page Detection: Party Bouncer (Multiprocessing)</h2>"),
            log_folder_input,
            sitemap_file_input,
            widgets.HBox(children=[log_batch_input, gz_batch_input]),
            test_mode_checkbox,
            run_button,
            output_area,
        ]
    )
)


‚úÖ Worker module loaded successfully.


VBox(children=(HTML(value='<h2>üï∫ Stale Page Detection: Party Bouncer (Multiprocessing)</h2>'), Text(value='', ‚Ä¶