In [1]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Libraries imported successfully")


‚úÖ Libraries imported successfully


In [2]:
def analyze_sitemap_index(sitemap_index_url):
    """
    Analyze a sitemap index and list the child sitemap URLs it references.

    Fetches ``sitemap_index_url``, parses it as a sitemaps.org index
    document, prints a summary (entry count, first 10 and last 5 entries)
    and writes the full list to ``sitemap_gz_urls.csv`` in the current
    working directory.

    Parameters
    ----------
    sitemap_index_url : str
        URL of the sitemap index XML document.

    Returns
    -------
    pandas.DataFrame or None
        One row per non-empty ``<sitemap><loc>`` entry with the columns
        ``gz_sitemap_url``, ``index_name`` (last path segment of the index
        URL) and ``estimated_urls`` (constant 10000). ``None`` when the
        request, the XML parsing or any later step fails; the error is
        printed rather than raised.
    """
    print("üîç Fetching sitemap index...")
    print(f"üìç URL: {sitemap_index_url}\n")

    try:
        # Fetch the sitemap index; HTTP error statuses are turned into
        # exceptions so an error page is never parsed as a sitemap.
        response = requests.get(sitemap_index_url, timeout=30)
        response.raise_for_status()

        print(f"‚úÖ Successfully fetched")
        print(f"   ‚Ä¢ Size: {len(response.content):,} bytes")
        print(f"   ‚Ä¢ Content-Type: {response.headers.get('Content-Type')}\n")

        # Parse XML
        root = ET.fromstring(response.content)

        # Sitemap elements live in the sitemaps.org default namespace
        namespace = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

        # Find all sitemap <loc> tags
        sitemap_locs = root.findall('sm:sitemap/sm:loc', namespace)

        # Extract URLs. Pretty-printed sitemaps pad <loc> text with
        # whitespace/newlines, and an empty <loc/> has text None: strip the
        # former and skip the latter so every row is a usable URL.
        gz_urls = [
            loc.text.strip()
            for loc in sitemap_locs
            if loc.text and loc.text.strip()
        ]
        total = len(gz_urls)

        print("="*80)
        print("üìä SITEMAP INDEX ANALYSIS")
        print("="*80)
        print(f"\n‚úÖ Total .gz sitemap files found: {len(gz_urls):,}")
        print(f"üì¶ Estimated URLs per .gz file: ~10,000")
        print(f"üìà Total estimated URLs in this index: ~{len(gz_urls) * 10000:,}")

        # Show samples
        print(f"\nüìã First 10 .gz sitemap URLs:")
        print("-" * 80)
        for i, url in enumerate(gz_urls[:10], 1):
            print(f"{i:4d}. {url}")

        if total > 10:
            print(f"       ... ({len(gz_urls) - 10:,} more) ...")

        print(f"\nüìã Last 5 .gz sitemap URLs:")
        print("-" * 80)
        # The tail holds at most 5 entries; clamp the first label to 1 so an
        # index with fewer than 5 entries is not numbered from 0 or below.
        tail_start = max(1, total - 4)
        for i, url in enumerate(gz_urls[-5:], tail_start):
            print(f"{i:4d}. {url}")

        # Save to DataFrame
        df = pd.DataFrame({'gz_sitemap_url': gz_urls})
        df['index_name'] = sitemap_index_url.split('/')[-1]
        df['estimated_urls'] = 10000

        # Save to CSV
        output_file = 'sitemap_gz_urls.csv'
        df.to_csv(output_file, index=False)

        print(f"\nüíæ Saved all {len(gz_urls):,} .gz URLs to: {output_file}")

        return df

    except requests.exceptions.RequestException as e:
        print(f"‚ùå Network Error: {e}")
        return None
    except ET.ParseError as e:
        print(f"‚ùå XML Parsing Error: {e}")
        return None
    except Exception as e:
        # Deliberate best-effort: report and signal failure with None so the
        # calling cell can guard on the result instead of crashing.
        print(f"‚ùå Unexpected Error: {e}")
        return None


# Run the analysis
# Both names are notebook-level globals: `df_gz_urls` is read again by the
# sampling cell further down, so it must keep this name.
sitemap_index_url = 'https://www.alamy.com/sitemaps/image_daily_index_s_1_10000000.xml'
df_gz_urls = analyze_sitemap_index(sitemap_index_url)

# Display results
# analyze_sitemap_index returns None on any failure (it prints the error
# itself), so only preview the frame when a result actually came back.
if df_gz_urls is not None:
    print(f"\n‚úÖ Analysis complete!")
    # NOTE(review): `display` is not imported in this notebook; presumably the
    # builtin injected by the IPython/Jupyter kernel — confirm if run as a script.
    display(df_gz_urls.head(10))


üîç Fetching sitemap index...
üìç URL: https://www.alamy.com/sitemaps/image_daily_index_s_1_10000000.xml

‚úÖ Successfully fetched
   ‚Ä¢ Size: 148,403 bytes
   ‚Ä¢ Content-Type: text/xml

üìä SITEMAP INDEX ANALYSIS

‚úÖ Total .gz sitemap files found: 990
üì¶ Estimated URLs per .gz file: ~10,000
üìà Total estimated URLs in this index: ~9,900,000

üìã First 10 .gz sitemap URLs:
--------------------------------------------------------------------------------
   1. https://www.alamy.com/sitemaps/image_daily_sitemap_s_1960001_1970000.xml.gz
   2. https://www.alamy.com/sitemaps/image_daily_sitemap_s_5180001_5190000.xml.gz
   3. https://www.alamy.com/sitemaps/image_daily_sitemap_s_1080001_1090000.xml.gz
   4. https://www.alamy.com/sitemaps/image_daily_sitemap_s_1050001_1060000.xml.gz
   5. https://www.alamy.com/sitemaps/image_daily_sitemap_s_6870001_6880000.xml.gz
   6. https://www.alamy.com/sitemaps/image_daily_sitemap_s_7310001_7320000.xml.gz
   7. https://www.alamy.com/sitemaps/imag

Unnamed: 0,gz_sitemap_url,index_name,estimated_urls
0,https://www.alamy.com/sitemaps/image_daily_sit...,image_daily_index_s_1_10000000.xml,10000
1,https://www.alamy.com/sitemaps/image_daily_sit...,image_daily_index_s_1_10000000.xml,10000
2,https://www.alamy.com/sitemaps/image_daily_sit...,image_daily_index_s_1_10000000.xml,10000
3,https://www.alamy.com/sitemaps/image_daily_sit...,image_daily_index_s_1_10000000.xml,10000
4,https://www.alamy.com/sitemaps/image_daily_sit...,image_daily_index_s_1_10000000.xml,10000
5,https://www.alamy.com/sitemaps/image_daily_sit...,image_daily_index_s_1_10000000.xml,10000
6,https://www.alamy.com/sitemaps/image_daily_sit...,image_daily_index_s_1_10000000.xml,10000
7,https://www.alamy.com/sitemaps/image_daily_sit...,image_daily_index_s_1_10000000.xml,10000
8,https://www.alamy.com/sitemaps/image_daily_sit...,image_daily_index_s_1_10000000.xml,10000
9,https://www.alamy.com/sitemaps/image_daily_sit...,image_daily_index_s_1_10000000.xml,10000


In [3]:
import gzip
from io import BytesIO

def sample_gz_sitemap(gz_url, max_urls=100):
    """
    Download one child sitemap and extract a sample of its ``<url>`` entries.

    Parameters
    ----------
    gz_url : str
        URL of a (normally gzip-compressed) sitemap file.
    max_urls : int or None, default 100
        Maximum number of ``<url>`` entries to extract, taken in document
        order. Values ``<= 0`` yield an empty sample; ``None`` extracts every
        entry.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``page_url`` (``<loc>``), ``image_caption`` (caption of the
        first ``<image:image>`` child, if any) and ``last_modified``
        (``<lastmod>``); a missing element is stored as a missing value. The
        frame always has these three columns, even when no entry was found.
        ``None`` when the download, decompression or parsing fails; the error
        is printed rather than raised.
    """
    print(f"üîç Sampling: {gz_url.split('/')[-1]}\n")

    try:
        # The whole body is needed before it can be decompressed, so there is
        # nothing to gain from a streamed request.
        response = requests.get(gz_url, timeout=60)
        response.raise_for_status()
        payload = response.content

        print(f"‚úÖ Downloaded ({len(payload):,} bytes)")

        # Gzip streams start with the magic bytes 1f 8b. If the payload does
        # not (e.g. the HTTP layer already decoded it), treat it as plain XML
        # instead of failing.
        if payload[:2] == b'\x1f\x8b':
            xml_content = gzip.decompress(payload)
        else:
            xml_content = payload

        print(f"‚úÖ Decompressed ({len(xml_content):,} bytes)")

        # Parse XML
        root = ET.fromstring(xml_content)

        ns = {
            'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9',
            'image': 'http://www.google.com/schemas/sitemap-image/1.1'
        }

        def _child_text(parent, path):
            """Return the text of the first child matching ``path``, else None."""
            child = parent.find(path, ns)
            return child.text if child is not None else None

        # Apply the limit before extracting so that max_urls <= 0 really
        # yields no rows (checking after the append always kept one).
        limit = None if max_urls is None else max(0, max_urls)

        urls = []
        for url_elem in root.findall('sm:url', ns)[:limit]:
            image_elem = url_elem.find('image:image', ns)
            caption = (
                _child_text(image_elem, 'image:caption')
                if image_elem is not None
                else None
            )
            urls.append({
                'page_url': _child_text(url_elem, 'sm:loc'),
                'image_caption': caption,
                'last_modified': _child_text(url_elem, 'sm:lastmod')
            })

        # Explicit columns keep the schema stable for an empty sample; without
        # them the summary below fails with KeyError on a column-less frame.
        df = pd.DataFrame(
            urls, columns=['page_url', 'image_caption', 'last_modified']
        )

        # <lastmod> is optional: leave missing values out of the range so
        # min/max only ever compare date strings.
        dates = df['last_modified'].dropna()
        if dates.empty:
            date_min = date_max = None
        else:
            date_min, date_max = dates.min(), dates.max()

        print(f"\nüìä Extracted {len(df):,} sample URLs")
        print(f"   ‚Ä¢ With captions: {df['image_caption'].notna().sum():,}")
        print(f"   ‚Ä¢ Date range: {date_min} to {date_max}")

        return df

    except Exception as e:
        # Deliberate best-effort for an exploratory notebook: report the
        # problem and return None so the calling cell can guard on it.
        print(f"‚ùå Error: {e}")
        return None


# Test with first .gz file from the index
# `df_gz_urls` comes from the index-analysis cell above and is None when that
# analysis failed, so check for both a result and at least one row.
if df_gz_urls is not None and len(df_gz_urls) > 0:
    # Positional access: row 0 is the first <loc> listed in the index document.
    first_gz_url = df_gz_urls.iloc[0]['gz_sitemap_url']
    sample_df = sample_gz_sitemap(first_gz_url, max_urls=100)
    
    # sample_gz_sitemap prints its own error and returns None on failure.
    if sample_df is not None:
        display(sample_df.head(10))


üîç Sampling: image_daily_sitemap_s_1960001_1970000.xml.gz

‚úÖ Downloaded (50,758 bytes)
‚úÖ Decompressed (618,916 bytes)

üìä Extracted 100 sample URLs
   ‚Ä¢ With captions: 100
   ‚Ä¢ Date range: 2022-07-26 to 2025-10-27


Unnamed: 0,page_url,image_caption,last_modified
0,https://www.alamy.com/adare-village-county-lim...,animal drawn wheeled vehicle sits on the stree...,2022-07-27
1,https://www.alamy.com/rural-irish-scenic-town-...,"inland river makes its way to the sea, river p...",2022-07-28
2,https://www.alamy.com/rocky-peninsula-juts-out...,"to the waters and the wild, an ocean apart, ti...",2022-07-26
3,https://www.alamy.com/storm-atlantic-waves-cra...,"natures way, between a rock and a hard place, ...",2022-07-28
4,https://www.alamy.com/large-atlantic-waves-bre...,"wave crest, to the waters and the wild, an oce...",2022-07-29
5,https://www.alamy.com/deep-blue-atlantic-ocean...,"between the devil and the deep blue sea, an oc...",2022-07-28
6,https://www.alamy.com/colorful-boats-line-the-...,small lake fishing boats tied up on a lakeside...,2022-07-28
7,https://www.alamy.com/seven-and-a-half-mile-lo...,"brandon bay, dingle peninsula, foot prints on ...",2022-07-26
8,https://www.alamy.com/sun-setting-on-irelands-...,"to the waters and the wild, between a rock and...",2022-07-27
9,https://www.alamy.com/early-morning-sunrise-ov...,"beara peninsula, to the waters and the wild, a...",2022-07-28


In [8]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Complete list of Alamy sitemap index URLs
# Most entries follow image_daily_index_s_<start>_<end>.xml in 10,000,000-wide
# ranges, but the list as written is not a gap-free sequence:
#   - 180000001_190000000 appears twice (prefixes `s` and `sn`) and there is no
#     190000001_200000000 entry;
#   - there are no entries for 280000001_290000000 or 290000001_300000000;
#   - 300000001_310000000 and 310000001_320000000 use the `si` prefix;
#   - the last four use `s_a` / `s_b` / `s_c` / `s_x` with a zero-padded
#     020000001_030000000 range.
# NOTE(review): presumably these mirror the names published by the site —
# confirm against the live sitemap before treating the list as complete.
sitemap_index_urls = [
    'https://www.alamy.com/sitemaps/image_daily_index_s_1_10000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_10000001_20000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_20000001_30000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_30000001_40000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_40000001_50000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_50000001_60000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_60000001_70000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_70000001_80000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_80000001_90000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_90000001_100000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_100000001_110000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_110000001_120000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_120000001_130000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_130000001_140000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_140000001_150000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_150000001_160000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_160000001_170000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_170000001_180000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_180000001_190000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_sn_180000001_190000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_200000001_210000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_210000001_220000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_220000001_230000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_230000001_240000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_240000001_250000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_250000001_260000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_260000001_270000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_270000001_280000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_si_300000001_310000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_si_310000001_320000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_320000001_330000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_330000001_340000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_340000001_350000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_350000001_360000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_360000001_370000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_370000001_380000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_380000001_390000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_390000001_400000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_400000001_410000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_410000001_420000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_420000001_430000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_430000001_440000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_440000001_450000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_450000001_460000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_460000001_470000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_470000001_480000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_480000001_490000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_490000001_500000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_500000001_510000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_510000001_520000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_520000001_530000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_530000001_540000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_540000001_550000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_550000001_560000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_560000001_570000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_570000001_580000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_580000001_590000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_590000001_600000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_600000001_610000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_610000001_620000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_620000001_630000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_630000001_640000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_640000001_650000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_650000001_660000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_660000001_670000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_670000001_680000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_680000001_690000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_690000001_700000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_700000001_710000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_a_020000001_030000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_b_020000001_030000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_c_020000001_030000000.xml',
    'https://www.alamy.com/sitemaps/image_daily_index_s_x_020000001_030000000.xml',
]

def count_all_sitemaps(index_urls, urls_per_file=10000):
    """
    Count the child sitemap entries referenced by each sitemap index.

    Parameters
    ----------
    index_urls : iterable of str
        URLs of sitemap index XML documents.
    urls_per_file : int, default 10000
        Assumed number of page URLs per child sitemap, used only to derive
        the ``estimated_urls`` column.

    Returns
    -------
    pandas.DataFrame
        One row per input URL, in input order, with the columns
        ``index_url``, ``index_name`` (last path segment), ``gz_file_count``
        (number of ``<sitemap><loc>`` entries) and ``estimated_urls``
        (``gz_file_count * urls_per_file``). An index that cannot be fetched
        or parsed is reported on stdout and recorded with a count of 0. The
        four columns are present even when ``index_urls`` is empty.
    """
    # Sitemap elements live in the sitemaps.org default namespace.
    ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    results = []

    for index_url in tqdm(index_urls, desc="Processing indexes"):
        index_name = index_url.split('/')[-1]

        try:
            response = requests.get(index_url, timeout=30)
            # Turn HTTP error statuses into exceptions so an error response
            # is never parsed and counted as if it were the index.
            response.raise_for_status()
            root = ET.fromstring(response.content)
            gz_count = len(root.findall('sm:sitemap/sm:loc', ns))
        except Exception as e:
            # Deliberate best-effort: one bad index must not abort the whole
            # sweep, so report it and record a zero count.
            print(f"\n‚ö†Ô∏è Error with {index_name}: {e}")
            gz_count = 0

        results.append({
            'index_url': index_url,
            'index_name': index_name,
            'gz_file_count': gz_count,
            'estimated_urls': gz_count * urls_per_file
        })

    # Explicit columns keep the schema stable when no index was processed.
    return pd.DataFrame(
        results,
        columns=['index_url', 'index_name', 'gz_file_count', 'estimated_urls']
    )


# Print basic info
print(f"üìä Total sitemap indexes: {len(sitemap_index_urls)}")
# Substring filter on 'image'; every URL in the list above contains
# 'image_daily_index', so this currently equals the total.
print(f"   ‚Ä¢ Image indexes: {len([url for url in sitemap_index_urls if 'image' in url])}")
# Back-of-envelope figure computed before any request is made: it assumes
# 10,000,000 URLs per index. The per-index estimate is printed after the sweep.
print(f"   ‚Ä¢ Estimated total URLs: ~{len(sitemap_index_urls) * 10_000_000:,}")

# Run analysis
# One HTTP request per index; failures are recorded as zero counts by
# count_all_sitemaps rather than raised.
print("\nüîç Analyzing all sitemap indexes...\n")
df_all_indexes = count_all_sitemaps(sitemap_index_urls)

# Display results
print("\n" + "="*80)
print("üìä ANALYSIS COMPLETE")
print("="*80)
print(f"   ‚Ä¢ Total indexes processed: {len(df_all_indexes)}")
print(f"   ‚Ä¢ Total .gz files: {df_all_indexes['gz_file_count'].sum():,}")
print(f"   ‚Ä¢ Total estimated URLs: {df_all_indexes['estimated_urls'].sum():,}")
print("="*80)

# Display the dataframe
# NOTE(review): `display` is not imported in this notebook; presumably the
# builtin injected by the IPython/Jupyter kernel — confirm if run as a script.
display(df_all_indexes)

# Save to CSV
# Written relative to the current working directory, without the index column.
df_all_indexes.to_csv('sitemap_index_analysis.csv', index=False)
print("\nüíæ Saved analysis to: sitemap_index_analysis.csv")


üìä Total sitemap indexes: 73
   ‚Ä¢ Image indexes: 73
   ‚Ä¢ Estimated total URLs: ~730,000,000

üîç Analyzing all sitemap indexes...



Processing indexes: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 73/73 [04:02<00:00,  3.33s/it]


üìä ANALYSIS COMPLETE
   ‚Ä¢ Total indexes processed: 73
   ‚Ä¢ Total .gz files: 67,502
   ‚Ä¢ Total estimated URLs: 675,020,000





Unnamed: 0,index_url,index_name,gz_file_count,estimated_urls
0,https://www.alamy.com/sitemaps/image_daily_ind...,image_daily_index_s_1_10000000.xml,990,9900000
1,https://www.alamy.com/sitemaps/image_daily_ind...,image_daily_index_s_10000001_20000000.xml,1000,10000000
2,https://www.alamy.com/sitemaps/image_daily_ind...,image_daily_index_s_20000001_30000000.xml,1000,10000000
3,https://www.alamy.com/sitemaps/image_daily_ind...,image_daily_index_s_30000001_40000000.xml,1000,10000000
4,https://www.alamy.com/sitemaps/image_daily_ind...,image_daily_index_s_40000001_50000000.xml,764,7640000
...,...,...,...,...
68,https://www.alamy.com/sitemaps/image_daily_ind...,image_daily_index_s_700000001_710000000.xml,1000,10000000
69,https://www.alamy.com/sitemaps/image_daily_ind...,image_daily_index_s_a_020000001_030000000.xml,1000,10000000
70,https://www.alamy.com/sitemaps/image_daily_ind...,image_daily_index_s_b_020000001_030000000.xml,999,9990000
71,https://www.alamy.com/sitemaps/image_daily_ind...,image_daily_index_s_c_020000001_030000000.xml,1000,10000000



üíæ Saved analysis to: sitemap_index_analysis.csv
