In [1]:
###
# @Author             : Monserrat López
# @Date               : 2025-03-12
# @Last Modified Date : 2025-04-21
# @Description        : Scraping pipeline for EU data centers from public sources.
# @Note               : This script is intended for academic research purposes only. 
#                       Some original raw data collected during the research is not included in this repository for confidentiality reasons.
###

In [3]:
# Imports 
import json
import pandas as pd
import csv
import os

In [6]:
# File paths (relative, so they resolve against the notebook's working directory)
CACHE_FILE = "../cache/scrape_cache.json"  # JSON scrape cache loaded by the next cell
INPUT_CSV = "../output/02european_datacenters.csv"  # input list; must contain a "Datacenter URL" column
OUTPUT_CSV = "../output/03data_centers_output.csv"  # destination of the filtered export

In [7]:
# Load cache (a JSON object whose keys are datacenter URLs)
with open(CACHE_FILE, "r", encoding="utf-8") as f:
    cache = json.load(f)

# Load european_datacenters.csv
df_input = pd.read_csv(INPUT_CSV)

# Ensure column exists in input CSV
if "Datacenter URL" not in df_input.columns:
    raise ValueError("ERROR: 'Datacenter URL' column not found in european_datacenters.csv")

# Set of URLs from european_datacenters.csv (kept as a set for membership tests)
urls_in_csv = set(df_input["Datacenter URL"])

# Filter cache to only keep matching URLs.
# Iterate in input-file order (first occurrence of each URL) instead of over the
# set: set iteration order for strings changes between interpreter runs, which
# made the row order of the exported CSV differ on every execution.
ordered_urls = df_input["Datacenter URL"].drop_duplicates()
filtered_data = [cache[url] for url in ordered_urls if url in cache]

print(f"Cache contains {len(cache)} entries, but only {len(filtered_data)} match the input CSV.")

# Convert to DataFrame and replace newlines with spaces for CSV safety.
# A new frame is produced (no inplace mutation) so re-running the cell is idempotent.
df_filtered = pd.DataFrame(filtered_data).replace("\n", " ", regex=True)

# Save to CSV
df_filtered.to_csv(OUTPUT_CSV, index=False, quoting=csv.QUOTE_ALL)

print("\nFiltered CSV saved with", len(df_filtered), "rows matching european_datacenters.csv.")


Cache contains 1795 entries, but only 1795 match the input CSV.

Filtered CSV saved with 1795 rows matching european_datacenters.csv.


In [11]:
# Number of rows per "Country" value, largest count first.
# NOTE(review): the output below shows "Netherlands" capitalised while every other
# country is a lowercase slug - confirm whether the input data should be normalised.
df_filtered["Country"].value_counts().sort_values(ascending=False)

Country
germany           377
france            238
Netherlands       163
italy             141
spain             133
ireland            89
sweden             88
poland             77
romania            58
denmark            58
belgium            43
finland            42
austria            41
czech-republic     39
portugal           35
bulgaria           26
greece             18
slovenia           17
croatia            16
lithuania          16
cyprus             15
latvia             15
hungary            12
luxembourg         12
slovakia           10
malta               8
estonia             8
Name: count, dtype: int64

### Scraper

In [8]:
# Scraping datacenters
import time
import random
import json
import os
import re
import pandas as pd
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# --- USER-AGENT ROTATION ---
# Pool of User-Agent strings; BrowserManager.create_new_driver picks one at random
# for every browser session it starts.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:93.0) Gecko/20100101 Firefox/93.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36 Edg/94.0.992.47",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
]

# Common block detection patterns
# Substrings looked up in the lowercased page source by BrowserManager.check_for_blocks,
# so every entry must itself be lowercase to be able to match.
BLOCK_PATTERNS = [
    "activity has been limited",
    "access denied",
    "too many requests",
    "rate limit exceeded",
    "you have been blocked",
    "detected unusual activity",
    "captcha",
    "your request has been blocked",
    "please wait before trying again"
]

# Persistent files used by the scraper (relative paths).
# CACHE_FILE and OUTPUT_CSV repeat the values assigned in the earlier "File paths" cell.
CACHE_FILE = "../cache/scrape_cache.json"  # default file for load_cache_from_disk / save_cache_to_disk
OUTPUT_CSV = "../output/03data_centers_output.csv"  # default target of update_main_csv / rebuild_csv_from_cache
RATE_LIMIT_LOG = "../cache/rate_limit_log.json"  # default RateLimiter statistics file

# ============== LOADING AND SAVING CACHE ==================
def load_cache_from_disk(filename=CACHE_FILE):
    """Read the scrape cache from ``filename``.

    Parameters:
        filename: Path of the JSON cache file (defaults to CACHE_FILE).

    Returns:
        The parsed JSON content, or an empty dict when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid JSON.
    """
    # Guard clause: a missing file simply means nothing has been cached yet.
    if not os.path.exists(filename):
        return {}

    with open(filename, "r", encoding="utf-8") as cache_file:
        return json.load(cache_file)

def save_cache_to_disk(cache_dict, filename=CACHE_FILE):
    """Write ``cache_dict`` to ``filename`` as pretty-printed UTF-8 JSON.

    The data is first written to a sibling ``<filename>.tmp`` file which is then
    moved over the target with ``os.replace``. An interruption or serialisation
    error in the middle of a save therefore leaves the previous cache file
    intact instead of a truncated one.

    Parameters:
        cache_dict: JSON-serialisable object to persist.
        filename: Destination path (defaults to CACHE_FILE).

    Raises:
        TypeError / ValueError: If ``cache_dict`` cannot be serialised to JSON.
        OSError: If the file cannot be written.
    """
    # Ensure directory exists; a bare file name has no directory component and
    # os.makedirs("") would raise FileNotFoundError.
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)

    tmp_filename = f"{filename}.tmp"
    try:
        with open(tmp_filename, "w", encoding="utf-8") as f:
            json.dump(cache_dict, f, ensure_ascii=False, indent=2)
        os.replace(tmp_filename, filename)
    except BaseException:
        # Do not leave a half-written temp file behind, then re-raise unchanged.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
        raise

# ============== ENHANCED RATE LIMITER WITH LONGER DELAYS =====================
class RateLimiter:
    """Adaptive per-domain delay between requests, persisted to a JSON log file.

    Usage pattern (as in scrape_datacenter):
        1. call ``wait(url)`` before a request; it sleeps as needed and returns the
           domain's current streak of consecutive errors;
        2. call ``report_success(url)`` or ``report_error(url)`` once the outcome of
           the request is known.

    Throttling and outcome recording are kept separate. Previously ``wait(url)``
    recorded a success before the request was even made (which reset the error
    streak, so a caller's "too many consecutive errors" check could never fire)
    and ``report_*`` slept a second time and counted every request twice.
    """

    def __init__(self, log_file=RATE_LIMIT_LOG):
        self.log_file = log_file
        self.domain_stats = self._load_stats()
        self.base_delay = 30          # initial delay (seconds) for a new domain
        self.max_delay = 180          # upper bound for the backed-off delay
        self.backoff_factor = 1.5     # delay multiplier applied after an error
        self.success_reduction = 0.9  # delay multiplier applied after every 3rd success
        self.min_delay = 20           # lower bound for the reduced delay

    def _load_stats(self):
        """Return the persisted per-domain statistics, or {} if missing/unreadable."""
        if os.path.exists(self.log_file):
            try:
                with open(self.log_file, 'r') as f:
                    stats = json.load(f)
            except json.JSONDecodeError:
                return {}
            # Anything other than a JSON object cannot be used as the stats mapping.
            return stats if isinstance(stats, dict) else {}
        return {}

    def _save_stats(self):
        """Persist the per-domain statistics to ``self.log_file``."""
        directory = os.path.dirname(self.log_file)
        if directory:  # os.makedirs("") raises for a bare file name
            os.makedirs(directory, exist_ok=True)
        with open(self.log_file, 'w') as f:
            json.dump(self.domain_stats, f, indent=2)

    def get_domain(self, url):
        """Return the host part of ``url`` without a leading "www.", or "unknown"."""
        match = re.search(r'https?://(?:www\.)?([^/]+)', url)
        if match:
            return match.group(1)
        return "unknown"

    def _stats_for(self, domain):
        """Return the statistics dict of ``domain``, creating it on first use."""
        return self.domain_stats.setdefault(domain, {
            "current_delay": self.base_delay,
            "last_access": 0,
            "consecutive_errors": 0,
            "total_requests": 0,
            "successful_requests": 0
        })

    def _record_outcome(self, stats, success):
        """Update streak counters and the adaptive delay for one request outcome."""
        if success:
            stats["consecutive_errors"] = 0
            stats["successful_requests"] += 1
            # Only reduce delay after multiple successes (every 3rd), never below min_delay
            if stats["successful_requests"] % 3 == 0:
                stats["current_delay"] = max(self.min_delay, stats["current_delay"] * self.success_reduction)
        else:
            # If error, increase backoff up to max_delay
            stats["consecutive_errors"] += 1
            stats["current_delay"] = min(self.max_delay, stats["current_delay"] * self.backoff_factor)

    def wait(self, url, success=None):
        """Sleep until the domain of ``url`` may be requested again.

        Parameters:
            url: URL about to be requested.
            success: ``None`` (default) only throttles. ``True``/``False`` additionally
                records that outcome before sleeping, which is the previous combined
                behaviour and is kept for callers that pass the flag explicitly.

        Returns:
            The domain's current number of consecutive errors.
        """
        domain = self.get_domain(url)
        stats = self._stats_for(domain)
        stats["total_requests"] += 1

        # Calculate wait time from the delay in force before any update below
        now = time.time()
        time_since_last = now - stats["last_access"]
        required_wait = max(0, stats["current_delay"] - time_since_last)

        # The streak is persisted, so without a reset a domain that reached a caller's
        # abort threshold would stay locked out forever (no request -> no success).
        # After an idle period longer than max_delay the streak is considered stale;
        # the backed-off current_delay itself is kept.
        if time_since_last > self.max_delay:
            stats["consecutive_errors"] = 0

        if success is not None:
            self._record_outcome(stats, success)

        # Variable jitter to avoid patterns (-15% to +25%)
        jitter = random.uniform(0.85, 1.25)
        actual_wait = required_wait * jitter

        # Occasionally add an extra "human pause" (10% chance, 10-30 seconds)
        if random.random() < 0.1:
            extra_pause = random.randint(10, 30)
            print(f"[HUMAN PAUSE] Adding extra {extra_pause}s pause to seem more human-like")
            actual_wait += extra_pause

        print(f"[RATE LIMIT] {domain}: Waiting {actual_wait:.1f}s "
              f"(base delay: {stats['current_delay']:.1f}s, consecutive errors: {stats['consecutive_errors']})")

        # Actually wait
        time.sleep(actual_wait)

        # Update last access time after waiting
        stats["last_access"] = time.time()
        self._save_stats()

        return stats["consecutive_errors"]

    def report_error(self, url):
        """Record a failed request for ``url``: extend the streak and back off (no sleep)."""
        self._record_outcome(self._stats_for(self.get_domain(url)), success=False)
        self._save_stats()

    def report_success(self, url):
        """Record a successful request for ``url``: clear the streak, maybe shorten the delay (no sleep)."""
        self._record_outcome(self._stats_for(self.get_domain(url)), success=True)
        self._save_stats()

# ============== IMPROVED BROWSER MANAGEMENT =====================
class BrowserManager:
    """Owns one headless Chrome WebDriver and replaces it after a fixed number of uses.

    Parameters:
        max_requests_per_browser: Number of ``get_driver`` calls one browser
            instance serves before a fresh one is created.
    """

    def __init__(self, max_requests_per_browser=3):
        self.max_requests = max_requests_per_browser
        self.current_requests = 0
        self.driver = None
        self.create_new_driver()

    def _quit_driver(self):
        """Quit the current driver (best effort) and drop the reference to it."""
        if self.driver:
            try:
                self.driver.quit()
            except Exception as e:
                # Closing is best effort, but no longer swallows KeyboardInterrupt/SystemExit.
                print(f"[BROWSER] Ignoring error while closing browser: {e}")
        # Never keep a reference to a driver that has been quit.
        self.driver = None

    def create_new_driver(self):
        """Close any existing browser, start a new headless Chrome and return it.

        The new session gets a random User-Agent from USER_AGENTS, a random window
        size and a random ``--lang`` value, and ``current_requests`` is reset to 0.
        """
        self._quit_driver()

        # Rotate user agents
        chosen_ua = random.choice(USER_AGENTS)
        print(f"Using User-Agent: {chosen_ua}")

        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument(f"user-agent={chosen_ua}")
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")

        # Use a random window size for more variation
        width = random.randint(1280, 1920)
        height = random.randint(800, 1080)
        chrome_options.add_argument(f"--window-size={width},{height}")

        # Automation-related switches
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option("useAutomationExtension", False)

        # Add random language preference
        languages = ["en-US,en;q=0.9", "en-GB,en;q=0.9", "en-CA,en;q=0.9,fr-CA;q=0.8", "en;q=0.9,de;q=0.8"]
        chrome_options.add_argument(f"--lang={random.choice(languages)}")

        # Create new driver
        self.driver = webdriver.Chrome(options=chrome_options)

        # Override navigator/window properties in the document that is currently loaded
        self.driver.execute_script("""
            Object.defineProperty(navigator, 'webdriver', {get: () => undefined});

            // Overwrite the 'plugins' property to have fake plugins
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5]
            });

            // Overwrite the 'languages' property
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en', 'es']
            });

            // Add Chrome-specific objects
            window.chrome = {
                runtime: {},
                loadTimes: function() {},
                csi: function() {},
                app: {}
            };
        """)

        self.current_requests = 0
        return self.driver

    def get_driver(self):
        """Return the driver to use for the next request, rotating the browser when due.

        The rotation check happens before the request is counted, so every browser
        (including the first one) serves exactly ``max_requests`` calls. Previously
        the counter was incremented first, which rotated one call too early for the
        first browser and left the first call on each new browser uncounted.
        """
        if self.current_requests >= self.max_requests:
            print(f"[BROWSER] Rotating browser after {self.current_requests} requests")
            self.create_new_driver()
        self.current_requests += 1
        return self.driver

    def randomize_scrolling(self):
        """Scroll the current page down in 3-7 random steps with random pauses.

        With 40% probability the page is scrolled back up a little at the end.
        Any failure is logged and otherwise ignored.
        """
        driver = self.driver
        try:
            # Random number of scrolls
            scroll_count = random.randint(3, 7)

            # Perform random scrolling with pauses
            for _ in range(scroll_count):
                # Random scroll amount
                scroll_y = random.randint(200, 500)
                driver.execute_script(f"window.scrollBy(0, {scroll_y});")

                # Random pause between scrolls
                time.sleep(random.uniform(0.5, 2.0))

                # Sometimes pause a bit longer as if reading (30% chance)
                if random.random() < 0.3:
                    time.sleep(random.uniform(1.0, 3.0))

            # Occasionally scroll back up a bit (40% chance)
            if random.random() < 0.4:
                scroll_up = random.randint(100, 300)
                driver.execute_script(f"window.scrollBy(0, -{scroll_up});")
                time.sleep(random.uniform(0.5, 1.5))

            print("[HUMAN] Performed random human-like scrolling")
        except Exception as e:
            print(f"[SCROLL ERROR] Failed to perform scrolling: {e}")

    def close(self):
        """Quit the browser, if any; errors while quitting are logged, not raised."""
        self._quit_driver()

    def check_for_blocks(self, url):
        """Check if the currently loaded page shows signs of being blocked.

        Returns True (after sleeping) when a BLOCK_PATTERNS entry occurs in the page
        source (30 minute sleep, then a new browser session) or when the source of a
        "datacentermap" URL is shorter than 1000 characters (5 minute sleep).
        Returns False otherwise, including when the check itself fails.
        """
        try:
            # Get the page source and convert to lowercase
            page_source = self.driver.page_source.lower()

            # Check for common block messages
            for pattern in BLOCK_PATTERNS:
                if pattern in page_source:
                    print(f"[BLOCK DETECTED] '{pattern}' found on {url}. Sleeping 30 minutes...")
                    time.sleep(1800)  # 30 minutes
                    self.create_new_driver()  # Create a new browser session after waiting
                    return True

            # Check for suspiciously short content (often a sign of being blocked)
            if len(page_source) < 1000 and "datacentermap" in url:
                print(f"[SUSPICIOUS] Page content unusually short ({len(page_source)} chars). Possible block on {url}")
                # Take a shorter pause for suspicious but not confirmed blocks
                time.sleep(300)  # 5 minutes
                return True

            return False
        except Exception as e:
            print(f"[ERROR] Error checking for blocks: {e}")
            return False

# ============== OPTIMIZED DESCRIPTION EXTRACTION WITH KEYWORDS ============
def _clean_description(text):
    """Strip known page boilerplate from ``text`` and collapse all whitespace."""
    # Sections removed up to the end of the text (or up to a closing marker)
    text = re.sub(r"© Mapbox.*?Improve this map", "", text, flags=re.DOTALL)
    text = re.sub(r"PRICING & SERVICES.*", "", text, flags=re.DOTALL)
    text = re.sub(r"ADVERTISEMENT.*", "", text, flags=re.DOTALL)
    text = re.sub(r"NEAREST DATA CENTERS.*", "", text, flags=re.DOTALL)

    # Navigation / product labels that appear inline
    text = re.sub(r"Suites\s+Cages\s+Private Cabinets\s+Partial Cabinets\s+Remote Hands", "", text)
    text = re.sub(r"Overview\s+Specs\s+Ecosystem\s+Location\s+Request Quote", "", text)
    text = re.sub(r"Data Centers\s+\d+", "", text)

    # Remove excessive whitespace
    return re.sub(r"\s+", " ", text).strip()


def extract_description(driver):
    """Keyword-based description extraction from the page loaded in ``driver``.

    Strategy 1 takes the text of the first "ui container" element that contains
    one of the keywords. If none matches, strategy 2 takes the longest container
    text above 100 characters that has no pricing/advertisement markers.
    The chosen text is then cleaned of boilerplate.

    Parameters:
        driver: Object exposing Selenium's ``find_elements(by, value)``.

    Returns:
        The cleaned description, or "Not Found" when nothing usable was found.
        This includes the case where cleaning leaves an empty string, which was
        previously returned as "" and therefore not recognised as missing by code
        that compares against the "Not Found" sentinel.
    """
    description = "Not Found"
    keywords = ("Purpose-Built", "Data Centre", "Facility", "Campus", "Center",
                "Infrastructure", "Cloud", "Colocation")

    # Strategy 1: Look for containers with specific keywords
    try:
        for div in driver.find_elements(By.CLASS_NAME, "ui.container"):
            txt = div.text.strip()
            if any(kw in txt for kw in keywords):
                description = txt
                break
    except Exception as e:
        print(f"[KEYWORD SEARCH ERROR] {e}")

    # Strategy 2: Fall back to the longest substantial text block
    if description == "Not Found":
        try:
            all_divs = driver.find_elements(By.XPATH, "//div[contains(@class, 'ui container')]")
            found_text = []
            for div in all_divs:
                text = div.text.strip()
                # Look for sections with substantial text
                if len(text) > 100 and not re.search(r'PRICING & SERVICES|ADVERTISEMENT|Request Quote', text):
                    found_text.append(text)

            if found_text:
                # Use the longest text block found
                description = max(found_text, key=len)
        except Exception as e:
            print(f"[GENERAL SEARCH ERROR] {e}")

    # Clean up; text that consisted only of boilerplate counts as not found
    if description != "Not Found":
        description = _clean_description(description) or "Not Found"

    return description

# ============== STREAMLINED SCRAPING LOGIC WITH BLOCK DETECTION ============
def scrape_datacenter(url, browser_manager, rate_limiter, need_description=True, need_specs=True, existing_data=None):
    """
    Scrapes a single data center page with block detection.

    Parameters:
        url: The URL to scrape
        browser_manager: Browser manager instance
        rate_limiter: Rate limiter instance
        need_description: Whether to scrape the main page for description (default True)
        need_specs: Whether to scrape the specs page (default True)
        existing_data: Existing data to start with, if any (default None)

    Returns:
        A dict (a copy of ``existing_data`` or a new one) with the scraped fields
        and a "Status" key. "Status" is "Success" only when every requested page was
        processed. A blocked, failed or aborted specs page is now reflected in
        "Status" as well (previously it was overwritten with "Success", so callers
        inspecting "Status" never saw blocks on the specs page).
    """
    # Start with existing data if provided, otherwise create new data object
    data = existing_data.copy() if existing_data else {"URL": url}
    driver = browser_manager.get_driver()
    status = "Success"

    try:
        print(f"\n--- Scraping {url} ---")

        # Only scrape main page if we need description or it's a new record
        if need_description:
            # Get main page with rate limiting
            consecutive_errors = rate_limiter.wait(url)
            if consecutive_errors >= 8:
                print(f"[ABORT] Too many consecutive errors for this domain. Skipping {url}")
                data["Status"] = "Error: Too many consecutive errors"
                return data

            try:
                driver.get(url)
                time.sleep(random.uniform(2, 4))  # Variable wait time

                # Check for blocks
                if browser_manager.check_for_blocks(url):
                    data["Status"] = "Error: Blocked by website"
                    rate_limiter.report_error(url)
                    return data
            except Exception as e:
                print(f"[NAVIGATE ERROR] {url} -> {e}")
                rate_limiter.report_error(url)
                data["Status"] = f"Error: Navigation failed - {str(e)}"
                return data

            # Every failure path above has returned, so the page loaded
            rate_limiter.report_success(url)

            # Perform random human-like scrolling
            browser_manager.randomize_scrolling()

            # Extract Address
            try:
                address_element = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "sub.header"))
                )
                data["Address"] = address_element.text.strip()
            except Exception as e:
                print(f"[ADDRESS ERROR] {e}")
                data["Address"] = "Not Found"

            # Extract Description with keyword-based approach
            data["Description"] = extract_description(driver)

            # Extract Website
            try:
                website_element = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "ui.tiny.basic.icon.left.labeled.button"))
                )
                data["Website"] = website_element.get_attribute("href")
            except Exception as e:
                print(f"[WEBSITE ERROR] {e}")
                data["Website"] = "Not Found"

            print(f"[MAIN PAGE] Completed main page scraping for {url}")
        else:
            print(f"[SKIP MAIN] Skipping main page for {url} - description already found")

        # Only scrape specs page if we need to
        if need_specs:
            # Step 2: Extract Specs from a different page
            specs_url = url.rstrip('/') + '/specs/'

            # Additional rate limit wait for specs page
            consecutive_errors = rate_limiter.wait(specs_url)
            if consecutive_errors >= 8:
                print(f"[ABORT] Too many consecutive errors for this domain. Skipping specs for {url}")
                data["Specs"] = "Error: Too many consecutive errors"
                status = data["Specs"]
            else:
                specs_success = False
                try:
                    driver.get(specs_url)
                    time.sleep(random.uniform(1, 3))  # Variable wait time

                    # Check for blocks on specs page too
                    if browser_manager.check_for_blocks(specs_url):
                        data["Specs"] = "Error: Blocked by website"
                        status = data["Specs"]
                        rate_limiter.report_error(specs_url)
                    else:
                        specs_success = True
                except Exception as e:
                    print(f"[NAVIGATE ERROR] {specs_url} -> {e}")
                    rate_limiter.report_error(specs_url)
                    data["Specs"] = f"Error: Navigation failed - {str(e)}"
                    status = data["Specs"]

                if specs_success:
                    rate_limiter.report_success(specs_url)

                    # Random scrolling on specs page too
                    browser_manager.randomize_scrolling()

                    try:
                        capacity_table = WebDriverWait(driver, 5).until(
                            EC.presence_of_element_located((By.XPATH, "//table[@class='ui striped very basic table']"))
                        )
                        rows = capacity_table.find_elements(By.TAG_NAME, "tr")
                        # One string per table row: the non-empty cell texts joined by spaces
                        specs_data = [" ".join([c.text.strip() for c in row.find_elements(By.TAG_NAME, "td") if c.text.strip()]) for row in rows]
                        data["Specs"] = "; ".join(specs_data) if specs_data else "Not Found"
                    except Exception as e:
                        print(f"[SPECS ERROR] {e}")
                        data["Specs"] = "Not Found"

            print(f"[SPECS PAGE] Completed specs page scraping for {url}")
        else:
            print(f"[SKIP SPECS] Skipping specs page for {url} - specs already found")

        # "Success" unless the specs stage recorded an error above
        data["Status"] = status

    except Exception as e:
        data["Status"] = f"Error: {str(e)}"
        print("[SCRAPE ERROR]", e)

    return data

# ============== UPDATE MAIN OUTPUT CSV =====================
def update_main_csv(data, output_csv=OUTPUT_CSV):
    """Append a single entry to the main CSV file.

    When the file already has content, the row is aligned to the columns of the
    existing header before it is appended: fields missing from ``data`` are
    written as empty values and keys unknown to the header are dropped. Rows were
    previously appended in the dict's own key order, so a partial record (e.g. a
    blocked page with only URL and Status) ended up under the wrong columns.

    Parameters:
        data: Dict describing one data center; must contain "URL".
        output_csv: Path of the CSV file (defaults to OUTPUT_CSV).

    Returns:
        True on success, False if anything went wrong (the error is printed).
    """
    try:
        # Ensure directory exists (a bare file name has no directory part)
        directory = os.path.dirname(output_csv)
        if directory:
            os.makedirs(directory, exist_ok=True)

        # A zero-byte file has no header yet and is treated like a missing file
        file_exists = os.path.isfile(output_csv) and os.path.getsize(output_csv) > 0

        # Create DataFrame from single entry and remove newlines which can break CSV format
        df = pd.DataFrame([data]).replace("\n", " ", regex=True)

        if file_exists:
            # Read only the header row and align the new row to it
            existing_columns = pd.read_csv(output_csv, nrows=0).columns
            df = df.reindex(columns=existing_columns)
            df.to_csv(output_csv, mode='a', header=False, index=False, quoting=csv.QUOTE_ALL)
        else:
            df.to_csv(output_csv, mode='w', header=True, index=False, quoting=csv.QUOTE_ALL)

        print(f"[CSV] Added entry for {data['URL']} to main output file")
        return True
    except Exception as e:
        print(f"[ERROR] Failed to update CSV: {e}")
        return False

# ============== REBUILD FINAL CSV FROM CACHE =====================
def rebuild_csv_from_cache(cache, valid_urls, output_csv=OUTPUT_CSV):
    """Overwrite the output CSV with the cached entries of ``valid_urls``.

    Parameters:
        cache: Mapping of URL -> scraped data dict.
        valid_urls: Iterable of URLs to export; each URL present in ``cache`` is
            written once, in the iteration order of ``valid_urls``.
        output_csv: Path of the CSV file (defaults to OUTPUT_CSV).

    Returns:
        True when the file was written, False on any error (which is printed).
    """
    try:
        # Fixed column order of the exported file
        column_order = ["URL", "Address", "Description", "Website", "Specs", "Status", "Country", "City", "DatacenterName"]

        # dict.fromkeys keeps the first occurrence of every URL, preserving order
        unique_urls = dict.fromkeys(valid_urls)
        rows = [cache[candidate] for candidate in unique_urls if candidate in cache]

        # Build the table and replace newlines which can break CSV format
        table = pd.DataFrame(rows, columns=column_order).replace("\n", " ", regex=True)

        # Write to CSV directly without appending
        table.to_csv(output_csv, index=False, quoting=csv.QUOTE_ALL, mode='w')
        print(f"[CSV] Rebuilt main output file with {len(table)} entries")
        return True
    except Exception as e:
        print(f"[ERROR] Failed to rebuild CSV: {e}")
        return False

# ============== MAIN EXECUTION ==============================
def main():
    """Run the scraping pipeline for every URL of the input CSV.

    Steps: load the input CSV and the cache (restricted to the input URLs), decide
    per URL which pages still need scraping, scrape in chunks with pauses, write
    each result to the cache and the output CSV, and finally rebuild the CSV from
    the cache and print statistics.

    Changes compared with the previous version:
      * statistics are only computed for a non-empty cache (an empty one made
        analyze_cache_quality divide by zero, also inside ``finally``);
      * cached entries without a description, or whose specs hold an "Error..."
        value, are marked for rescraping (they were skipped forever);
      * the final CSV is rebuilt in input-file order instead of set order;
      * unsaved cache changes are written in ``finally`` (e.g. on interruption);
      * rows without a usable URL are skipped and no pause follows the last chunk.
    """
    # Make sure cache and output directories exist
    os.makedirs("../cache", exist_ok=True)
    os.makedirs("../output", exist_ok=True)

    # Initialize rate limiter
    rate_limiter = RateLimiter()

    # Load input CSV first to get valid URLs
    input_csv = "../output/02european_datacenters.csv"
    try:
        df_input = pd.read_csv(input_csv)
        url_column = df_input["Datacenter URL"]
        valid_urls = set(url_column)  # fast membership tests
        ordered_urls = url_column.drop_duplicates().tolist()  # deterministic export order
        print(f"Loaded {len(valid_urls)} valid URLs from input CSV.")
    except Exception as e:
        print(f"Error loading input CSV: {e}")
        return

    # Load cache and filter to only keep entries that match valid URLs
    full_cache = load_cache_from_disk()
    cache_stats = None  # statistics of the initial cache, if there is one
    if full_cache:
        # Filter cache to only include URLs from the input CSV
        cache = {url: data for url, data in full_cache.items() if url in valid_urls}
        print(f"Loaded {len(full_cache)} entries from cache, filtered to {len(cache)} matching input CSV.")
        if cache:
            cache_stats = analyze_cache_quality(cache)
    else:
        cache = {}
        print("No existing cache found. Starting fresh.")

    # Use the already loaded input CSV
    all_records = df_input.to_dict(orient='records')
    total = len(all_records)

    # Options for reprocessing
    rescrape_not_found = True  # Set to True to rescrape entries with "Not Found" descriptions
    start_from_beginning = False  # Set to False to continue where left off
    preserve_good_cache = True  # Set to True to preserve entries that already have good data
    chunk_size = 500  # For better stealth
    resume_index = 0  # Change this to resume from a specific point

    # Mark which URLs need to be rescraped (for missing descriptions and/or specs)
    urls_to_rescrape_description = set()
    urls_to_rescrape_specs = set()

    if rescrape_not_found:
        for url, data in cache.items():
            # Missing descriptions: sentinel, empty, or never scraped (key absent)
            if data.get("Description") in (None, "", "Not Found"):
                urls_to_rescrape_description.add(url)

            # Missing specs: sentinel values or an error recorded in the field
            specs = data.get("Specs") or ""
            if specs in ["Not Found", "", "Not Attempted - Description Not Found"] or str(specs).startswith("Error"):
                urls_to_rescrape_specs.add(url)

        print(f"[INFO] Marked {len(urls_to_rescrape_description)} URLs with missing descriptions for rescraping")
        print(f"[INFO] Marked {len(urls_to_rescrape_specs)} URLs with missing specs for rescraping")

    # Initialize browser manager with reduced requests per browser
    browser_manager = BrowserManager(max_requests_per_browser=3)
    cache_dirty = False  # True while the in-memory cache has changes not yet on disk

    try:
        # Ensure CSV file is created with headers if it doesn't exist
        if not os.path.exists(OUTPUT_CSV):
            # Create an empty DataFrame with the correct columns
            columns = ["URL", "Address", "Description", "Website", "Specs", "Status", "Country", "City", "DatacenterName"]
            pd.DataFrame(columns=columns).to_csv(OUTPUT_CSV, index=False, quoting=csv.QUOTE_ALL)
            print(f"Created new output CSV file: {OUTPUT_CSV}")

        # Start from specified index
        i = 0 if start_from_beginning else resume_index

        # Process in chunks with moderate pauses
        while i < total:
            chunk_end = min(i + chunk_size, total)
            print(f"\n[CHUNK] Starting chunk from {i} to {chunk_end}...")

            chunk_records = all_records[i : chunk_end]
            processed_in_chunk = 0
            block_detected = False

            # Randomize the order of URLs within the chunk for added stealth
            if random.random() < 0.7:  # 70% chance to randomize
                random.shuffle(chunk_records)
                print("[STEALTH] Randomized URL order within chunk")

            for rec in chunk_records:
                url = rec["Datacenter URL"]

                # Rows without a URL (e.g. NaN from an empty cell) cannot be scraped
                if not isinstance(url, str) or not url.strip():
                    print(f"[SKIP] Invalid URL value in input row: {url!r}")
                    continue

                # Skip processing if a block was detected in this chunk
                if block_detected:
                    print(f"[SKIP] {url} - Skipping due to earlier block detection")
                    continue

                # Check if URL needs to be processed
                need_description = False
                need_specs = False

                if url in cache:
                    # Check if we need to get description, specs, or both
                    if url in urls_to_rescrape_description:
                        need_description = True
                        print(f"[RESCRAPE DESC] {url} - Previous description was not found")

                    if url in urls_to_rescrape_specs:
                        need_specs = True
                        print(f"[RESCRAPE SPECS] {url} - Previous specs were not found")

                    # If both found and preserve_good_cache is true, skip entirely
                    if preserve_good_cache and not need_description and not need_specs:
                        print(f"[CACHE HIT] {url} - Using existing complete data")
                        continue

                    # If we're not preserving cache, rescrape everything
                    if not preserve_good_cache:
                        need_description = True
                        need_specs = True
                        print(f"[RESCRAPE] {url} - Rescraping all entries")
                else:
                    # New URL - need to scrape everything
                    need_description = True
                    need_specs = True
                    print(f"[NEW URL] {url} - Not in cache")

                # Get existing data if available
                existing_data = cache.get(url, {"URL": url})

                # Scrape the data center with selective scraping
                data = scrape_datacenter(
                    url,
                    browser_manager,
                    rate_limiter,
                    need_description=need_description,
                    need_specs=need_specs,
                    existing_data=existing_data
                )

                processed_in_chunk += 1
                status = data.get("Status", "")

                # Check if we were blocked
                if "Error: Blocked by website" in status:
                    block_detected = True
                    print("[BLOCK] Website blocking detected. Pausing this chunk.")

                # Add extra columns
                data.update({
                    "Country": rec.get("Country", ""),
                    "City": rec.get("City", ""),
                    "DatacenterName": rec.get("Datacenter Name", "")
                })

                # Update the cache
                cache[url] = data
                cache_dirty = True

                # Save to main output CSV directly
                update_main_csv(data)

                # Save cache frequently
                if processed_in_chunk % 2 == 0:  # Save after every 2 URLs
                    save_cache_to_disk(cache)
                    cache_dirty = False

                # If we hit a rate limit, adjust waiting time moderately
                if "Error: Too many consecutive errors" in status:
                    print("[RATE LIMIT] Detected rate limiting, pausing for longer...")
                    time.sleep(random.randint(300, 600))  # 5-10 minute pause

            i = chunk_end
            print(f"[PROGRESS] Completed {min(i, total)} of {total} URLs.")

            # Save cache after each chunk (before pausing, so a pause can be interrupted safely)
            save_cache_to_disk(cache)
            cache_dirty = False

            # Add a moderate pause between chunks; nothing follows the last one
            if processed_in_chunk > 0 and i < total:
                if block_detected:
                    # Longer pause if a block was detected
                    pause_time = random.randint(1800, 3600)  # 30-60 minute pause
                    print(f"[BLOCK RECOVERY] Pausing for {pause_time/60:.1f} minutes to recover from block...")
                else:
                    # Variable pause between chunks
                    if random.random() < 0.2:  # 20% chance for a long break
                        pause_time = random.randint(600, 900)  # 10-15 minute long pause
                        print(f"[LONG BREAK] Taking an extended break of {pause_time/60:.1f} minutes")
                    else:
                        pause_time = random.randint(120, 300)  # 2-5 minute normal pause
                        print(f"[CHUNK] Processed {processed_in_chunk} items. Pausing for {pause_time} seconds before next chunk...")

                time.sleep(pause_time)

    finally:
        # Always close the browser manager
        browser_manager.close()

        # Persist results that were scraped but not yet saved (e.g. after an interruption)
        if cache_dirty:
            try:
                save_cache_to_disk(cache)
            except Exception as e:
                print(f"[ERROR] Failed to save cache: {e}")

        # Rebuild CSV from cache to ensure consistency
        print("[DONE] Rebuilding final CSV from cache to ensure all entries are included...")
        rebuild_csv_from_cache(cache, ordered_urls)

        # Final analysis
        print("\n[FINAL STATS]")
        if cache:
            final_stats = analyze_cache_quality(cache)
            print(f"Successfully scraped {final_stats['total']} data centers.")

            # Summary of improvements if we started with an existing cache
            if cache_stats is not None:
                initial_missing = cache_stats['missing_descriptions']
                final_missing = final_stats['missing_descriptions']
                if initial_missing > final_missing:
                    print(f"Improved descriptions: {initial_missing - final_missing} entries now have descriptions that were previously missing.")
        else:
            print("Cache is empty - no statistics to report.")

        print("\nSample of entries:")
        pd.set_option('display.max_colwidth', 50)  # Limit column width for display

        # Read from the final CSV to show sample
        try:
            final_df = pd.read_csv(OUTPUT_CSV)
            print(final_df[['URL', 'Description']].head(5))
        except Exception as e:
            print(f"Error reading final CSV: {e}")

def analyze_cache_quality(cache):
    """Report how complete the scrape cache is and return the raw counts.

    Prints a "[CACHE ANALYSIS]" summary to stdout (absolute counts plus
    percentages of the total) and returns the counts for further use.

    An entry is considered:
      * missing a description when its "Description" value is "Not Found",
      * missing specs when its "Specs" value is "Not Found",
      * in error when its "Status" value contains the substring "Error",
      * complete when none of the above apply.

    Args:
        cache: Mapping whose values are per-entry record dicts. Only the
            values are inspected; an empty mapping is allowed.

    Returns:
        dict with the integer keys "total", "missing_descriptions",
        "missing_specs" and "errors".
    """
    total = len(cache)
    missing_descriptions = 0
    missing_specs = 0
    errors = 0
    complete = 0

    # Single pass so that "complete" is counted per entry. Subtracting the
    # three counters from the total would subtract an entry with several
    # problems more than once and could even produce a negative number.
    for data in cache.values():
        no_description = data.get("Description", "") == "Not Found"
        no_specs = data.get("Specs", "") == "Not Found"
        has_error = "Error" in data.get("Status", "")

        missing_descriptions += int(no_description)
        missing_specs += int(no_specs)
        errors += int(has_error)
        if not (no_description or no_specs or has_error):
            complete += 1

    def pct(count):
        # An empty cache must not raise ZeroDivisionError; report 0.0% instead.
        return count / total * 100 if total else 0.0

    print("\n[CACHE ANALYSIS]")
    print(f"Total entries: {total}")
    print(f"Missing descriptions: {missing_descriptions} ({pct(missing_descriptions):.1f}%)")
    print(f"Missing specs: {missing_specs} ({pct(missing_specs):.1f}%)")
    print(f"Entries with errors: {errors} ({pct(errors):.1f}%)")
    print(f"Complete entries: {complete} ({pct(complete):.1f}%)")

    return {
        "total": total,
        "missing_descriptions": missing_descriptions,
        "missing_specs": missing_specs,
        "errors": errors
    }

# Entry point: run the scraping pipeline only when this code is executed
# directly (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Loaded 1795 valid URLs from input CSV.
Loaded 1805 entries from cache, filtered to 1795 matching input CSV.

[CACHE ANALYSIS]
Total entries: 1795
Missing descriptions: 0 (0.0%)
Missing specs: 2 (0.1%)
Entries with errors: 0 (0.0%)
Complete entries: 1793 (99.9%)
[INFO] Marked 0 URLs with missing descriptions for rescraping
[INFO] Marked 2 URLs with missing specs for rescraping
Using User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36

[CHUNK] Starting chunk from 0 to 500...
[STEALTH] Randomized URL order within chunk
[CACHE HIT] https://www.datacentermap.com/belgium/brussels/colt-brussels/ - Using existing complete data
[CACHE HIT] https://www.datacentermap.com/greece/athens/data4-athens/ - Using existing complete data
[CACHE HIT] https://www.datacentermap.com/italy/milan/colt-milan1/ - Using existing complete data
[CACHE HIT] https://www.datacentermap.com/denmark/skanderborg/datacenter-skanderborg/ - Using exis