In [3]:
#!/usr/bin/env python3
"""
StarTech COMPLETE Product Scraper - All Categories & Subcategories
Advanced anti-blocking techniques with ThreadPool optimization
Compatible with Google Colab and Crawl4AI 0.7.4+
Author: Professional Web Scraper
Date: 2025
"""

# Google Colab Setup with enhanced packages
import sys
import subprocess
import importlib
import warnings
warnings.filterwarnings('ignore')

def install_and_import(package):
    """Make sure the distribution named by ``package`` is importable.

    ``package`` is a pip requirement string such as ``"crawl4ai>=0.7.4"`` or
    ``"nest-asyncio"``. Any version specifier is stripped and the distribution
    name is translated to its import name before probing, because the two
    often differ (``beautifulsoup4`` is imported as ``bs4``; hyphens in a
    distribution name become underscores in the module name). Probing with the
    raw distribution name would fail for those packages and re-run pip on
    every execution.

    Args:
        package: pip requirement string; passed to pip unchanged when an
            install is needed.

    Raises:
        subprocess.CalledProcessError: if the pip invocation exits non-zero.
    """
    import re  # local import keeps this block self-contained

    # Distributions whose import name cannot be derived from the pip name.
    import_name_overrides = {
        "beautifulsoup4": "bs4",
    }

    # Cut the requirement at the first specifier / extras / marker character.
    dist_name = re.split(r"[<>=!~\[; ]", package.strip(), maxsplit=1)[0]
    module_name = import_name_overrides.get(
        dist_name.lower(), dist_name.replace("-", "_")
    )

    try:
        importlib.import_module(module_name)
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package, "--quiet"])

# Pip requirement strings needed by the scraper. Several distribution names
# differ from the module that is imported further down (e.g. "beautifulsoup4"
# is imported as bs4, "nest-asyncio" as nest_asyncio).
required_packages = [
    "crawl4ai>=0.7.4",
    "beautifulsoup4",
    "pandas",
    "nest-asyncio",
    "aiohttp",
    "lxml",
    "fake-useragent",
    "requests",
    "asyncio-throttle",
    "tqdm"
]

# Probe each requirement and install whatever is missing, so the third-party
# imports that follow can succeed on a fresh runtime.
print("🔧 Setting up COMPLETE scraping environment...")
for package in required_packages:
    install_and_import(package)

import asyncio
import json
import pandas as pd
import time
import random
from typing import Dict, List, Optional, Tuple, Any, Set
from dataclasses import dataclass, asdict, field
from urllib.parse import urljoin, urlparse
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from queue import Queue
import os
from datetime import datetime
import hashlib

# Enhanced imports
from bs4 import BeautifulSoup
import nest_asyncio
from fake_useragent import UserAgent
from asyncio_throttle import Throttler
from tqdm.auto import tqdm
import requests.adapters
from requests.packages.urllib3.util.retry import Retry

# crawl4ai is imported inside try/except so that an ImportError triggers a
# pip upgrade followed by a second import attempt.
try:
    from crawl4ai import AsyncWebCrawler
    from crawl4ai.extraction_strategy import LLMExtractionStrategy, LLMConfig
    from crawl4ai.chunking_strategy import RegexChunking
    print("✅ Enhanced Crawl4AI imports successful")
except ImportError as e:
    # NOTE(review): the caught exception `e` is not used, and this retry path
    # does not re-import RegexChunking, so that name stays undefined whenever
    # the fallback runs.
    print(f"Installing/upgrading crawl4ai...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "crawl4ai", "--upgrade", "--quiet"])
    from crawl4ai import AsyncWebCrawler
    from crawl4ai.extraction_strategy import LLMExtractionStrategy, LLMConfig

# Patch asyncio via nest_asyncio for Jupyter/Colab compatibility
nest_asyncio.apply()

# Configure root logging (INFO level, timestamped format) and create the
# module-level logger used by every class below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

@dataclass
class ScrapingStats:
    """Running counters for one scraping session, plus two timing helpers."""
    # Wall-clock timestamp taken when the instance is created.
    start_time: float = field(default_factory=time.time)
    categories_processed: int = 0
    subcategories_processed: int = 0
    products_scraped: int = 0
    pages_scraped: int = 0
    errors_encountered: int = 0
    blocked_requests: int = 0

    def get_elapsed_time(self) -> float:
        """Return the number of seconds elapsed since ``start_time``."""
        now = time.time()
        return now - self.start_time

    def get_rate(self) -> float:
        """Return products scraped per elapsed second, or 0 if no time has passed."""
        seconds = self.get_elapsed_time()
        if seconds > 0:
            return self.products_scraped / seconds
        return 0

@dataclass
class SubCategory:
    """One navigation-menu entry below a main category.

    Entries nest through ``children``, so a list of ``SubCategory`` objects
    describes a tree.
    """
    name: str  # link text of the menu entry
    url: str  # link target of the menu entry
    level: int  # depth in the menu tree (the extractor in this file starts at 2)
    parent: str  # name of the category/subcategory directly above this entry
    children: List['SubCategory'] = field(default_factory=list)  # entries nested one level deeper
    product_count: int = 0
    scraped: bool = False
    # Declared as 1=high, 2=medium, 3=low. NOTE(review): the extractor in this
    # file stores the entry's 1-based position within its menu here instead.
    priority: int = 1

@dataclass
class Category:
    """A top-level navigation category together with its subcategory tree."""
    name: str  # link text of the top-level menu entry
    url: str  # link target of the entry
    level: int  # depth in the menu tree; the code in this file always passes 1
    subcategories: List[SubCategory] = field(default_factory=list)  # direct children in the menu
    product_count: int = 0
    scraped: bool = False
    priority: int = 1  # ordering hint; the scraper sorts its work list ascending on this value

@dataclass
class Product:
    """Flat record describing one scraped product.

    All descriptive fields are typed as strings; presumably they hold the raw
    text extracted from the listing markup - confirm against the code that
    constructs ``Product``.
    """
    name: str
    price: str
    original_price: str
    discount: str
    model: str
    brand: str
    availability: str
    rating: str
    review_count: str
    image_url: str
    product_url: str
    specifications: Dict[str, Any]  # free-form key/value details
    category: str
    subcategory: str
    description: str
    # ISO-8601 timestamp taken when the record is instantiated.
    scraped_at: str = field(default_factory=lambda: datetime.now().isoformat())
    page_number: int = 1  # presumably the listing page the product was found on - TODO confirm

class AdvancedAntiBlockSystem:
    """Request-pacing helper: rotating User-Agent headers, adaptive delays, pauses.

    The class only keeps in-memory counters and computes values; it does not
    sleep or send requests itself. Callers ask it for headers, for a delay to
    wait before a request, and whether a longer pause is due.
    """

    # True between a should_pause() call that tripped the block threshold and
    # the next get_pause_duration() call, so that pause uses the long range
    # even though should_pause() has already reset ``blocked_count``.
    _block_pause_pending = False

    def __init__(self):
        # Not used by any method of this class; kept so the attribute remains
        # available to existing callers.
        self.ua = UserAgent()
        # Fixed pool that get_headers() rotates through.
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15'
        ]
        self.current_ua_index = 0
        self.request_count = 0  # incremented by every get_headers() call
        self.last_request_time = 0
        self.blocked_count = 0  # incremented by get_delay(is_error=True)

        # Dynamic delay system (seconds)
        self.base_delay = 1.0
        self.max_delay = 10.0
        self.delay_increment = 0.5
        self.current_delay = self.base_delay

        # Static browser-like headers shared by every request; the User-Agent
        # is added per call in get_headers().
        self.session_headers = self._generate_session_headers()

    def _generate_session_headers(self) -> Dict[str, str]:
        """Return the fixed set of browser-like request headers (no User-Agent)."""
        return {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9,bn;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Cache-Control': 'max-age=0',
            'DNT': '1',
        }

    def get_headers(self) -> Dict[str, str]:
        """Return a copy of the session headers with the current User-Agent added.

        Every call counts as one request; after each 10th call the next
        User-Agent in ``user_agents`` becomes current (wrapping around).
        """
        headers = self.session_headers.copy()
        headers['User-Agent'] = self.user_agents[self.current_ua_index]

        # Rotate user agent every 10 requests
        self.request_count += 1
        if self.request_count % 10 == 0:
            self.current_ua_index = (self.current_ua_index + 1) % len(self.user_agents)

        return headers

    def get_delay(self, is_error: bool = False) -> float:
        """Return the number of seconds to wait, adapting to the last outcome.

        Args:
            is_error: True when the previous request failed or looked blocked.
                This raises the current delay by ``delay_increment`` (capped at
                ``max_delay``) and increments ``blocked_count``. Otherwise the
                current delay decays by 0.1 s down to ``base_delay``.

        Returns:
            The current delay multiplied by a random jitter factor in [0.5, 1.5].
        """
        if is_error:
            self.blocked_count += 1
            self.current_delay = min(self.current_delay + self.delay_increment, self.max_delay)
            logger.warning(f"Error detected, increasing delay to {self.current_delay}s")
        else:
            # Gradually reduce delay on success
            self.current_delay = max(self.current_delay - 0.1, self.base_delay)

        # Add random jitter
        jitter = random.uniform(0.5, 1.5)
        return self.current_delay * jitter

    def should_pause(self) -> bool:
        """Report whether the caller should take a longer pause now.

        A pause is due after every 50th request, or once more than 5 errors
        have accumulated. In the latter case the error counter is reset and the
        next ``get_pause_duration()`` call is flagged to use the long range.
        """
        # Pause after every 50th request. The ``> 0`` guard matters: 0 % 50 is
        # also 0, which would otherwise force a pause before the first request.
        if self.request_count > 0 and self.request_count % 50 == 0:
            logger.info(f"Pausing after {self.request_count} requests...")
            return True

        # Pause if too many blocks
        if self.blocked_count > 5:
            logger.warning(f"Too many blocks ({self.blocked_count}), taking longer pause...")
            self.blocked_count = 0  # Reset counter
            self._block_pause_pending = True  # remembered for get_pause_duration()
            return True

        return False

    def get_pause_duration(self) -> float:
        """Return a random pause length in seconds.

        30-60 s when errors are outstanding (``blocked_count > 0``) or when
        ``should_pause()`` has just tripped the block threshold; 10-20 s
        otherwise. The pending flag is consumed by this call.
        """
        long_pause = self._block_pause_pending or self.blocked_count > 0
        self._block_pause_pending = False
        if long_pause:
            return random.uniform(30, 60)  # 30-60 second pause for blocks
        return random.uniform(10, 20)  # Normal pause

class StarTechCompleteExtractor:
    """Enhanced category extractor for complete scraping"""

    def __init__(self, base_url: str = "https://www.startech.com.bd"):
        """Set up an extractor rooted at ``base_url`` with empty category caches."""
        # Site root; menu links are resolved against it with urljoin.
        self.base_url = base_url

        # Pacing helper and session counters.
        self.anti_block = AdvancedAntiBlockSystem()
        self.stats = ScrapingStats()

        # Filled in by _build_complete_hierarchy().
        self.all_categories, self.flat_category_list = [], []

    async def extract_all_categories_complete(self) -> Dict[str, Any]:
        """Crawl the site's landing page and return its category hierarchy.

        Returns:
            The dictionary produced by ``_build_complete_hierarchy``. An empty
            dict is returned when the crawl reports failure or when anything
            inside the crawler session raises.
        """
        logger.info("🔍 Starting COMPLETE category extraction...")

        browser_options = dict(
            headless=True,
            browser_type='chromium',
            verbose=False,
            user_agent=self.anti_block.get_headers()['User-Agent'],
        )

        async with AsyncWebCrawler(**browser_options) as crawler:
            try:
                # Wait out the adaptive delay before the first request.
                initial_delay = self.anti_block.get_delay()
                await asyncio.sleep(initial_delay)

                request_headers = self.anti_block.get_headers()
                result = await crawler.arun(
                    url=self.base_url,
                    word_count_threshold=10,
                    bypass_cache=True,
                    wait_for="css:nav, .navigation, .navbar",
                    delay_before_return_html=3.0,
                    headers=request_headers
                )

                if not result.success:
                    logger.error(f"❌ Failed to crawl main page: {result.error_message}")
                    return {}

                landing_page = BeautifulSoup(result.html, 'html.parser')
                categories = await self._parse_complete_navigation(landing_page)

                logger.info(f"✅ Extracted {len(categories)} main categories")
                return self._build_complete_hierarchy(categories)

            except Exception as exc:
                logger.error(f"❌ Error during category extraction: {exc}")
                return {}

    async def _parse_complete_navigation(self, soup: BeautifulSoup) -> List[Category]:
        """Parse ALL navigation categories with enhanced detection"""
        categories = []

        # Enhanced selectors for StarTech
        nav_selectors = [
            'li.nav-item.has-child',
            '.main-menu .nav-item.has-child',
            '.navbar .nav-item.has-child',
            'nav li.has-child',
            '.navigation li.has-child',
            'ul.menu > li.has-child'
        ]

        main_nav_items = None
        for selector in nav_selectors:
            main_nav_items = soup.select(selector)
            if main_nav_items:
                logger.info(f"✅ Found {len(main_nav_items)} categories using: {selector}")
                break

        if not main_nav_items:
            logger.warning("⚠️ No navigation found, trying alternative extraction...")
            # Try alternative methods
            all_links = soup.find_all('a', href=True)
            category_urls = set()

            for link in all_links:
                href = link.get('href', '')
                text = link.get_text(strip=True)

                # Look for category patterns in URLs
                if any(pattern in href.lower() for pattern in ['/category/', '/desktops/', '/laptops/', '/components/']):
                    if text and len(text) > 2:
                        category_urls.add((text, urljoin(self.base_url, href)))

            # Convert to Category objects
            for name, url in list(category_urls)[:50]:  # Limit to prevent overload
                category = Category(name=name, url=url, level=1, subcategories=[])
                categories.append(category)

            return categories

        # Process each main category
        for i, nav_item in enumerate(main_nav_items):
            try:
                category = await self._extract_complete_category(nav_item, i+1)
                if category:
                    categories.append(category)

                # Add small delay between categories
                await asyncio.sleep(0.5)

            except Exception as e:
                logger.error(f"❌ Error processing category {i+1}: {e}")
                continue

        return categories

    async def _extract_complete_category(self, nav_item, priority: int = 1) -> Optional[Category]:
        """Extract complete category with all subcategory levels"""
        try:
            main_link = nav_item.find('a', class_='nav-link') or nav_item.find('a')
            if not main_link:
                return None

            category_name = main_link.get_text(strip=True)
            category_url = urljoin(self.base_url, main_link.get('href', ''))

            if not category_name or len(category_name.strip()) < 2:
                return None

            category = Category(
                name=category_name,
                url=category_url,
                level=1,
                subcategories=[],
                priority=priority
            )

            # Extract ALL subcategory levels
            dropdown_menu = nav_item.find('ul', class_='drop-down')
            if dropdown_menu:
                subcategories = await self._extract_all_subcategory_levels(
                    dropdown_menu, category_name, level=2
                )
                category.subcategories = subcategories

                # Count total subcategories
                total_subs = self._count_all_subcategories(subcategories)
                logger.info(f"📁 {category_name}: {total_subs} total subcategories")

            return category

        except Exception as e:
            logger.error(f"❌ Error extracting category: {e}")
            return None

    async def _extract_all_subcategory_levels(self, menu_element, parent_name: str, level: int) -> List[SubCategory]:
        """Recursively extract ALL subcategory levels"""
        subcategories = []

        nav_items = menu_element.find_all('li', class_='nav-item', recursive=False)
        if not nav_items:
            nav_items = menu_element.find_all('li', recursive=False)

        for i, nav_item in enumerate(nav_items):
            link = nav_item.find('a', class_='nav-link') or nav_item.find('a')
            if not link:
                continue

            name = link.get_text(strip=True)
            url = urljoin(self.base_url, link.get('href', ''))

            # Skip unwanted links
            if any(skip in name.lower() for skip in ['show all', 'see all', 'view all']) or len(name.strip()) < 2:
                continue

            subcategory = SubCategory(
                name=name,
                url=url,
                level=level,
                parent=parent_name,
                children=[],
                priority=i+1
            )

            # Extract deeper levels (up to level 5 to prevent infinite recursion)
            nested_menu = nav_item.find('ul', class_='drop-down')
            if nested_menu and level < 5:
                children = await self._extract_all_subcategory_levels(
                    nested_menu, name, level + 1
                )
                subcategory.children = children

            subcategories.append(subcategory)

        return subcategories

    def _count_all_subcategories(self, subcategories: List[SubCategory]) -> int:
        """Count all subcategories recursively"""
        total = len(subcategories)
        for sub in subcategories:
            total += self._count_all_subcategories(sub.children)
        return total

    def _build_complete_hierarchy(self, categories: List[Category]) -> Dict[str, Any]:
        """Build complete hierarchy with flattened list for processing"""
        hierarchy = {
            'extraction_timestamp': datetime.now().isoformat(),
            'total_main_categories': len(categories),
            'categories': [],
            'flat_processing_list': [],
            'summary': {}
        }

        for category in categories:
            cat_dict = {
                'name': category.name,
                'url': category.url,
                'level': category.level,
                'priority': category.priority,
                'subcategories': self._subcategories_to_dict(category.subcategories),
                'total_subcategories': self._count_all_subcategories(category.subcategories)
            }

            hierarchy['categories'].append(cat_dict)

            # Add to flat processing list
            hierarchy['flat_processing_list'].append({
                'name': category.name,
                'url': category.url,
                'level': 1,
                'parent': None,
                'priority': category.priority,
                'type': 'main_category'
            })

            # Add all subcategories to flat list
            self._add_all_to_flat_list(
                category.subcategories, hierarchy['flat_processing_list'], category.name
            )

        # Generate comprehensive summary
        hierarchy['summary'] = self._generate_complete_summary(hierarchy)

        # Cache for later use
        self.all_categories = categories
        self.flat_category_list = hierarchy['flat_processing_list']

        return hierarchy

    def _subcategories_to_dict(self, subcategories: List[SubCategory]) -> List[Dict]:
        """Convert subcategories to dictionary format"""
        result = []
        for sub in subcategories:
            sub_dict = {
                'name': sub.name,
                'url': sub.url,
                'level': sub.level,
                'parent': sub.parent,
                'priority': sub.priority,
                'children': self._subcategories_to_dict(sub.children) if sub.children else []
            }
            result.append(sub_dict)
        return result

    def _add_all_to_flat_list(self, subcategories: List[SubCategory], flat_list: List, parent: str):
        """Add all subcategories to flat processing list"""
        for sub in subcategories:
            flat_list.append({
                'name': sub.name,
                'url': sub.url,
                'level': sub.level,
                'parent': parent,
                'priority': sub.priority,
                'type': 'subcategory'
            })

            if sub.children:
                self._add_all_to_flat_list(sub.children, flat_list, sub.name)

    def _generate_complete_summary(self, hierarchy: Dict) -> Dict:
        """Generate comprehensive summary statistics"""
        flat_list = hierarchy['flat_processing_list']

        # Level distribution
        level_counts = {}
        type_counts = {}
        priority_distribution = {}

        for item in flat_list:
            level = item['level']
            item_type = item.get('type', 'unknown')
            priority = item.get('priority', 1)

            level_counts[f'level_{level}'] = level_counts.get(f'level_{level}', 0) + 1
            type_counts[item_type] = type_counts.get(item_type, 0) + 1
            priority_distribution[f'priority_{priority}'] = priority_distribution.get(f'priority_{priority}', 0) + 1

        return {
            'total_items_to_scrape': len(flat_list),
            'level_distribution': level_counts,
            'type_distribution': type_counts,
            'priority_distribution': priority_distribution,
            'deepest_level': max([item['level'] for item in flat_list]) if flat_list else 0,
            'estimated_scraping_time_hours': len(flat_list) * 2 / 3600  # Rough estimate
        }

class StarTechCompleteProductScraper:
    """Complete product scraper with advanced threading and anti-blocking"""

    def __init__(self, base_url: str = "https://www.startech.com.bd", max_pages_per_category: int = 10):
        """Set up a scraper for ``base_url``.

        Args:
            base_url: site root.
            max_pages_per_category: upper bound on listing pages requested per
                category (stored as ``self.max_pages``).
        """
        self.base_url = base_url
        self.max_pages = max_pages_per_category

        # Pacing helper and session counters.
        self.anti_block = AdvancedAntiBlockSystem()
        self.stats = ScrapingStats()

        # Result containers.
        self.products, self.failed_urls = [], []

        # Concurrency settings, kept conservative to avoid blocking.
        self.max_workers = 3
        self.request_semaphore = asyncio.Semaphore(2)  # at most two categories in flight

        # Progress tracking.
        self.progress_bar = None
        self.completed_categories = set()

    async def scrape_all_categories_complete(self, category_list: List[Dict]) -> List[Product]:
        """Scrape products from ALL categories with advanced management"""
        logger.info(f"🚀 Starting COMPLETE product scraping from {len(category_list)} categories...")

        # Sort by priority for better results early
        sorted_categories = sorted(category_list, key=lambda x: (x.get('priority', 999), x.get('level', 1)))

        # Initialize progress bar
        self.progress_bar = tqdm(total=len(sorted_categories), desc="Categories", unit="cat")

        # Use semaphore to limit concurrent requests
        tasks = []
        for category_data in sorted_categories:
            task = self._scrape_category_with_limits(category_data)
            tasks.append(task)

        # Process in batches to avoid overwhelming the server
        batch_size = 5
        all_results = []

        for i in range(0, len(tasks), batch_size):
            batch = tasks[i:i+batch_size]

            logger.info(f"🔄 Processing batch {i//batch_size + 1}/{(len(tasks)-1)//batch_size + 1}")

            # Process batch
            batch_results = await asyncio.gather(*batch, return_exceptions=True)

            # Collect results
            for result in batch_results:
                if isinstance(result, Exception):
                    logger.error(f"❌ Batch error: {result}")
                    self.stats.errors_encountered += 1
                elif isinstance(result, list):
                    all_results.extend(result)

            # Pause between batches for anti-blocking
            if i + batch_size < len(tasks):
                pause_time = self.anti_block.get_pause_duration()
                logger.info(f"⏸️ Pausing {pause_time:.1f}s between batches...")
                await asyncio.sleep(pause_time)

        self.progress_bar.close()

        # Filter and deduplicate results
        self.products = self._deduplicate_products(all_results)

        logger.info(f"✅ Complete scraping finished: {len(self.products)} products")
        return self.products

    async def _scrape_category_with_limits(self, category_data: Dict) -> List[Product]:
        """Scrape single category with rate limiting and error handling"""
        async with self.request_semaphore:
            try:
                category_name = category_data.get('name', 'Unknown')
                category_url = category_data.get('url', '')
                parent = category_data.get('parent', '')

                if not category_url:
                    return []

                # Skip if already processed
                cache_key = hashlib.md5(category_url.encode()).hexdigest()[:8]
                if cache_key in self.completed_categories:
                    self.progress_bar.update(1)
                    return []

                logger.info(f"🛍️ Scraping: {category_name}")

                products = await self._scrape_category_products_advanced(
                    category_url, category_name, parent
                )

                # Update statistics
                self.stats.categories_processed += 1
                self.stats.products_scraped += len(products)

                # Mark as completed
                self.completed_categories.add(cache_key)
                self.progress_bar.update(1)

                # Update progress bar description
                rate = self.stats.get_rate()
                self.progress_bar.set_description(f"Categories ({rate:.1f} products/min)")

                return products

            except Exception as e:
                logger.error(f"❌ Error scraping {category_data.get('name', 'Unknown')}: {e}")
                self.stats.errors_encountered += 1
                self.progress_bar.update(1)
                return []

    async def _scrape_category_products_advanced(self, category_url: str,
                                               category_name: str, subcategory_name: str = "") -> List[Product]:
        """Advanced product scraping with anti-blocking"""
        products = []

        crawler_config = {
            'headless': True,
            'browser_type': 'chromium',
            'verbose': False
        }

        async with AsyncWebCrawler(**crawler_config) as crawler:
            for page in range(1, self.max_pages + 1):
                try:
                    # Check if we should pause
                    if self.anti_block.should_pause():
                        pause_time = self.anti_block.get_pause_duration()
                        logger.info(f"⏸️ Strategic pause: {pause_time:.1f}s")
                        await asyncio.sleep(pause_time)

                    # Get delay and headers
                    delay = self.anti_block.get_delay()
                    headers = self.anti_block.get_headers()

                    # Construct page URL
                    page_url = self._construct_page_url_advanced(category_url, page)

                    # Add delay before request
                    await asyncio.sleep(delay)

                    # Make request
                    result = await crawler.arun(
                        url=page_url,
                        word_count_threshold=10,
                        bypass_cache=True,
                        wait_for="css:.p-item, .product-item, .product-card",
                        delay_before_return_html=2.0,
                        headers=headers,
                        timeout=30
                    )

                    if result.success:
                        # Check for blocking indicators
                        if self._is_blocked_response(result.html):
                            logger.warning(f"🚫 Possible blocking detected for {category_name}")
                            self.stats.blocked_requests += 1
                            await asyncio.sleep(self.anti_block.get_delay(is_error=True))
                            continue

                        # Parse products
                        soup = BeautifulSoup(result.html, 'html.parser')
                        page_products = self._extract_products_advanced(
                            soup, category_name, subcategory_name, page
                        )

                        if not page_products:
                            logger.info(f"📄 No products found on page {page} for {category_name}")
                            break

                        products.extend(page_products)
                        self.stats.pages_scraped += 1

                        # Check for next page
                        if not self._has_next_page_advanced(soup):
                            break

                    else:
                        logger.error(f"❌ Failed to load page {page} for {category_name}: {result.error_message}")
                        self.stats.errors_encountered += 1

                        # Increase delay on error
                        await asyncio.sleep(self.anti_block.get_delay(is_error=True))

                        # Break after too many failures
                        if self.stats.errors_encountered % 10 == 0:
                            break

                except Exception as e:
                    logger.error(f"❌ Error on page {page} for {category_name}: {e}")
                    self.stats.errors_encountered += 1
                    await asyncio.sleep(5)  # Error recovery delay
                    continue

        return products

    def _construct_page_url_advanced(self, base_url: str, page: int) -> str:
        """Advanced URL construction with multiple patterns"""
        if page == 1:
            return base_url

        # Multiple pagination patterns to try
        patterns = [
            f"{base_url}?page={page}",
            f"{base_url}&page={page}",
            f"{base_url}/page/{page}",
            f"{base_url}?p={page}",
            f"{base_url}&p={page}",
            f"{base_url}?offset={((page-1) * 20)}"  # Offset-based pagination
        ]

        return patterns[0]  # Start with most common

    def _is_blocked_response(self, html: str) -> bool:
        """Detect if response indicates blocking"""
        blocking_indicators = [
            'blocked', 'captcha', 'rate limit', 'too many requests',
            'access denied', 'forbidden', 'cloudflare', 'ddos protection'
        ]

        html_lower = html.lower()
        return any(indicator in html_lower for indicator in blocking_indicators)

    def _extract_products_advanced(self, soup: BeautifulSoup, category: str,
                                 subcategory: str, page: int) -> List[Product]:
        """Advanced product extraction with multiple fallback strategies"""
        products = []

        # Enhanced StarTech selectors with priority order
        product_selectors = [
            '.p-item',                    # Primary StarTech selector
            '.product-item',              # Secondary selector
            '.main-product',              # Alternative layout
            '.product-layout',            # Grid layout
            '.product-card',              # Card layout
            '.item-product',              # Item-based layout
            '[data-product-id]',          # Data attribute
            '.product-box',               # Box layout
            '.product-container'          # Container layout
        ]

        product_elements = []
        for selector in product_selectors:
            elements = soup.select(selector)
            if elements:
                product_elements = elements
                logger.debug(f"🎯 Found {len(elements)} products using: {selector}")
                break

        if not product_elements:
            # Fallback: look for any elements with product-like attributes
            fallback_elements = soup.select('div[class*="product"], li[class*="product"], article[class*="product"]')
            if fallback_elements:
                product_elements = fallback_elements[:20]  # Limit to prevent noise
                logger.debug(f"📦 Using fallback extraction: {len(product_elements)} elements")

        # Extract products with enhanced error handling
        for i, element in enumerate(product_elements):
            try:
                product = self._extract_single_product_advanced(element, category, subcategory, page)
                if product:
                    products.append(product)
            except Exception as e:
                logger.debug(f"❌ Error extracting product {i+1}: {e}")
                continue

        return products

    def _extract_single_product_advanced(self, element, category: str,
                                       subcategory: str, page: int) -> Optional[Product]:
        """Turn one listing element into a ``Product``.

        Every text field is looked up through an ordered list of CSS selectors
        (site-specific ones first, generic fallbacks last). A product is only
        produced when a name longer than two characters was found. Any exception
        raised during extraction is logged at debug level and yields ``None``.

        Args:
            element: Parsed node for a single product card.
            category: Category name stored on the product.
            subcategory: Subcategory name stored on the product.
            page: Listing page number, stored as ``page_number``.

        Returns:
            The populated ``Product``, or ``None`` when no usable name was found
            or extraction failed.
        """
        try:
            pick = self._get_text_by_selectors_advanced

            def tidy(raw: str, fallback: str = "") -> str:
                # Normalise a text field, or use the fallback when nothing was found.
                return self._clean_text(raw) if raw else fallback

            def tidy_price(raw: str, fallback: str = "") -> str:
                # Same as tidy(), but with price-specific cleaning.
                return self._clean_price(raw) if raw else fallback

            # Name: the bare title attribute is the last resort.
            name = pick(element, [
                '.p-item-name a', '.p-item-name',
                '.product-title a', '.product-title',
                '.product-name a', '.product-name',
                'h4 a', 'h3 a', 'h2 a', 'h1 a',
                '.title a', '.title',
                '.name a', '.name',
                'a[title]'  # Fallback to title attribute
            ])

            # Current price; the generic class match excludes old/original prices.
            price = pick(element, [
                '.p-item-price', '.current-price', '.price-current',
                '.price', '.product-price', '.cost', '.amount',
                '[class*="price"]:not([class*="old"]):not([class*="original"])',
                '.tk', '.taka'  # Bangladesh currency indicators
            ])

            # Pre-discount price, present on discounted items.
            original_price = pick(element, [
                '.p-item-price .old-price', '.old-price', '.original-price',
                '.was-price', '.price-old', '.strike-price',
                '[class*="old"], [class*="original"], [class*="was"]'
            ])

            # Discount / offer badge text.
            discount = pick(element, [
                '.discount-badge', '.offer-badge', '.sale-badge',
                '.discount', '.offer', '.sale', '.save',
                '[class*="discount"], [class*="offer"], [class*="sale"]'
            ])

            # Model number or SKU.
            model = pick(element, [
                '.p-item-model', '.model', '.sku', '.product-code',
                '.part-number', '.item-code', '[data-sku]',
                '[class*="model"], [class*="sku"], [class*="code"]'
            ])

            # Brand / manufacturer.
            brand = pick(element, [
                '.p-item-brand', '.brand', '.manufacturer', '.brand-name',
                '[data-brand]', '[class*="brand"], [class*="manufacturer"]'
            ])

            # Stock status.
            availability = pick(element, [
                '.p-item-stock', '.stock-status', '.availability',
                '.in-stock', '.out-of-stock', '.stock',
                '[class*="stock"], [class*="available"]'
            ])

            # Rating value.
            rating = pick(element, [
                '.rating', '.stars', '.review-rating', '.product-rating',
                '[class*="rating"], [class*="star"], [data-rating]'
            ])

            # Number of reviews.
            review_count = pick(element, [
                '.review-count', '.reviews-count', '.rating-count',
                '[class*="review"] .count', '[class*="rating"] .count'
            ])

            # Image and detail-page links have dedicated extractors.
            image_url = self._extract_image_url_advanced(element)
            product_url = self._extract_product_url_advanced(element)

            # Short description shown on the listing card.
            description = pick(element, [
                '.p-item-desc', '.description', '.product-desc',
                '.short-desc', '.summary', '[class*="desc"]'
            ])

            # Key/value specifications, if the card carries any.
            specifications = self._extract_specifications_advanced(element)

            # Without a meaningful name the element is not treated as a product.
            if not name or len(name.strip()) <= 2:
                return None

            return Product(
                name=self._clean_text(name),
                price=tidy_price(price, "N/A"),
                original_price=tidy_price(original_price),
                discount=tidy(discount),
                model=tidy(model),
                brand=tidy(brand),
                availability=tidy(availability, "N/A"),
                rating=tidy(rating),
                review_count=tidy(review_count),
                image_url=image_url,
                product_url=product_url,
                specifications=specifications,
                category=category,
                subcategory=subcategory,
                description=tidy(description),
                page_number=page
            )

        except Exception as e:
            logger.debug(f"❌ Error in advanced product extraction: {e}")
            return None

    def _get_text_by_selectors_advanced(self, element, selectors: List[str]) -> str:
        """Advanced text extraction with multiple fallback strategies"""
        for selector in selectors:
            try:
                # Try standard selection
                found = element.select_one(selector)
                if found:
                    text = found.get_text(strip=True)
                    if text and len(text) > 0:
                        return text

                # Try attribute extraction for special cases
                if '[' in selector and ']' in selector:
                    attr_name = selector.split('[')[1].split(']')[0]
                    if '=' in attr_name:
                        attr_name = attr_name.split('=')[0]

                    elements_with_attr = element.find_all(attrs={attr_name: True})
                    for elem in elements_with_attr:
                        attr_value = elem.get(attr_name)
                        if attr_value:
                            return str(attr_value)
                        text = elem.get_text(strip=True)
                        if text:
                            return text

            except Exception as e:
                logger.debug(f"Selector '{selector}' failed: {e}")
                continue

        return ""

    def _extract_image_url_advanced(self, element) -> str:
        """Advanced image URL extraction with multiple sources"""
        # Try multiple image attributes
        img_attributes = ['src', 'data-src', 'data-lazy-src', 'data-original', 'data-zoom-image']

        # Find img tags
        img_tags = element.find_all('img')
        for img in img_tags:
            for attr in img_attributes:
                src = img.get(attr)
                if src and not src.startswith('data:'):  # Skip base64 images
                    return urljoin(self.base_url, src)

        # Try background images in CSS
        elements_with_bg = element.find_all(style=True)
        for elem in elements_with_bg:
            style = elem.get('style', '')
            if 'background-image' in style:
                # Extract URL from background-image CSS
                import re
                match = re.search(r'url\(["\']?([^"\']+)["\']?\)', style)
                if match:
                    return urljoin(self.base_url, match.group(1))

        return ""

    def _extract_product_url_advanced(self, element) -> str:
        """Advanced product URL extraction"""
        # Try different link strategies
        link_selectors = [
            'a[href*="/product"]',
            'a[href*="/item"]',
            'a[href*="/p/"]',
            '.p-item-name a',
            '.product-title a',
            'h1 a', 'h2 a', 'h3 a', 'h4 a',
            'a'  # Fallback to any link
        ]

        for selector in link_selectors:
            link = element.select_one(selector)
            if link and link.get('href'):
                href = link.get('href')
                if href and not href.startswith('#') and not href.startswith('javascript:'):
                    return urljoin(self.base_url, href)

        return ""

    def _extract_specifications_advanced(self, element) -> Dict[str, str]:
        """Advanced specifications extraction"""
        specs = {}

        # Look for specification containers
        spec_containers = element.select('.specs, .specifications, .features, .details, .p-item-specs')

        for container in spec_containers:
            # Try different spec formats

            # Format 1: Key-value pairs with colons
            spec_items = container.find_all(['div', 'span', 'li', 'p'])
            for item in spec_items:
                text = item.get_text(strip=True)
                if ':' in text and len(text.split(':')) == 2:
                    key, value = text.split(':', 1)
                    specs[key.strip()] = value.strip()

            # Format 2: Definition lists
            dt_elements = container.find_all('dt')
            dd_elements = container.find_all('dd')
            for dt, dd in zip(dt_elements, dd_elements):
                key = dt.get_text(strip=True)
                value = dd.get_text(strip=True)
                if key and value:
                    specs[key] = value

            # Format 3: Table rows
            rows = container.find_all('tr')
            for row in rows:
                cells = row.find_all(['td', 'th'])
                if len(cells) >= 2:
                    key = cells[0].get_text(strip=True)
                    value = cells[1].get_text(strip=True)
                    if key and value:
                        specs[key] = value

        return specs

    def _clean_text(self, text: str) -> str:
        """Clean and normalize text"""
        if not text:
            return ""

        # Remove extra whitespace
        text = ' '.join(text.split())

        # Remove common unwanted characters
        unwanted_chars = ['\n', '\t', '\r', '\xa0']
        for char in unwanted_chars:
            text = text.replace(char, ' ')

        # Remove extra spaces again
        text = ' '.join(text.split())

        return text.strip()

    def _clean_price(self, price: str) -> str:
        """Clean and normalize price text"""
        if not price:
            return ""

        # Remove common price prefixes/suffixes
        price = self._clean_text(price)

        # Keep numbers, commas, periods, and currency symbols
        import re
        price = re.sub(r'[^\d,.\৳$€£¥₹]', ' ', price)
        price = ' '.join(price.split())

        return price.strip()

    def _has_next_page_advanced(self, soup: BeautifulSoup) -> bool:
        """Advanced next page detection"""
        next_selectors = [
            '.pagination .next:not(.disabled)',
            '.pagination a[aria-label="Next"]:not(.disabled)',
            '.next-page:not(.disabled)',
            '.pager .next:not(.disabled)',
            'a[rel="next"]',
            '.pagination-next'
        ]

        for selector in next_selectors:
            if soup.select(selector):
                return True

        # Check for page numbers
        page_links = soup.select('.pagination a, .pager a')
        current_page = 1
        max_page = 1

        for link in page_links:
            text = link.get_text(strip=True)
            if text.isdigit():
                page_num = int(text)
                max_page = max(max_page, page_num)
                if 'current' in link.get('class', []) or 'active' in link.get('class', []):
                    current_page = page_num

        return current_page < max_page

    def _deduplicate_products(self, products: List[Product]) -> List[Product]:
        """Remove duplicate products based on name and URL"""
        seen = set()
        deduplicated = []

        for product in products:
            # Create unique identifier
            identifier = f"{product.name}|{product.product_url}"
            if identifier not in seen:
                seen.add(identifier)
                deduplicated.append(product)

        removed_count = len(products) - len(deduplicated)
        if removed_count > 0:
            logger.info(f"🔄 Removed {removed_count} duplicate products")

        return deduplicated

class StarTechCompleteMasterScraper:
    """Master scraper that orchestrates complete StarTech scraping.

    A run goes through three phases: category extraction, product scraping for
    every entry of the flattened category list, and saving the results together
    with a summary report. All output files share the
    ``startech_complete_<session_id>`` prefix and are written relative to the
    current working directory.
    """

    def __init__(self, base_url: str = "https://www.startech.com.bd", max_pages_per_category: int = 10):
        """Create the collaborating extractor/scraper objects and the session id.

        Args:
            base_url: Root URL of the shop, handed to both collaborators.
            max_pages_per_category: Page limit handed to the product scraper.
        """
        self.base_url = base_url
        self.category_extractor = StarTechCompleteExtractor(base_url)
        self.product_scraper = StarTechCompleteProductScraper(base_url, max_pages_per_category)

        self.categories_data = {}
        self.products_data = []
        # Report timing is measured from self.stats.start_time
        # (presumably set when ScrapingStats is constructed - confirm).
        self.stats = ScrapingStats()

        # File management
        self.session_id = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.base_filename = f"startech_complete_{self.session_id}"

    async def run_complete_mega_scraping(self) -> Dict[str, Any]:
        """Run the complete mega scraping operation.

        Returns:
            The report built by ``_generate_mega_report`` on success, or ``{}``
            when no categories were found, the run was interrupted, or an error
            occurred. In the interrupted/error cases a save of whatever data
            exists is attempted first.

        Raises:
            asyncio.CancelledError: Re-raised after the partial save when the
                surrounding task is cancelled.
        """
        print("🚀 STARTECH COMPLETE MEGA SCRAPER INITIATED")
        print("="*70)
        print(f"🕐 Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"🆔 Session ID: {self.session_id}")
        print(f"📄 Max Pages per Category: {self.product_scraper.max_pages}")
        print("="*70)

        try:
            # Phase 1: Extract ALL categories
            print(f"\n📁 PHASE 1: COMPLETE CATEGORY EXTRACTION")
            print("-"*50)

            self.categories_data = await self.category_extractor.extract_all_categories_complete()

            if not self.categories_data.get('categories'):
                print("❌ CRITICAL: No categories found! Aborting.")
                return {}

            # Display category summary
            self._display_category_summary()

            # Phase 2: Complete product scraping
            print(f"\n🛍️ PHASE 2: COMPLETE PRODUCT SCRAPING")
            print("-"*50)

            category_list = self.categories_data.get('flat_processing_list', [])

            if not category_list:
                print("❌ No categories to process!")
                return {}

            print(f"📊 Total categories/subcategories to scrape: {len(category_list)}")

            # Estimate time
            estimated_hours = len(category_list) * 0.5 / 60  # 30 seconds per category average
            print(f"⏰ Estimated completion time: {estimated_hours:.1f} hours")

            # Short grace period so the run can still be stopped before it starts
            print(f"\n🔄 Starting complete scraping in 5 seconds...")
            await asyncio.sleep(5)

            # Run complete scraping
            self.products_data = await self.product_scraper.scrape_all_categories_complete(category_list)

            # Phase 3: Save and report
            print(f"\n💾 PHASE 3: SAVING RESULTS & GENERATING REPORT")
            print("-"*50)

            # Persist everything collected by phases 1 and 2
            await self._save_results_complete()

            # Generate final report
            final_report = self._generate_mega_report()

            # Display completion summary
            self._display_completion_summary(final_report)

            return final_report

        except KeyboardInterrupt:
            print(f"\n⚠️ SCRAPING INTERRUPTED BY USER")
            print("Saving partial results...")
            await self._save_results_complete()
            return {}

        except asyncio.CancelledError:
            # Task cancellation (for example an interrupted notebook cell) is a
            # BaseException on Python 3.8+, so the generic handler below never
            # sees it. Save what exists, then re-raise so the cancellation still
            # propagates to the caller.
            print(f"\n⚠️ SCRAPING CANCELLED")
            print("Saving partial results...")
            await self._save_results_complete()
            raise

        except Exception as e:
            print(f"\n❌ CRITICAL ERROR: {e}")
            logger.error(f"Complete scraping failed: {e}")
            # Still try to save partial results; a failure here is reported
            # instead of being swallowed silently.
            try:
                await self._save_results_complete()
            except Exception as save_error:
                logger.error(f"Could not save partial results: {save_error}")
            return {}

    def _display_category_summary(self):
        """Print the category totals, the per-level counts and up to ten sample categories."""
        summary = self.categories_data.get('summary', {})
        categories = self.categories_data.get('categories', [])

        print(f"✅ CATEGORY EXTRACTION COMPLETE!")
        print(f"📊 Total Main Categories: {len(categories)}")
        print(f"📈 Total Items to Process: {summary.get('total_items_to_scrape', 0)}")
        print(f"🏗️ Maximum Depth Level: {summary.get('deepest_level', 0)}")
        print(f"⏱️ Estimated Time: {summary.get('estimated_scraping_time_hours', 0):.1f} hours")

        # Level distribution
        print(f"\n📊 LEVEL DISTRIBUTION:")
        level_dist = summary.get('level_distribution', {})
        for level, count in sorted(level_dist.items()):
            # The level number is taken from the part after the first underscore of the key.
            level_num = level.split('_')[1]
            print(f"   Level {level_num}: {count:3d} items")

        # Show sample categories
        print(f"\n📂 SAMPLE MAIN CATEGORIES:")
        for i, cat in enumerate(categories[:10], 1):
            sub_count = cat.get('total_subcategories', 0)
            print(f"   {i:2d}. {cat['name'][:50]:50s} ({sub_count:2d} subs)")

        if len(categories) > 10:
            remaining = len(categories) - 10
            print(f"   ... and {remaining} more categories")

    def _generate_mega_report(self) -> Dict[str, Any]:
        """Generate comprehensive mega report.

        Returns:
            A dict with the sections ``scraping_session``, ``categories_summary``,
            ``scraping_statistics``, ``product_analysis`` and ``data_quality``,
            computed from ``self.products_data``, ``self.categories_data`` and the
            product scraper's counters.
        """
        import re

        end_time = time.time()
        total_time = end_time - self.stats.start_time

        # Calculate advanced statistics
        products_by_category = {}
        products_by_brand = {}
        price_ranges = {'0-10k': 0, '10k-50k': 0, '50k-100k': 0, '100k+': 0}

        for product in self.products_data:
            # Category distribution
            cat_key = f"{product.category}"
            if product.subcategory:
                cat_key += f"/{product.subcategory}"
            products_by_category[cat_key] = products_by_category.get(cat_key, 0) + 1

            # Brand distribution
            if product.brand:
                products_by_brand[product.brand] = products_by_brand.get(product.brand, 0) + 1

            # Price analysis (basic): bucket by the first number in the price text
            if product.price and product.price != "N/A":
                price_numbers = re.findall(r'\d+', product.price.replace(',', ''))
                if price_numbers:
                    price_val = int(price_numbers[0])
                    if price_val < 10000:
                        price_ranges['0-10k'] += 1
                    elif price_val < 50000:
                        price_ranges['10k-50k'] += 1
                    elif price_val < 100000:
                        price_ranges['50k-100k'] += 1
                    else:
                        price_ranges['100k+'] += 1

        return {
            'scraping_session': {
                'session_id': self.session_id,
                'start_time': datetime.fromtimestamp(self.stats.start_time).isoformat(),
                'end_time': datetime.now().isoformat(),
                'total_duration_seconds': total_time,
                'total_duration_hours': total_time / 3600
            },
            'categories_summary': {
                'total_main_categories': self.categories_data.get('total_main_categories', 0),
                'total_items_processed': len(self.categories_data.get('flat_processing_list', [])),
                'categories_hierarchy': self.categories_data
            },
            'scraping_statistics': {
                'total_products_scraped': len(self.products_data),
                'categories_processed': self.product_scraper.stats.categories_processed,
                'pages_scraped': self.product_scraper.stats.pages_scraped,
                'errors_encountered': self.product_scraper.stats.errors_encountered,
                'blocked_requests': self.product_scraper.stats.blocked_requests,
                'scraping_rate_products_per_hour': len(self.products_data) / (total_time / 3600) if total_time > 0 else 0
            },
            'product_analysis': {
                'products_by_category': dict(sorted(products_by_category.items(), key=lambda x: x[1], reverse=True)[:50]),
                'top_brands': dict(sorted(products_by_brand.items(), key=lambda x: x[1], reverse=True)[:20]),
                'price_distribution': price_ranges,
                'products_with_images': len([p for p in self.products_data if p.image_url]),
                'products_with_specs': len([p for p in self.products_data if p.specifications]),
                'products_with_ratings': len([p for p in self.products_data if p.rating])
            },
            'data_quality': {
                'completeness_score': self._calculate_completeness_score(),
                'duplicate_removal_count': 0,  # NOTE(review): never updated in this class
                'average_products_per_category': len(self.products_data) / max(1, self.product_scraper.stats.categories_processed)
            }
        }

    def _calculate_completeness_score(self) -> float:
        """Calculate data completeness score (0-100%).

        Eight fields are checked per product: name, price, brand, image URL,
        product URL, availability, specifications and description. The ``"N/A"``
        placeholder used for a missing price or availability counts as not filled.

        Returns:
            Percentage of filled fields over all products; ``0.0`` without products.
        """
        if not self.products_data:
            return 0.0

        total_fields = len(self.products_data) * 8  # 8 key fields to check
        filled_fields = 0

        for product in self.products_data:
            if product.name and len(product.name.strip()) > 2:
                filled_fields += 1
            if product.price and product.price != "N/A":
                filled_fields += 1
            if product.brand:
                filled_fields += 1
            if product.image_url:
                filled_fields += 1
            if product.product_url:
                filled_fields += 1
            # "N/A" is a placeholder, not data - same rule as for price above.
            if product.availability and product.availability != "N/A":
                filled_fields += 1
            if product.specifications:
                filled_fields += 1
            if product.description:
                filled_fields += 1

        return (filled_fields / total_fields) * 100 if total_fields > 0 else 0

    async def _save_results_complete(self):
        """Save complete results with multiple formats.

        Writes the category hierarchy (JSON), the products (CSV, JSON and a
        reduced summary CSV, only when products exist) and the report (JSON).

        Returns:
            ``True`` when every file was written, ``False`` if any step raised;
            the error is printed rather than propagated.
        """
        try:
            print(f"💾 Saving results with base filename: {self.base_filename}")

            # 1. Save categories hierarchy
            categories_file = f"{self.base_filename}_categories.json"
            with open(categories_file, 'w', encoding='utf-8') as f:
                json.dump(self.categories_data, f, indent=2, ensure_ascii=False)
            print(f"✅ Categories saved: {categories_file}")

            # 2. Save products (if any)
            if self.products_data:
                # CSV format (for Excel compatibility)
                products_csv = f"{self.base_filename}_products.csv"
                df = pd.DataFrame([asdict(product) for product in self.products_data])
                df.to_csv(products_csv, index=False, encoding='utf-8-sig')  # BOM for Excel
                print(f"✅ Products CSV saved: {products_csv} ({len(self.products_data)} products)")

                # JSON format (for developers)
                products_json = f"{self.base_filename}_products.json"
                with open(products_json, 'w', encoding='utf-8') as f:
                    json.dump([asdict(product) for product in self.products_data],
                             f, indent=2, ensure_ascii=False)
                print(f"✅ Products JSON saved: {products_json}")

                # Summary CSV (key metrics only)
                summary_data = []
                for product in self.products_data:
                    summary_data.append({
                        'Name': product.name,
                        'Price': product.price,
                        'Brand': product.brand,
                        'Category': product.category,
                        'Subcategory': product.subcategory,
                        'Availability': product.availability,
                        'URL': product.product_url
                    })

                summary_csv = f"{self.base_filename}_summary.csv"
                pd.DataFrame(summary_data).to_csv(summary_csv, index=False, encoding='utf-8-sig')
                print(f"✅ Summary CSV saved: {summary_csv}")

            # 3. Save complete report
            report_file = f"{self.base_filename}_report.json"
            report = self._generate_mega_report()
            with open(report_file, 'w', encoding='utf-8') as f:
                json.dump(report, f, indent=2, ensure_ascii=False)
            print(f"✅ Complete report saved: {report_file}")

            return True

        except Exception as e:
            print(f"❌ Error saving results: {e}")
            return False

    def _display_completion_summary(self, report: Dict):
        """Display final completion summary.

        Args:
            report: The dict produced by ``_generate_mega_report``.
        """
        print(f"\n🎉 STARTECH COMPLETE SCRAPING FINISHED!")
        print("="*70)

        # Time summary
        duration = report['scraping_session']['total_duration_hours']
        print(f"⏰ Total Duration: {duration:.2f} hours")
        print(f"🕐 Start: {report['scraping_session']['start_time']}")
        print(f"🏁 End: {report['scraping_session']['end_time']}")

        # Scraping statistics
        stats = report['scraping_statistics']
        print(f"\n📊 SCRAPING STATISTICS:")
        print(f"   🛍️ Total Products: {stats['total_products_scraped']:,}")
        print(f"   📁 Categories Processed: {stats['categories_processed']}")
        print(f"   📄 Pages Scraped: {stats['pages_scraped']}")
        print(f"   ⚡ Rate: {stats['scraping_rate_products_per_hour']:.1f} products/hour")

        if stats['errors_encountered'] > 0:
            print(f"   ⚠️ Errors: {stats['errors_encountered']}")
        if stats['blocked_requests'] > 0:
            print(f"   🚫 Blocked: {stats['blocked_requests']}")

        # Data quality
        quality = report['data_quality']
        print(f"\n🎯 DATA QUALITY:")
        print(f"   📈 Completeness: {quality['completeness_score']:.1f}%")
        print(f"   📊 Avg Products/Category: {quality['average_products_per_category']:.1f}")

        # Top categories
        analysis = report['product_analysis']
        print(f"\n🏆 TOP 10 CATEGORIES BY PRODUCT COUNT:")
        top_cats = list(analysis['products_by_category'].items())[:10]
        for i, (category, count) in enumerate(top_cats, 1):
            print(f"   {i:2d}. {category[:50]:50s} {count:4d} products")

        # Top brands
        print(f"\n🏷️ TOP 10 BRANDS:")
        top_brands = list(analysis['top_brands'].items())[:10]
        for i, (brand, count) in enumerate(top_brands, 1):
            print(f"   {i:2d}. {brand[:30]:30s} {count:4d} products")

        # Files generated (mirrors what _save_results_complete writes)
        print(f"\n📁 FILES GENERATED:")
        print(f"   📊 {self.base_filename}_products.csv - All products (Excel compatible)")
        print(f"   📋 {self.base_filename}_products.json - All products (JSON)")
        print(f"   📄 {self.base_filename}_summary.csv - Key metrics only")
        print(f"   📂 {self.base_filename}_categories.json - Complete hierarchy")
        print(f"   📑 {self.base_filename}_report.json - Scraping report")

🔧 Setting up COMPLETE scraping environment...
Installing beautifulsoup4...
Installing nest-asyncio...
Installing fake-useragent...
Installing asyncio-throttle...
✅ Enhanced Crawl4AI imports successful


In [5]:
# Install Playwright browsers.
# This downloads the browser binaries (e.g. Chromium), so it needs network access.
# NOTE(review): presumably required by the crawler used in the scraping cell - confirm.
!playwright install

Downloading Chromium 140.0.7339.16 (playwright build v1187)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-linux.zip[22m
[1G173.7 MiB [] 0% 11.5s[0K[1G173.7 MiB [] 0% 7.1s[0K[1G173.7 MiB [] 0% 4.6s[0K[1G173.7 MiB [] 1% 3.2s[0K[1G173.7 MiB [] 2% 2.9s[0K[1G173.7 MiB [] 2% 2.6s[0K[1G173.7 MiB [] 3% 2.5s[0K[1G173.7 MiB [] 4% 2.5s[0K[1G173.7 MiB [] 4% 2.7s[0K[1G173.7 MiB [] 5% 2.5s[0K[1G173.7 MiB [] 6% 2.5s[0K[1G173.7 MiB [] 7% 2.4s[0K[1G173.7 MiB [] 8% 2.3s[0K[1G173.7 MiB [] 9% 2.3s[0K[1G173.7 MiB [] 10% 2.3s[0K[1G173.7 MiB [] 11% 2.2s[0K[1G173.7 MiB [] 12% 2.1s[0K[1G173.7 MiB [] 13% 2.1s[0K[1G173.7 MiB [] 14% 2.0s[0K[1G173.7 MiB [] 15% 1.9s[0K[1G173.7 MiB [] 16% 1.9s[0K[1G173.7 MiB [] 17% 1.8s[0K[1G173.7 MiB [] 18% 1.8s[0K[1G173.7 MiB [] 19% 1.7s[0K[1G173.7 MiB [] 20% 1.8s[0K[1G173.7 MiB [] 21% 1.7s[0K[1G173.7 MiB [] 22% 1.7s[0K[1G173.7 MiB [] 24% 1.6s[0K[1G173.7 MiB [] 25% 1.5s[0K[1

In [6]:
# Instantiate the master scraper.
# base_url is left at its default; only the per-category page limit is overridden.
master_scraper = StarTechCompleteMasterScraper(max_pages_per_category=3) # Limit pages for a sample run

# Run the complete scraping process (categories -> products -> save/report).
# Top-level `await` relies on the notebook running cells inside an event loop.
# `report` is the report dict on success and `{}` when the run aborts early.
report = await master_scraper.run_complete_mega_scraping()

# Display the report (optional, done by the class method)
# print("\n--- Final Report ---")
# display(report)

🚀 STARTECH COMPLETE MEGA SCRAPER INITIATED
🕐 Start Time: 2025-09-03 19:08:47
🆔 Session ID: 20250903_190847
📄 Max Pages per Category: 3

📁 PHASE 1: COMPLETE CATEGORY EXTRACTION
--------------------------------------------------


✅ CATEGORY EXTRACTION COMPLETE!
📊 Total Main Categories: 144
📈 Total Items to Process: 1712
🏗️ Maximum Depth Level: 3
⏱️ Estimated Time: 1.0 hours

📊 LEVEL DISTRIBUTION:
   Level 1: 144 items
   Level 2: 988 items
   Level 3: 580 items

📂 SAMPLE MAIN CATEGORIES:
    1. Desktop                                            (29 subs)
    2. Star PC                                            ( 2 subs)
    3. Gaming PC                                          ( 2 subs)
    4. Brand PC                                           ( 6 subs)
    5. All-in-One PC                                      ( 7 subs)
    6. Portable Mini PC                                   ( 2 subs)
    7. Laptop                                             (54 subs)
    8. All Laptop                                         (15 subs)
    9. Gaming Laptop                                      ( 7 subs)
   10. Premium Ultrabook                                  ( 7 subs)
   ... and 134 more categories

🛍️ PHASE 2: COMPLETE PROD

Categories:   0%|          | 0/1712 [00:00<?, ?cat/s]

CancelledError: 