In [None]:
# 1. Install required packages:
!pip install crawl4ai beautifulsoup4 pandas nest-asyncio aiohttp




In [None]:
# 2. Initialize Crawl4AI (one-time setup):
!python -m crawl4ai.async_crawler_strategy




In [None]:
!playwright install

Downloading Chromium 140.0.7339.16 (playwright build v1187)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-linux.zip[22m
[1G173.7 MiB [] 0% 0.0s[0K[1G173.7 MiB [] 0% 52.9s[0K[1G173.7 MiB [] 0% 25.9s[0K[1G173.7 MiB [] 0% 16.2s[0K[1G173.7 MiB [] 0% 8.1s[0K[1G173.7 MiB [] 1% 4.6s[0K[1G173.7 MiB [] 2% 3.4s[0K[1G173.7 MiB [] 4% 2.7s[0K[1G173.7 MiB [] 5% 2.4s[0K[1G173.7 MiB [] 5% 2.5s[0K[1G173.7 MiB [] 6% 2.3s[0K[1G173.7 MiB [] 7% 2.2s[0K[1G173.7 MiB [] 9% 2.0s[0K[1G173.7 MiB [] 10% 1.9s[0K[1G173.7 MiB [] 12% 1.8s[0K[1G173.7 MiB [] 13% 1.8s[0K[1G173.7 MiB [] 14% 1.7s[0K[1G173.7 MiB [] 15% 1.6s[0K[1G173.7 MiB [] 17% 1.5s[0K[1G173.7 MiB [] 18% 1.4s[0K[1G173.7 MiB [] 20% 1.3s[0K[1G173.7 MiB [] 21% 1.3s[0K[1G173.7 MiB [] 23% 1.2s[0K[1G173.7 MiB [] 24% 1.2s[0K[1G173.7 MiB [] 26% 1.1s[0K[1G173.7 MiB [] 26% 1.2s[0K[1G173.7 MiB [] 27% 1.1s[0K[1G173.7 MiB [] 29% 1.1s[0K[1G173.7 MiB [] 31% 1.0s[0K

In [None]:
#!/usr/bin/env python3
"""
StarTech Complete Category & Product Scraper - Fixed for Google Colab
Compatible with Crawl4AI 0.7.4+ and optimized for Jupyter/Colab environment
Author: Professional Web Scraper
Date: 2025
"""

# Google Colab Setup
import sys
import subprocess
import importlib

def install_and_import(package):
    """Make sure the distribution named by a pip requirement is importable.

    Only the bare distribution name of ``package`` is used to decide which
    module to probe; version specifiers (``>=``, ``==``, ``~=`` ...), extras
    (``pkg[extra]``) and environment markers (``; python_version ...``) are
    stripped first.  The module name is then derived from the distribution
    name, because the two frequently differ: ``beautifulsoup4`` installs
    ``bs4`` and ``nest-asyncio`` installs ``nest_asyncio``.  Probing the raw
    distribution name would raise ImportError for such packages on every run
    and trigger a needless ``pip install``.

    Args:
        package: Requirement string exactly as it would be given to
            ``pip install`` (e.g. ``"crawl4ai>=0.7.4"``).

    Returns:
        None.  The module is probed, not bound to a name.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits non-zero.
    """
    # Distributions whose importable module cannot be derived by replacing
    # dashes with underscores.
    module_aliases = {
        "beautifulsoup4": "bs4",
        "pillow": "PIL",
        "pyyaml": "yaml",
        "scikit-learn": "sklearn",
        "opencv-python": "cv2",
    }

    dist_name = package.strip()
    for separator in ("==", ">=", "<=", "~=", "!=", ">", "<", "[", ";", " "):
        dist_name = dist_name.split(separator)[0]
    dist_name = dist_name.strip()

    module_name = module_aliases.get(dist_name.lower(), dist_name.replace("-", "_"))

    try:
        importlib.import_module(module_name)
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        # Let the import system notice the freshly installed files so the
        # imports that follow in the same kernel session succeed.
        importlib.invalidate_caches()

# Install required packages for Colab
# Each entry is a pip requirement string handed unchanged to
# install_and_import(); only "crawl4ai" carries a minimum version.
# NOTE(review): "aiohttp" and "lxml" are not imported by name anywhere in the
# visible part of this cell - presumably indirect dependencies; confirm.
required_packages = [
    "crawl4ai>=0.7.4",
    "beautifulsoup4",
    "pandas",
    "nest-asyncio",
    "aiohttp",
    "lxml"
]

# This loop runs before the third-party imports further down, so a missing
# package is installed before the cell tries to import it.
print("🔧 Setting up environment for Google Colab...")
for package in required_packages:
    install_and_import(package)

import asyncio
import json
import pandas as pd
import time
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass, asdict, field
from urllib.parse import urljoin, urlparse
import logging
import re
from bs4 import BeautifulSoup
import nest_asyncio

# Essential imports for Crawl4AI 0.7.4+
try:
    from crawl4ai import AsyncWebCrawler
    from crawl4ai.extraction_strategy import LLMExtractionStrategy, LLMConfig
    from crawl4ai.chunking_strategy import RegexChunking
    print("✅ Crawl4AI imports successful")
except ImportError as e:
    print(f"❌ Crawl4AI import failed: {e}")
    print("Installing crawl4ai...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "crawl4ai", "--upgrade"])
    from crawl4ai import AsyncWebCrawler
    from crawl4ai.extraction_strategy import LLMExtractionStrategy, LLMConfig
    from crawl4ai.chunking_strategy import RegexChunking

# Apply nest_asyncio for Jupyter/Colab compatibility
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and the scraper's coroutines must be driven from inside it.
nest_asyncio.apply()

# Configure logging
# INFO level, "timestamp - level - message" line format.  The classes defined
# below all log through the module-level `logger` created here.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

@dataclass
class SubCategory:
    """One entry of the navigation menu below a main category.

    Instances are created by
    ``StarTechCategoryExtractor._extract_subcategories_recursive`` and form a
    tree through ``children``.
    """
    # Link text of the menu entry.
    name: str
    # Link target, joined onto the extractor's base URL.
    url: str
    # Depth in the menu: entries directly below a main category get 2, each
    # nesting step adds 1.
    level: int
    # Name of the entry one level up (main category or enclosing subcategory).
    parent: str
    # Entries nested one level deeper; a fresh empty list per instance.
    children: List['SubCategory'] = field(default_factory=list)

@dataclass
class Category:
    """A main (top-level) entry of the navigation menu.

    Instances are created by
    ``StarTechCategoryExtractor._extract_category_from_nav_item``.
    """
    # Link text of the main menu entry.
    name: str
    # Link target, joined onto the extractor's base URL.
    url: str
    # Depth in the menu; the extractor always passes 1 for main categories.
    level: int
    # Direct subcategories; a fresh empty list per instance.
    subcategories: List[SubCategory] = field(default_factory=list)
    # NOTE(review): never assigned in the visible code, so it stays 0.
    product_count: int = 0

@dataclass
class Product:
    """One product card scraped from a category listing page.

    Instances are created by ``StarTechProductScraper._extract_single_product``.
    All scraped values are kept as the text found on the page; nothing is
    parsed into numbers.  Unless noted otherwise, a field is the empty string
    when no matching element was found.
    """
    # Product title; a Product is only created when this is longer than 2 characters.
    name: str
    # Displayed price text, or "N/A" when no price element matched.
    price: str
    # Pre-discount price text.
    original_price: str
    # Discount/offer badge text.
    discount: str
    # Model / SKU text.
    model: str
    # Brand / manufacturer text.
    brand: str
    # Stock status text, or "N/A" when no stock element matched.
    availability: str
    # Rating text.
    rating: str
    # Review-count text.
    review_count: str
    # URL of the first <img> in the card, joined onto the base URL.
    image_url: str
    # URL of the first <a href> in the card, joined onto the base URL.
    product_url: str
    # "key: value" pairs found in the card's specification elements.
    specifications: Dict[str, Any]
    # Name of the main category the listing page was scraped for.
    category: str
    # Name of the subcategory, or "" when the main category page was scraped.
    subcategory: str
    # Short description text.
    description: str

class StarTechCategoryExtractor:
    """Extract all categories and subcategories from StarTech navigation.

    The home page is fetched with Crawl4AI, the returned HTML is parsed with
    BeautifulSoup, and the nested ``<li>``/``<ul>`` menu is turned first into
    ``Category``/``SubCategory`` objects and then into plain dictionaries
    (nested tree, flat list and summary statistics).
    """

    def __init__(self, base_url: str = "https://www.startech.com.bd"):
        """Store the site root used both as crawl target and to absolutise links.

        Args:
            base_url: Root URL of the shop.
        """
        self.base_url = base_url
        self.all_categories = []
        self.category_hierarchy = {}

    async def extract_all_categories(self) -> Dict[str, Any]:
        """Crawl the home page and return the complete category hierarchy.

        Returns:
            The dictionary built by ``_build_category_hierarchy`` on success,
            or an empty dict when the crawl fails or any exception is raised
            (the error is logged, never propagated).
        """
        logger.info("🔍 Starting complete category extraction...")

        # Crawl4AI 0.7.4+ configuration
        crawler_config = {
            'headless': True,
            'browser_type': 'chromium',
            'verbose': False
        }

        async with AsyncWebCrawler(**crawler_config) as crawler:
            try:
                # Simple crawl without LLM first (more reliable)
                result = await crawler.arun(
                    url=self.base_url,
                    word_count_threshold=10,
                    bypass_cache=True,
                    wait_for="css:nav, .navigation, .navbar",
                    delay_before_return_html=2.0
                )

                if result.success:
                    # Parse with BeautifulSoup (more reliable than LLM for structure)
                    soup = BeautifulSoup(result.html, 'html.parser')
                    categories = self._parse_navigation_structure(soup)

                    logger.info(f"✅ Extracted {len(categories)} main categories")
                    return self._build_category_hierarchy(categories)
                else:
                    logger.error(f"❌ Failed to crawl main page: {result.error_message}")
                    return {}

            except Exception as e:
                logger.error(f"❌ Error during category extraction: {e}")
                return {}

    @staticmethod
    def _keep_outermost(elements) -> list:
        """Return ``elements`` without those nested inside another listed element.

        The menu selectors match at every depth: a dropdown entry that itself
        has children carries the same classes as a main entry.  Treating such
        a nested match as a main category would report it twice - once as a
        subcategory of its real parent and once as a bogus level-1 category.

        Args:
            elements: Parsed tags, each exposing a ``parents`` iterable.

        Returns:
            The elements, in their original order, that have no ancestor in
            ``elements``.
        """
        # Compare by identity: two distinct tags with identical markup must
        # not be mistaken for each other.
        matched_ids = {id(element) for element in elements}
        return [
            element for element in elements
            if not any(id(ancestor) in matched_ids for ancestor in element.parents)
        ]

    def _parse_navigation_structure(self, soup: "BeautifulSoup") -> List["Category"]:
        """Locate the main menu entries in ``soup`` and convert each to a Category.

        Selectors are tried in order and the first one yielding any match wins;
        a second list of looser fallback selectors is tried only when none of
        the primary ones match.  Matches nested inside another match are
        discarded so that only real top-level entries become main categories.

        Args:
            soup: Parsed home page.

        Returns:
            One ``Category`` per usable main menu entry (possibly empty).
        """
        categories = []

        # Enhanced selectors based on your HTML structure
        nav_selectors = [
            'li.nav-item.has-child',  # Exact match from your HTML
            '.nav-item.has-child',
            'nav .nav-item.has-child',
            '.navbar .nav-item.has-child',
            '.main-menu .nav-item.has-child',
            'ul.menu > li.has-child'
        ]

        main_nav_items = None
        for selector in nav_selectors:
            main_nav_items = self._keep_outermost(soup.select(selector))
            if main_nav_items:
                logger.info(f"✅ Found {len(main_nav_items)} categories using selector: {selector}")
                break

        if not main_nav_items:
            logger.warning("⚠️ No main navigation items found, trying fallback selectors")
            # Fallback selectors
            fallback_selectors = ['li:has(ul)', '.dropdown > li', 'li.dropdown']
            for selector in fallback_selectors:
                main_nav_items = self._keep_outermost(soup.select(selector))
                if main_nav_items:
                    logger.info(f"✅ Found categories using fallback selector: {selector}")
                    break

        for nav_item in main_nav_items:
            category = self._extract_category_from_nav_item(nav_item)
            if category:
                categories.append(category)

        logger.info(f"📊 Parsed {len(categories)} categories from navigation")
        return categories

    def _extract_category_from_nav_item(self, nav_item) -> Optional["Category"]:
        """Build a Category (with its whole subtree) from one main menu ``<li>``.

        Args:
            nav_item: The ``<li>`` tag of a main menu entry.

        Returns:
            The populated ``Category``, or ``None`` when the entry has no link,
            its link text is shorter than two characters, or an exception
            occurs (which is logged).
        """
        try:
            # Get main category link - matches your HTML structure
            main_link = nav_item.find('a', class_='nav-link')
            if not main_link:
                # Fallback
                main_link = nav_item.find('a')

            if not main_link:
                return None

            category_name = main_link.get_text(strip=True)
            category_url = urljoin(self.base_url, main_link.get('href', ''))

            # Skip empty categories
            if not category_name or len(category_name.strip()) < 2:
                return None

            category = Category(
                name=category_name,
                url=category_url,
                level=1,
                subcategories=[]
            )

            # Extract subcategories - matches your HTML structure
            dropdown_menu = nav_item.find('ul', class_='drop-down')
            if not dropdown_menu:
                dropdown_menu = nav_item.find('ul')  # Fallback

            if dropdown_menu:
                subcategories = self._extract_subcategories_recursive(
                    dropdown_menu, category_name, level=2
                )
                category.subcategories = subcategories
                logger.info(f"📁 {category_name}: {len(subcategories)} subcategories")

            return category

        except Exception as e:
            logger.error(f"❌ Error extracting category: {e}")
            return None

    def _extract_subcategories_recursive(self, menu_element, parent_name: str, level: int) -> List["SubCategory"]:
        """Convert the direct ``<li>`` children of a dropdown ``<ul>`` to SubCategory objects.

        Args:
            menu_element: The ``<ul>`` whose direct children are inspected.
            parent_name: Name recorded as ``parent`` on every created entry.
            level: Depth assigned to the created entries.

        Returns:
            The subcategories of this menu, each carrying its own ``children``.
            Entries without a link, with a name shorter than two characters, or
            whose name contains "show all" / "see all" / "view all" are skipped.
        """
        subcategories = []

        # Find nav items - matches your HTML structure
        nav_items = menu_element.find_all('li', class_='nav-item', recursive=False)
        if not nav_items:
            nav_items = menu_element.find_all('li', recursive=False)  # Fallback

        for nav_item in nav_items:
            link = nav_item.find('a', class_='nav-link')
            if not link:
                link = nav_item.find('a')  # Fallback

            if not link:
                continue

            name = link.get_text(strip=True)
            url = urljoin(self.base_url, link.get('href', ''))

            # Skip "Show All" links and empty names
            if ('show all' in name.lower() or 'see all' in name.lower() or
                'view all' in name.lower() or len(name.strip()) < 2):
                continue

            subcategory = SubCategory(
                name=name,
                url=url,
                level=level,
                parent=parent_name,
                children=[]
            )

            # Check for deeper nested menus (drop-menu-2, drop-menu-3, etc.)
            nested_menu = nav_item.find('ul', class_='drop-down')
            if not nested_menu:
                nested_menu = nav_item.find('ul')  # Fallback

            # Depth cap: entries at level 4 are recorded but not descended into.
            if nested_menu and level < 4:  # Prevent infinite recursion
                children = self._extract_subcategories_recursive(
                    nested_menu, name, level + 1
                )
                subcategory.children = children

            subcategories.append(subcategory)

        return subcategories

    def _build_category_hierarchy(self, categories: List["Category"]) -> Dict[str, Any]:
        """Convert Category objects into the dictionary returned to callers.

        Args:
            categories: Main categories with their subcategory trees.

        Returns:
            A dict with keys ``total_categories`` (number of main categories),
            ``categories`` (nested dicts), ``flat_list`` (every entry of every
            level in depth-first order with ``name``/``url``/``level``/
            ``parent``) and ``summary`` (see ``_generate_summary``).
        """
        hierarchy = {
            'total_categories': len(categories),
            'categories': [],
            'flat_list': [],
            'summary': {}
        }

        for category in categories:
            cat_dict = {
                'name': category.name,
                'url': category.url,
                'level': category.level,
                'subcategories': self._subcategories_to_dict(category.subcategories),
                'total_subcategories': self._count_total_subcategories(category.subcategories)
            }

            hierarchy['categories'].append(cat_dict)

            # Add to flat list for easy access
            hierarchy['flat_list'].append({
                'name': category.name,
                'url': category.url,
                'level': 1,
                'parent': None
            })

            # Add subcategories to flat list
            self._add_subcategories_to_flat_list(
                category.subcategories, hierarchy['flat_list'], category.name
            )

        # Generate summary
        hierarchy['summary'] = self._generate_summary(hierarchy)

        return hierarchy

    def _subcategories_to_dict(self, subcategories: List["SubCategory"]) -> List[Dict]:
        """Recursively convert SubCategory objects into nested dictionaries.

        Returns:
            One dict per subcategory with keys ``name``, ``url``, ``level``,
            ``parent`` and ``children`` (a list of the same shape).
        """
        result = []
        for sub in subcategories:
            sub_dict = {
                'name': sub.name,
                'url': sub.url,
                'level': sub.level,
                'parent': sub.parent,
                'children': self._subcategories_to_dict(sub.children) if sub.children else []
            }
            result.append(sub_dict)
        return result

    def _count_total_subcategories(self, subcategories: List["SubCategory"]) -> int:
        """Count the given subcategories plus all of their descendants."""
        total = len(subcategories)
        for sub in subcategories:
            total += self._count_total_subcategories(sub.children)
        return total

    def _add_subcategories_to_flat_list(self, subcategories: List["SubCategory"], flat_list: List, parent: str):
        """Append every subcategory and its descendants to ``flat_list`` in place.

        Args:
            subcategories: Entries to append, depth-first.
            flat_list: List that receives one dict per entry (mutated).
            parent: Name stored as ``parent`` for the entries of this level;
                descendants get the name of their own direct parent.
        """
        for sub in subcategories:
            flat_list.append({
                'name': sub.name,
                'url': sub.url,
                'level': sub.level,
                'parent': parent
            })

            if sub.children:
                self._add_subcategories_to_flat_list(sub.children, flat_list, sub.name)

    def _generate_summary(self, hierarchy: Dict) -> Dict:
        """Compute summary statistics from ``hierarchy['flat_list']``.

        Returns:
            A dict with ``total_items``, ``level_distribution`` (mapping
            ``"level_<n>"`` to a count) and ``deepest_level`` (0 when the flat
            list is empty).
        """
        flat_list = hierarchy['flat_list']

        level_counts = {}
        for item in flat_list:
            level_key = f"level_{item['level']}"
            level_counts[level_key] = level_counts.get(level_key, 0) + 1

        return {
            'total_items': len(flat_list),
            'level_distribution': level_counts,
            'deepest_level': max((item['level'] for item in flat_list), default=0)
        }

class StarTechProductScraper:
    """Scrape products using Crawl4AI with fixed configuration.

    Listing pages are fetched with Crawl4AI and the product cards are parsed
    from the returned HTML with BeautifulSoup, trying several CSS selectors per
    field and taking the first one that yields text.
    """

    def __init__(self, base_url: str = "https://www.startech.com.bd", max_pages: int = 5):
        """Configure the scraper.

        Args:
            base_url: Site root used to absolutise image and product links.
            max_pages: Upper bound on the number of listing pages fetched per
                category.
        """
        self.base_url = base_url
        self.products = []
        self.max_pages = max_pages

    async def scrape_category_products(self, category_url: str, category_name: str,
                                     subcategory_name: str = "") -> List["Product"]:
        """Scrape the listing pages of one category.

        Pages are fetched in order starting at 1.  Paging stops at the first
        page that fails to load, raises, yields no products, or has no
        detectable "next" link, and in any case after ``self.max_pages`` pages.

        Args:
            category_url: URL of the first listing page.
            category_name: Stored in ``Product.category``.
            subcategory_name: Stored in ``Product.subcategory``.

        Returns:
            All products collected before paging stopped (possibly empty).
        """
        logger.info(f"🛍️ Scraping products from {category_name}/{subcategory_name}")

        products = []

        # Crawl4AI 0.7.4+ configuration
        crawler_config = {
            'headless': True,
            'browser_type': 'chromium',
            'verbose': False
        }

        async with AsyncWebCrawler(**crawler_config) as crawler:

            for page in range(1, self.max_pages + 1):
                page_url = self._construct_page_url(category_url, page)

                logger.info(f"📄 Scraping page {page}: {page_url}")

                try:
                    # Use simple crawling without LLM (more reliable and faster)
                    result = await crawler.arun(
                        url=page_url,
                        word_count_threshold=10,
                        bypass_cache=True,
                        wait_for="css:.product-item, .p-item, .product-card",
                        delay_before_return_html=2.0
                    )

                    if result.success:
                        # Parse with BeautifulSoup
                        soup = BeautifulSoup(result.html, 'html.parser')
                        page_products = self._extract_products_from_html(
                            soup, category_name, subcategory_name
                        )

                        if not page_products:
                            logger.info(f"⚠️ No products found on page {page}")
                            break

                        products.extend(page_products)
                        logger.info(f"✅ Extracted {len(page_products)} products from page {page}")

                        # Add delay between pages
                        await asyncio.sleep(1)

                        # Check if this is the last page
                        if not self._has_next_page(soup):
                            logger.info(f"📄 Reached last page at page {page}")
                            break
                    else:
                        logger.error(f"❌ Failed to crawl page {page}: {result.error_message}")
                        break

                except Exception as e:
                    logger.error(f"❌ Error scraping page {page}: {e}")
                    break

        return products

    def _construct_page_url(self, base_url: str, page: int) -> str:
        """Return the URL of listing page ``page``.

        Page 1 is the URL unchanged.  For later pages a ``page=<n>`` parameter
        is added to the query string.  The URL is split into components first
        so the parameter lands in the query even when the URL ends in a
        ``#fragment``; naive string concatenation would put it after the
        fragment, where the server never sees it.

        Args:
            base_url: URL of the first listing page.
            page: 1-based page number.
        """
        if page == 1:
            return base_url

        # StarTech pagination patterns
        parts = urlparse(base_url)
        query = f"{parts.query}&page={page}" if parts.query else f"page={page}"
        return parts._replace(query=query).geturl()

    def _extract_products_from_html(self, soup: "BeautifulSoup", category: str, subcategory: str) -> List["Product"]:
        """Find the product cards on a listing page and convert each to a Product.

        The card selectors are tried in order; the first one matching anything
        decides which elements count as product cards.

        Returns:
            The products of the page; empty when no selector matches or no
            card has a usable name.
        """
        products = []

        # StarTech-specific product selectors (based on common patterns)
        product_selectors = [
            '.p-item',              # Most common StarTech selector
            '.product-item',        # Alternative selector
            '.main-product',        # Another common selector
            '.product-layout',      # Layout-based selector
            '.product-card',        # Card-based layout
            '[data-product-id]',    # Data attribute selector
            '.product-box'          # Box-based layout
        ]

        product_elements = []
        for selector in product_selectors:
            elements = soup.select(selector)
            if elements:
                product_elements = elements
                logger.info(f"🎯 Found {len(elements)} products using selector: {selector}")
                break

        if not product_elements:
            logger.warning("⚠️ No product elements found with any selector")
            return products

        for element in product_elements:
            product = self._extract_single_product(element, category, subcategory)
            if product:
                products.append(product)

        return products

    def _extract_single_product(self, element, category: str, subcategory: str) -> Optional["Product"]:
        """Build a Product from one product card element.

        Args:
            element: Tag of the product card.
            category: Stored in ``Product.category``.
            subcategory: Stored in ``Product.subcategory``.

        Returns:
            The ``Product``, or ``None`` when no name longer than two
            characters is found or an exception occurs (which is logged).
        """
        try:
            # Product name - StarTech specific selectors
            name_selectors = [
                '.p-item-name a',       # StarTech specific
                '.p-item-name',         # Alternative
                '.product-title a',     # Generic
                '.product-name a',      # Generic
                'h4 a', 'h3 a', 'h2 a', # Header links
                '.title a'              # Generic title
            ]
            name = self._get_text_by_selectors(element, name_selectors)

            # Prices - StarTech specific
            price_selectors = [
                '.p-item-price',        # StarTech specific
                '.current-price',       # Current price
                '.price',               # Generic price
                '.product-price',       # Product price
                '.price-current'        # Current price variant
            ]
            price = self._get_text_by_selectors(element, price_selectors)

            # Original price (for discounts)
            original_price_selectors = [
                '.p-item-price .old-price',  # StarTech specific
                '.old-price',                # Generic
                '.original-price',           # Original
                '.was-price',                # Was price
                '.price-old'                 # Old price
            ]
            original_price = self._get_text_by_selectors(element, original_price_selectors)

            # Discount/Offer
            discount_selectors = [
                '.discount-badge', '.offer-badge', '.sale-badge',
                '.discount', '.offer', '.sale'
            ]
            discount = self._get_text_by_selectors(element, discount_selectors)

            # Model/SKU
            model_selectors = [
                '.p-item-model', '.model', '.sku', '.product-code', '.part-number'
            ]
            model = self._get_text_by_selectors(element, model_selectors)

            # Brand
            brand_selectors = [
                '.p-item-brand', '.brand', '.manufacturer', '.brand-name'
            ]
            brand = self._get_text_by_selectors(element, brand_selectors)

            # Availability/Stock
            availability_selectors = [
                '.p-item-stock', '.stock-status', '.availability',
                '.in-stock', '.out-of-stock', '.stock'
            ]
            availability = self._get_text_by_selectors(element, availability_selectors)

            # Rating
            rating_selectors = [
                '.rating', '.stars', '.review-rating', '.product-rating'
            ]
            rating = self._get_text_by_selectors(element, rating_selectors)

            # Review count
            review_selectors = [
                '.review-count', '.reviews-count', '.rating-count'
            ]
            review_count = self._get_text_by_selectors(element, review_selectors)

            # Image: first <img>, preferring the eager source over lazy-load attributes
            img = element.find('img')
            image_url = ""
            if img:
                src = img.get('src') or img.get('data-src') or img.get('data-lazy-src')
                if src:
                    image_url = urljoin(self.base_url, src)

            # Product URL: first link in the card that has an href
            link = element.find('a', href=True)
            product_url = ""
            if link:
                product_url = urljoin(self.base_url, link.get('href', ''))

            # Description
            desc_selectors = [
                '.p-item-desc', '.description', '.product-desc', '.short-desc'
            ]
            description = self._get_text_by_selectors(element, desc_selectors)

            # Basic specifications
            specifications = self._extract_specifications(element)

            # Only create product if we have essential data
            if name and len(name.strip()) > 2:
                return Product(
                    name=name.strip(),
                    price=price.strip() if price else "N/A",
                    original_price=original_price.strip() if original_price else "",
                    discount=discount.strip() if discount else "",
                    model=model.strip() if model else "",
                    brand=brand.strip() if brand else "",
                    availability=availability.strip() if availability else "N/A",
                    rating=rating.strip() if rating else "",
                    review_count=review_count.strip() if review_count else "",
                    image_url=image_url,
                    product_url=product_url,
                    specifications=specifications,
                    category=category,
                    subcategory=subcategory,
                    description=description.strip() if description else ""
                )

        except Exception as e:
            logger.error(f"❌ Error extracting product: {e}")

        return None

    def _get_text_by_selectors(self, element, selectors: List[str]) -> str:
        """Return the stripped text of the first selector that yields any text.

        Args:
            element: Tag searched with ``select_one``.
            selectors: CSS selectors, tried in order.

        Returns:
            The first non-empty text found, or ``""`` when every selector
            either matches nothing or matches an element without text.
        """
        for selector in selectors:
            found = element.select_one(selector)
            if found:
                text = found.get_text(strip=True)
                if text:  # Only return non-empty text
                    return text
        return ""

    def _extract_specifications(self, element) -> Dict[str, str]:
        """Collect ``key: value`` pairs from the card's specification elements.

        Each matching element's text is split at its first colon; elements
        without a colon are ignored.  A repeated key keeps the last value.
        """
        specs = {}

        # Look for specification elements
        spec_elements = element.select('.spec, .specification, .feature, .p-item-spec')
        for spec_element in spec_elements:
            text = spec_element.get_text(strip=True)
            if ':' in text:
                key, value = text.split(':', 1)
                specs[key.strip()] = value.strip()

        return specs

    def _has_next_page(self, soup: "BeautifulSoup") -> bool:
        """Report whether the listing page links to a following page.

        Two detection steps are used.  First a set of structural selectors
        (``.next`` classes, ``aria-label="Next"``, ``rel="next"``).  Then, for
        paginators that mark the link only by its caption, any pagination
        anchor with an ``href`` whose text is a common "next" label.  Without
        the second step such paginators are never followed and scraping stops
        after the first page.

        Args:
            soup: Parsed listing page.
        """
        next_selectors = [
            '.pagination .next:not(.disabled)',
            '.pagination a[aria-label="Next"]',
            '.next-page:not(.disabled)',
            '.pager .next:not(.disabled)',
            'a[rel~="next"]'
        ]

        for selector in next_selectors:
            if soup.select(selector):
                return True

        # Caption-based fallback; only real links count, so a "next" caption
        # rendered as a disabled non-link on the last page is ignored.
        next_labels = {'next', 'next page', '>', '›', '»', '→'}
        for anchor in soup.select('.pagination a[href], .pager a[href]'):
            label = anchor.get_text(strip=True).lower()
            if label in next_labels and 'disabled' not in (anchor.get('class') or []):
                return True

        return False

class StarTechCompleteScraper:
    """Main scraper class - Colab optimized.

    Orchestrates the two stages: extract the category hierarchy, then scrape
    product listings for the first few categories and their first few
    subcategories.  Results are kept on the instance (``categories_data``,
    ``products_data``) and can be written to disk with ``save_results``.
    """

    def __init__(self, base_url: str = "https://www.startech.com.bd"):
        """Create the category extractor and product scraper for ``base_url``."""
        self.base_url = base_url
        self.category_extractor = StarTechCategoryExtractor(base_url)
        self.product_scraper = StarTechProductScraper(base_url)
        self.categories_data = {}
        self.products_data = []

    async def run_complete_scraping(self, max_categories: int = 10,
                                    max_subcategories: int = 5) -> Dict[str, Any]:
        """Run complete scraping process.

        Args:
            max_categories: How many main categories (in menu order) to scrape
                products for.
            max_subcategories: How many subcategories (in menu order) to
                scrape per main category.

        Returns:
            The report built by ``_generate_final_report``, or an empty dict
            when no categories were found or an unexpected exception occurred.
            Failures of individual categories or subcategories are printed
            and skipped.
        """
        print("🚀 STARTING STARTECH COMPLETE SCRAPER")
        print("=" * 60)

        try:
            # Step 1: Extract all categories
            print("\n📁 STEP 1: Extracting All Categories & Subcategories...")
            print("-" * 50)

            self.categories_data = await self.category_extractor.extract_all_categories()

            if not self.categories_data.get('categories'):
                print("❌ No categories found! Check the website structure.")
                return {}

            self._print_category_hierarchy()

            # Step 2: Get the leading categories for product scraping
            top_categories = self._get_top_categories(max_categories)
            # Progress counters show the number actually processed, which is
            # smaller than the limit when the site has fewer categories.
            category_total = len(top_categories)

            print(f"\n🛍️ STEP 2: Scraping Products from Top {category_total} Categories...")
            print("-" * 50)

            for i, category in enumerate(top_categories, 1):
                print(f"\n--- Processing Category {i}/{category_total}: {category['name']} ---")

                # Scrape main category products
                try:
                    main_products = await self.product_scraper.scrape_category_products(
                        category['url'], category['name']
                    )
                    self.products_data.extend(main_products)
                    print(f"✅ Main category: {len(main_products)} products")
                except Exception as e:
                    print(f"❌ Error in main category: {e}")

                # Scrape subcategories (limited to the first few)
                subcategories = category.get('subcategories', [])[:max_subcategories]
                subcategory_total = len(subcategories)
                for j, sub in enumerate(subcategories, 1):
                    try:
                        print(f"  📂 Processing subcategory {j}/{subcategory_total}: {sub['name']}")
                        sub_products = await self.product_scraper.scrape_category_products(
                            sub['url'], category['name'], sub['name']
                        )
                        self.products_data.extend(sub_products)
                        print(f"     ✅ {len(sub_products)} products")
                    except Exception as e:
                        print(f"     ❌ Error in subcategory: {e}")

                print(f"📊 Total products so far: {len(self.products_data)}")

            # Step 3: Generate final report
            return self._generate_final_report()

        except Exception as e:
            logger.error(f"❌ Complete scraping failed: {e}")
            print(f"❌ SCRAPING FAILED: {e}")
            return {}

    def _print_category_hierarchy(self):
        """Print summary statistics and a truncated view of the category tree.

        Per main category at most three subcategories are listed, each with at
        most two of its children.  Nothing is printed when no category data
        has been collected yet.
        """
        if not self.categories_data:
            return

        print(f"\n📊 STARTECH CATEGORY HIERARCHY EXTRACTED")
        print("=" * 60)

        summary = self.categories_data.get('summary', {})
        print(f"📈 Total Categories/Subcategories: {summary.get('total_items', 0)}")
        print(f"🏗️ Maximum Depth Level: {summary.get('deepest_level', 0)}")

        level_dist = summary.get('level_distribution', {})
        for level, count in level_dist.items():
            # Keys have the form "level_<n>"
            level_num = level.split('_')[1]
            print(f"   📁 Level {level_num}: {count} items")

        print(f"\n📂 HIERARCHICAL STRUCTURE:")
        print("-" * 40)

        categories = self.categories_data.get('categories', [])
        for i, category in enumerate(categories, 1):
            sub_count = category['total_subcategories']
            print(f"\n{i:2d}. 🏷️ {category['name']} ({sub_count} subcategories)")
            print(f"     🔗 {category['url']}")

            # Show first few subcategories
            subcats = category['subcategories'][:3]  # Show first 3
            for sub in subcats:
                print(f"     ├─ {sub['name']}")
                # Show children if any
                if sub.get('children'):
                    for child in sub['children'][:2]:  # Show first 2 children
                        print(f"     │  └─ {child['name']}")

            if len(category['subcategories']) > 3:
                remaining = len(category['subcategories']) - 3
                print(f"     └─ ... and {remaining} more subcategories")

    def _get_top_categories(self, count: int = 10) -> List[Dict]:
        """Return the first ``count`` main categories in menu order."""
        categories = self.categories_data.get('categories', [])
        return categories[:count]

    def _generate_final_report(self) -> Dict[str, Any]:
        """Assemble the report dictionary from the data collected so far.

        The timestamp is taken at call time, so every call produces a new
        ``scraping_timestamp``.
        """
        return {
            'scraping_timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
            'categories_summary': self.categories_data.get('summary', {}),
            'total_categories_found': len(self.categories_data.get('categories', [])),
            'total_products_scraped': len(self.products_data),
            'products_by_category': self._group_products_by_category(),
            'categories_hierarchy': self.categories_data,
            'products_data': [asdict(product) for product in self.products_data]
        }

    def _group_products_by_category(self) -> Dict[str, int]:
        """Count products per ``"category"`` or ``"category/subcategory"`` key."""
        category_counts = {}
        for product in self.products_data:
            key = f"{product.category}"
            if product.subcategory:
                key += f"/{product.subcategory}"
            category_counts[key] = category_counts.get(key, 0) + 1
        return category_counts

    def save_results(self, base_filename: str = "startech_complete"):
        """Save all results to files - Colab optimized.

        Writes ``<base>_categories.json`` and ``<base>_complete_report.json``
        always, and ``<base>_products.csv`` / ``<base>_products.json`` only
        when products were scraped.

        Args:
            base_filename: Common prefix (may include a directory) of the
                output files.

        Returns:
            ``True`` when every file was written, ``False`` when any step
            raised (the error is printed; files written before the failure
            remain on disk).
        """
        try:
            # Save categories hierarchy
            categories_file = f"{base_filename}_categories.json"
            with open(categories_file, 'w', encoding='utf-8') as f:
                json.dump(self.categories_data, f, indent=2, ensure_ascii=False)
            print(f"✅ Saved categories to: {categories_file}")

            # Save products data
            if self.products_data:
                # CSV format
                products_csv = f"{base_filename}_products.csv"
                df = pd.DataFrame([asdict(product) for product in self.products_data])
                df.to_csv(products_csv, index=False)
                print(f"✅ Saved products CSV to: {products_csv}")

                # JSON format
                products_json = f"{base_filename}_products.json"
                with open(products_json, 'w', encoding='utf-8') as f:
                    json.dump([asdict(product) for product in self.products_data],
                             f, indent=2, ensure_ascii=False)
                print(f"✅ Saved products JSON to: {products_json}")

            # Save complete report
            report_file = f"{base_filename}_complete_report.json"
            report = self._generate_final_report()
            with open(report_file, 'w', encoding='utf-8') as f:
                json.dump(report, f, indent=2, ensure_ascii=False)
            print(f"✅ Saved complete report to: {report_file}")

            return True

        except Exception as e:
            print(f"❌ Error saving files: {e}")
            return False

# Main execution functions for Google Colab
async def run_scraper_async():
    """Run the complete scrape, save the results and print a console summary.

    Creates a ``StarTechCompleteScraper``, awaits ``run_complete_scraping()``,
    writes the output files through ``save_results()`` and then prints totals,
    the per-level category breakdown, the files that were written, the ten
    largest categories by product count and up to five sample products.

    Returns:
        The final report returned by ``run_complete_scraping()``, or ``None``
        when that report is empty/falsy or any exception is raised (the error
        is printed and logged instead of being propagated).
    """
    print("🎯 STARTECH COMPLETE SCRAPER - COLAB EDITION")
    print("🔧 Optimized for Crawl4AI 0.7.4+")
    print("=" * 60)

    scraper = StarTechCompleteScraper()
    # Same value as the save_results() default; kept in one place so the file
    # names printed below cannot drift from the names actually written.
    base_filename = "startech_complete"

    try:
        # Run complete scraping
        start_time = time.time()
        final_report = await scraper.run_complete_scraping()
        # The timer stops here, so "Total Time" covers scraping only, not saving.
        end_time = time.time()

        if not final_report:
            print("❌ Scraping failed - no data collected")
            return None

        # Save results
        print(f"\n💾 SAVING RESULTS...")
        print("-" * 30)
        success = scraper.save_results(base_filename)

        if not success:
            print("⚠️ Some files may not have been saved properly")

        # Print final summary
        elapsed_time = end_time - start_time
        print(f"\n🎉 SCRAPING COMPLETED SUCCESSFULLY!")
        print("=" * 60)
        print(f"⏱️ Total Time: {elapsed_time:.1f} seconds")
        print(f"📊 Total Categories Found: {final_report.get('total_categories_found', 0)}")
        print(f"🛍️ Total Products Scraped: {final_report.get('total_products_scraped', 0)}")
        print(f"⏰ Completed At: {final_report.get('scraping_timestamp', 'Unknown')}")

        # Show category summary
        categories_summary = final_report.get('categories_summary', {})
        print(f"\n📁 CATEGORY BREAKDOWN:")
        level_dist = categories_summary.get('level_distribution', {})
        for level, count in level_dist.items():
            # Keys are expected to look like "level_1". Fall back to the whole
            # key when there is no underscore, so an unexpected key cannot
            # raise and turn a finished scrape into a reported failure.
            parts = str(level).split('_')
            level_num = parts[1] if len(parts) > 1 else parts[0]
            print(f"   📂 Level {level_num}: {count} categories")

        # Only advertise files that were actually written: nothing is listed
        # when save_results() reported a failure, and the product files are
        # skipped by save_results() when no products were scraped.
        if success:
            print(f"\n📋 FILES GENERATED:")
            print(f"   📄 {base_filename}_categories.json - Complete category hierarchy")
            if scraper.products_data:
                print(f"   📊 {base_filename}_products.csv - Products in CSV format")
                print(f"   📋 {base_filename}_products.json - Products in JSON format")
            print(f"   📈 {base_filename}_complete_report.json - Complete scraping report")

        # Show top categories by product count
        products_by_cat = final_report.get('products_by_category', {})
        if products_by_cat:
            print(f"\n🏆 TOP CATEGORIES BY PRODUCT COUNT:")
            sorted_categories = sorted(products_by_cat.items(), key=lambda x: x[1], reverse=True)
            for i, (category, count) in enumerate(sorted_categories[:10], 1):
                print(f"   {i:2d}. {category}: {count} products")

        # Show sample products
        if scraper.products_data:
            print(f"\n🛍️ SAMPLE PRODUCTS:")
            print("-" * 30)
            for i, product in enumerate(scraper.products_data[:5], 1):
                print(f"{i}. 📱 {product.name}")
                print(f"   💰 Price: {product.price}")
                print(f"   📂 Category: {product.category}")
                if product.subcategory:
                    print(f"   📁 Subcategory: {product.subcategory}")
                print(f"   🔗 {product.product_url}")
                print()

        return final_report

    except Exception as e:
        # Best-effort entry point for interactive use: report and return None.
        print(f"❌ SCRAPING FAILED: {e}")
        logger.error(f"Complete scraping failed: {e}")
        return None

def run_scraper():
    """Blocking entry point: drive ``run_scraper_async()`` to completion and return its result."""
    scrape_job = run_scraper_async()
    return asyncio.run(scrape_job)

# Quick category-only extraction for testing
async def extract_categories_only():
    """Run just the category extractor, save its output and print a short summary.

    On success the extracted data is written to ``startech_categories_only.json``
    in the current directory and returned. When the extractor yields nothing,
    a failure message is printed and ``None`` is returned.
    """
    print("🔍 EXTRACTING CATEGORIES ONLY (QUICK TEST)")
    print("=" * 50)

    categories_data = await StarTechCategoryExtractor().extract_all_categories()

    # Bail out early when nothing came back.
    if not categories_data:
        print("❌ Failed to extract categories")
        return None

    top_level = categories_data.get('categories', [])
    print(f"✅ Successfully extracted {len(top_level)} categories")

    # Persist only the category data
    with open("startech_categories_only.json", 'w', encoding='utf-8') as out_file:
        json.dump(categories_data, out_file, indent=2, ensure_ascii=False)

    # Headline numbers from the extractor's own summary
    stats = categories_data.get('summary', {})
    print(f"📊 Total items: {stats.get('total_items', 0)}")
    print(f"🏗️ Max depth: {stats.get('deepest_level', 0)}")

    # Preview of the first entries
    print(f"\n📂 FIRST 5 CATEGORIES:")
    for position, entry in enumerate(top_level[:5], start=1):
        print(f"  {position}. {entry['name']} ({entry['total_subcategories']} subs)")

    return categories_data

def test_categories():
    """Blocking helper: run ``extract_categories_only()`` to completion and return its result."""
    category_job = extract_categories_only()
    return asyncio.run(category_job)

# Demo/Test Functions for Colab
def show_usage():
    """Print the quick-reference text: available entry points and the output files."""
    # One entry per printed line; empty strings are blank separator lines.
    usage_lines = (
        "🚀 STARTECH SCRAPER - COLAB USAGE",
        "=" * 50,
        "",
        "📖 AVAILABLE FUNCTIONS:",
        "",
        "1️⃣ FULL SCRAPING (Categories + Products):",
        "   result = run_scraper()",
        "   • Extracts all categories & subcategories",
        "   • Scrapes products from top 10 categories",
        "   • Saves results to multiple files",
        "",
        "2️⃣ CATEGORIES ONLY (Quick Test):",
        "   categories = test_categories()",
        "   • Only extracts category hierarchy",
        "   • Faster for testing website structure",
        "",
        "3️⃣ SHOW THIS HELP:",
        "   show_usage()",
        "",
        "📁 OUTPUT FILES:",
        "   • startech_complete_categories.json",
        "   • startech_complete_products.csv",
        "   • startech_complete_products.json",
        "   • startech_complete_complete_report.json",
        "",
        "💡 TIP: Start with test_categories() to verify site access",
    )
    print("\n".join(usage_lines))

# Load-time banner: confirms the definitions above ran and lists the entry points.
print("\n".join((
    "🎯 StarTech Scraper Loaded Successfully!",
    "📚 Run show_usage() for instructions",
    "🚀 Run run_scraper() to start complete scraping",
    "🔍 Run test_categories() for quick category test",
)))

# Installation verification
def verify_installation(required_modules=None):
    """Check that the given modules can be imported and print a per-module status.

    Args:
        required_modules: Optional iterable of importable module names to
            check. When omitted, the scraper's own set is used: ``crawl4ai``,
            ``bs4``, ``pandas`` and ``nest_asyncio``.

    Returns:
        bool: True when every module imported, False if at least one raised
        ``ImportError`` (the missing names are printed).
    """
    if required_modules is None:
        # NOTE(review): aiohttp is installed by the setup code at the top of the
        # notebook but is not checked here - confirm whether it should be added.
        required_modules = ['crawl4ai', 'bs4', 'pandas', 'nest_asyncio']

    missing = []

    for module in required_modules:
        try:
            importlib.import_module(module)
        except ImportError:
            print(f"❌ {module}")
            missing.append(module)
        else:
            print(f"✅ {module}")

    if missing:
        print(f"\n⚠️ Missing modules: {', '.join(missing)}")
        print("Run the installation commands at the top of the script")
        return False

    print(f"\n🎉 All modules installed correctly!")
    return True

# Runs as soon as this cell executes: first report which required modules import,
# then immediately run the category-only extraction through test_categories(),
# which writes startech_categories_only.json when extraction succeeds.
verify_installation()
test_categories()

"""
🚀 GOOGLE COLAB QUICK START GUIDE
===============================================

1. INSTALLATION (Run once):
   The script auto-installs required packages

2. BASIC USAGE:
   # Test category extraction first
   categories = test_categories()

   # Run complete scraping
   result = run_scraper()

3. CUSTOMIZATION:
   • Change max_pages in StarTechProductScraper.__init__()
   • Modify top categories count in _get_top_categories()
   • Adjust selectors in _extract_products_from_html()

4. TROUBLESHOOTING:
   • If crawling fails, check internet connection
   • StarTech may have anti-bot measures
   • Try test_categories() first to verify access
   • Check the generated log messages for issues

5. OUTPUT FILES:
   All files are saved in the current directory
   Download them from Colab's file browser

🔧 OPTIMIZATIONS FOR COLAB:
✅ Fixed Crawl4AI 0.7.4+ configuration
✅ Removed deprecated LLM provider settings
✅ Added nest_asyncio for Jupyter compatibility
✅ Enhanced error handling and logging
✅ Automatic package installation
✅ Progress indicators and status messages
"""

🔧 Setting up environment for Google Colab...
Installing beautifulsoup4...
Installing nest-asyncio...
✅ Crawl4AI imports successful
🎯 StarTech Scraper Loaded Successfully!
📚 Run show_usage() for instructions
🚀 Run run_scraper() to start complete scraping
🔍 Run test_categories() for quick category test
✅ crawl4ai
✅ bs4
✅ pandas
✅ nest_asyncio

🎉 All modules installed correctly!
🔍 EXTRACTING CATEGORIES ONLY (QUICK TEST)


✅ Successfully extracted 143 categories
📊 Total items: 2206
🏗️ Max depth: 3

📂 FIRST 5 CATEGORIES:
  1. Desktop (29 subs)
  2. Star PC (2 subs)
  3. Gaming PC (2 subs)
  4. Brand PC (6 subs)
  5. All-in-One PC (7 subs)




In [None]:
# 1. Optional quick check (category extraction only). Left disabled here because
#    the definitions cell above already calls test_categories() when it runs.
# categories = test_categories()

# 2. Run complete scraping (categories + products); `result` is the final report,
#    or None if scraping failed.
result = run_scraper()

# 3. Show usage instructions
show_usage()

🎯 STARTECH COMPLETE SCRAPER - COLAB EDITION
🔧 Optimized for Crawl4AI 0.7.4+
🚀 STARTING STARTECH COMPLETE SCRAPER

📁 STEP 1: Extracting All Categories & Subcategories...
--------------------------------------------------



📊 STARTECH CATEGORY HIERARCHY EXTRACTED
📈 Total Categories/Subcategories: 2206
🏗️ Maximum Depth Level: 3
   📁 Level 1: 143 items
   📁 Level 2: 1243 items
   📁 Level 3: 820 items

📂 HIERARCHICAL STRUCTURE:
----------------------------------------

 1. 🏷️ Desktop (29 subcategories)
     🔗 https://www.startech.com.bd/desktops
     ├─ Desktop Offer
     ├─ Star PC
     │  └─ Intel PC
     │  └─ Ryzen PC
     ├─ Gaming PC
     │  └─ Intel PC
     │  └─ RYZEN PC
     └─ ... and 7 more subcategories

 2. 🏷️ Star PC (2 subcategories)
     🔗 https://www.startech.com.bd/desktops/star-pc
     ├─ Intel PC
     ├─ Ryzen PC

 3. 🏷️ Gaming PC (2 subcategories)
     🔗 https://www.startech.com.bd/desktops/gaming-pc
     ├─ Intel PC
     ├─ RYZEN PC

 4. 🏷️ Brand PC (6 subcategories)
     🔗 https://www.startech.com.bd/desktops/brand-pc
     ├─ Acer
     ├─ ASUS
     ├─ Dell
     └─ ... and 3 more subcategories

 5. 🏷️ All-in-One PC (7 subcategories)
     🔗 https://www.startech.com.bd/desktops/all-in-on

✅ Main category: 20 products
  📂 Processing subcategory 1/5: Desktop Offer


     ✅ 20 products
  📂 Processing subcategory 2/5: Star PC


     ✅ 10 products
  📂 Processing subcategory 3/5: Gaming PC


     ✅ 20 products
  📂 Processing subcategory 4/5: Brand PC


     ✅ 20 products
  📂 Processing subcategory 5/5: All-in-One PC


     ✅ 20 products
📊 Total products so far: 110

--- Processing Category 2/10: Star PC ---


✅ Main category: 10 products
  📂 Processing subcategory 1/5: Intel PC


     ✅ 5 products
  📂 Processing subcategory 2/5: Ryzen PC


     ✅ 12 products
📊 Total products so far: 137

--- Processing Category 3/10: Gaming PC ---


✅ Main category: 20 products
  📂 Processing subcategory 1/5: Intel PC


     ✅ 8 products
  📂 Processing subcategory 2/5: RYZEN PC


     ✅ 20 products
📊 Total products so far: 185

--- Processing Category 4/10: Brand PC ---


✅ Main category: 20 products
  📂 Processing subcategory 1/5: Acer


     ✅ 3 products
  📂 Processing subcategory 2/5: ASUS


     ✅ 3 products
  📂 Processing subcategory 3/5: Dell


     ✅ 20 products
  📂 Processing subcategory 4/5: HP


     ✅ 10 products
  📂 Processing subcategory 5/5: Lenovo


     ✅ 13 products
📊 Total products so far: 254

--- Processing Category 5/10: All-in-One PC ---


✅ Main category: 20 products
  📂 Processing subcategory 1/5: Dell


     ✅ 4 products
  📂 Processing subcategory 2/5: HP


     ✅ 5 products
  📂 Processing subcategory 3/5: ASUS


     ✅ 4 products
  📂 Processing subcategory 4/5: LENOVO


     ✅ 1 products
  📂 Processing subcategory 5/5: Teclast


     ✅ 2 products
📊 Total products so far: 290

--- Processing Category 6/10: Portable Mini PC ---


✅ Main category: 16 products
  📂 Processing subcategory 1/5: Asus


     ✅ 14 products
  📂 Processing subcategory 2/5: Zotac


     ✅ 2 products
📊 Total products so far: 322

--- Processing Category 7/10: Laptop ---


✅ Main category: 20 products
  📂 Processing subcategory 1/5: All Laptop


     ✅ 20 products
  📂 Processing subcategory 2/5: Gaming Laptop


     ✅ 20 products
  📂 Processing subcategory 3/5: Premium Ultrabook


     ✅ 20 products
  📂 Processing subcategory 4/5: Laptop Bag


     ✅ 20 products
  📂 Processing subcategory 5/5: Laptop Accessories


     ✅ 20 products
📊 Total products so far: 442

--- Processing Category 8/10: All Laptop ---


✅ Main category: 20 products
  📂 Processing subcategory 1/5: MSI


     ✅ 20 products
  📂 Processing subcategory 2/5: Lenovo


     ✅ 20 products
  📂 Processing subcategory 3/5: Asus


     ✅ 20 products
  📂 Processing subcategory 4/5: HP


     ✅ 20 products
  📂 Processing subcategory 5/5: Acer


     ✅ 20 products
📊 Total products so far: 562

--- Processing Category 9/10: Gaming Laptop ---


✅ Main category: 20 products
  📂 Processing subcategory 1/5: HP


     ✅ 6 products
  📂 Processing subcategory 2/5: Lenovo


     ✅ 17 products
  📂 Processing subcategory 3/5: Asus


     ✅ 20 products
  📂 Processing subcategory 4/5: MSI


     ✅ 20 products
  📂 Processing subcategory 5/5: Acer


     ✅ 6 products
📊 Total products so far: 651

--- Processing Category 10/10: Premium Ultrabook ---


✅ Main category: 20 products
  📂 Processing subcategory 1/5: Asus


     ✅ 6 products
  📂 Processing subcategory 2/5: Acer


     ✅ 3 products
  📂 Processing subcategory 3/5: HP


     ✅ 3 products
  📂 Processing subcategory 4/5: Microsoft


     ✅ 12 products
  📂 Processing subcategory 5/5: Dell


     ✅ 2 products
📊 Total products so far: 697

💾 SAVING RESULTS...
------------------------------
✅ Saved categories to: startech_complete_categories.json
✅ Saved products CSV to: startech_complete_products.csv
✅ Saved products JSON to: startech_complete_products.json
✅ Saved complete report to: startech_complete_complete_report.json

🎉 SCRAPING COMPLETED SUCCESSFULLY!
⏱️ Total Time: 219.8 seconds
📊 Total Categories Found: 143
🛍️ Total Products Scraped: 697
⏰ Completed At: 2025-09-02 20:45:00

📁 CATEGORY BREAKDOWN:
   📂 Level 1: 143 categories
   📂 Level 2: 1243 categories
   📂 Level 3: 820 categories

📋 FILES GENERATED:
   📄 startech_complete_categories.json - Complete category hierarchy
   📊 startech_complete_products.csv - Products in CSV format
   📋 startech_complete_products.json - Products in JSON format
   📈 startech_complete_complete_report.json - Complete scraping report

🏆 TOP CATEGORIES BY PRODUCT COUNT:
    1. Desktop: 20 products
    2. Desktop/Desktop Offer: 20 products
