<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Product_Marketing_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install crawl4ai aiohttp requests



In [4]:
# Cell 2: Import Libraries and Configure Settings
import os
import csv
import requests
import asyncio
import aiohttp
from datetime import datetime
from crawl4ai import AsyncWebCrawler
from PIL import Image
import io

# Directories in Colab
BASE_DIR = "/content/web_scraping_test"
UPLOADS_DIR = os.path.join(BASE_DIR, "uploads")
OUTPUTS_DIR = os.path.join(BASE_DIR, "outputs")
BASE_CSV_FILE = os.path.join(OUTPUTS_DIR, "base_images.csv")
SECONDARY_CSV_FILE = os.path.join(OUTPUTS_DIR, "secondary_images.csv")

# Create directories
os.makedirs(UPLOADS_DIR, exist_ok=True)
os.makedirs(OUTPUTS_DIR, exist_ok=True)

# Serper API key (replace with your key from serper.dev)
SERPER_API_KEY = "95c2797a69b167639c98ab054e8597d752c6fe6d"
SERPER_ENDPOINT = "https://api.serper.dev/images"

# Headers for requests
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0", "Content-Type": "application/json"}

In [5]:
# Cell 3: Define Helper Functions
# Async function to download and validate an image
async def download_image(session, url, save_path):
    try:
        async with session.get(url, headers=HEADERS, timeout=10) as response:
            if response.status == 200:
                content = await response.read()
                # Validate image content with Pillow
                try:
                    Image.open(io.BytesIO(content)).verify()
                    with open(save_path, 'wb') as f:
                        f.write(content)
                    print(f"Downloaded and validated {save_path}")
                    return True
                except Exception as e:
                    print(f"Invalid image content at {url}: {e}")
                    return False
            print(f"Failed to download {url}: Status {response.status}")
            return False
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        return False

# Function to search images with Serper API
def search_images(query, site, image_type, domain):
    try:
        payload = {"q": query, "gl": "us", "hl": "en", "type": "images", "num": 5}
        response = requests.post(SERPER_ENDPOINT, headers=HEADERS, json=payload)
        response.raise_for_status()
        images = response.json().get("images", [])
        print(f"Found {len(images)} images for query: {query}")
        return [
            {"url": item["imageUrl"], "site": site, "type": image_type, "domain": domain}
            for item in images if "imageUrl" in item and item["imageUrl"].endswith(('.jpg', '.jpeg', '.png'))
        ]
    except Exception as e:
        print(f"Serper error for {query}: {e}")
        return []

# Async function to crawl a website with Crawl4AI
async def crawl_website(url, site_name, image_type, domain):
    results = []
    try:
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=url, css_selector="img[src*='photo']")
            if result.success:
                print(f"Successfully crawled {url}")
                async with aiohttp.ClientSession() as session:
                    for idx, img_url in enumerate(result.extracted_content[:3]):
                        if not img_url.startswith('http'):
                            img_url = f"https://{site_name}.com{img_url}"
                        if not img_url.endswith(('.jpg', '.jpeg', '.png')):
                            continue
                        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                        filename = f"{site_name}_{image_type}_{timestamp}_{idx}.jpg"
                        save_path = os.path.join(UPLOADS_DIR, filename)
                        if await download_image(session, img_url, save_path):
                            results.append({
                                'site': site_name,
                                'type': image_type,
                                'url': img_url,
                                'local_path': save_path,
                                'domain': domain
                            })
            else:
                print(f"Failed to crawl {url}")
    except Exception as e:
        print(f"Error crawling {url}: {e}")
    return results

# Function to save results to CSV
def save_to_csv(data, csv_file):
    with open(csv_file, mode='w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['site', 'type', 'url', 'local_path', 'domain'])
        writer.writeheader()
        for row in data:
            writer.writerow(row)
    print(f"Saved to {csv_file}")

# Cell 4: Main Script
async def main():
    # Search queries
    queries = [
        {"type": "base", "query": "urban background site:unsplash.com", "site": "unsplash", "domain": "urban"},
        {"type": "base", "query": "modern office site:pexels.com", "site": "pexels", "domain": "office"},
        {"type": "secondary", "query": "denim jacket isolated site:pexels.com", "site": "pexels", "domain": "clothing"},
        {"type": "secondary", "query": "smartphone isolated site:unsplash.com", "site": "unsplash", "domain": "electronics"}
    ]

    # Step 1: Search with Serper API
    base_results = []
    secondary_results = []
    serper_results = []
    for q in queries:
        images = search_images(q["query"], q["site"], q["type"], q["domain"])
        serper_results.extend(images)

    # Step 2: Download Serper images
    async with aiohttp.ClientSession() as session:
        for idx, item in enumerate(serper_results):
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{item['site']}_{item['type']}_{timestamp}_{idx}.jpg"
            save_path = os.path.join(UPLOADS_DIR, filename)
            if await download_image(session, item['url'], save_path):
                result = {
                    'site': item['site'],
                    'type': item['type'],
                    'url': item['url'],
                    'local_path': save_path,
                    'domain': item['domain']
                }
                if item['type'] == "base" and len(base_results) < 10:
                    base_results.append(result)
                elif item['type'] == "secondary" and len(secondary_results) < 10:
                    secondary_results.append(result)

    # Step 3: Crawl with Crawl4AI if needed
    if len(base_results) < 10 or len(secondary_results) < 10:
        crawl_tasks = [
            crawl_website("https://unsplash.com/s/photos/background", "unsplash", "base", "urban"),
            crawl_website("https://pexels.com/search/background/", "pexels", "base", "office"),
            crawl_website("https://pexels.com/search/product/", "pexels", "secondary", "clothing"),
            crawl_website("https://unsplash.com/s/photos/product", "unsplash", "secondary", "electronics")
        ]
        for task_results in await asyncio.gather(*crawl_tasks):
            for result in task_results:
                if result['type'] == "base" and len(base_results) < 10:
                    base_results.append(result)
                elif result['type'] == "secondary" and len(secondary_results) < 10:
                    secondary_results.append(result)

    # Step 4: Save results to two CSV files
    if base_results:
        save_to_csv(base_results, BASE_CSV_FILE)
    if secondary_results:
        save_to_csv(secondary_results, SECONDARY_CSV_FILE)
    print(f"Collected {len(base_results)} base images and {len(secondary_results)} secondary images")

# Run the main function
await main()

Serper error for urban background site:unsplash.com: 404 Client Error: Not Found for url: https://api.serper.dev/images
Serper error for modern office site:pexels.com: 404 Client Error: Not Found for url: https://api.serper.dev/images
Serper error for denim jacket isolated site:pexels.com: 404 Client Error: Not Found for url: https://api.serper.dev/images
Serper error for smartphone isolated site:unsplash.com: 404 Client Error: Not Found for url: https://api.serper.dev/images
Error crawling https://unsplash.com/s/photos/background: BrowserType.launch: Executable doesn't exist at /root/.cache/ms-playwright/chromium-1169/chrome-linux/chrome
╔════════════════════════════════════════════════════════════╗
║ Looks like Playwright was just installed or updated.       ║
║ Please run the following command to download new browsers: ║
║                                                            ║
║     playwright install                                     ║
║                                     