In [None]:
# Part 1: Install Chrome and the required Python libraries
# Refresh the apt package index before installing anything.
!apt-get update
# Fetch the current stable Chrome .deb from Google (-q silences wget's progress output).
!wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
# Install the package; dpkg does not resolve dependencies itself, so this step may
# report missing ones.
!dpkg -i google-chrome-stable_current_amd64.deb
# Let apt repair the install by pulling in whatever dependencies dpkg left
# unsatisfied (-f), without prompting (-y).
!apt-get install -f -y
# selenium drives the browser; webdriver-manager is used below to obtain ChromeDriver.
!pip install selenium webdriver-manager

# Part 2: Import libraries and set up Selenium
import os
import time
import requests
from urllib.parse import urljoin
import shutil
import re # We need this for regular expressions
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from google.colab import files

# Build the Chrome options: headless mode, container-friendly flags, and a
# desktop browser User-Agent string.
chrome_options = Options()
for _chrome_flag in (
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
):
    chrome_options.add_argument(_chrome_flag)
chrome_options.binary_location = '/usr/bin/google-chrome'

# webdriver-manager supplies the ChromeDriver path; the resulting service is
# used to start the single browser instance shared by the rest of this cell.
service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(options=chrome_options, service=service)

# Part 3: The core scraping function - A more robust version
# Matches every url(...) token in an inline style; surrounding quotes are optional.
_CSS_URL_PATTERN = re.compile(r'url\("?\'?(.+?)"?\'?\)')
# Extensions kept as-is; anything else is saved with a '.jpg' suffix.
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.webp')


def _srcset_urls(srcset):
    """Return the URL of each candidate listed in a ``srcset`` attribute value."""
    urls = []
    for candidate in srcset.split(','):
        tokens = candidate.split()
        # A trailing or doubled comma leaves an empty candidate; skip it
        # instead of indexing into an empty token list.
        if tokens:
            urls.append(tokens[0])
    return urls


def _candidate_urls(element):
    """Yield every raw image URL one DOM element exposes (src, srcset, data-src, style)."""
    src = element.get_attribute("src")
    if src:
        yield src

    srcset = element.get_attribute("srcset")
    if srcset:
        yield from _srcset_urls(srcset)

    data_src = element.get_attribute("data-src")
    if data_src:
        yield data_src

    style = element.get_attribute("style")
    if style and 'background-image' in style:
        # A background may list several layers, each with its own url(...).
        for match in _CSS_URL_PATTERN.finditer(style):
            yield match.group(1)


def _image_extension(url):
    """Return the file extension to save *url* under, defaulting to '.jpg'."""
    from urllib.parse import urlparse  # local import: this cell only imports urljoin

    # Look at the path component only, so dots inside the query string or
    # fragment cannot be mistaken for the extension.
    ext = os.path.splitext(urlparse(url).path)[1]
    return ext if ext.lower() in _IMAGE_EXTENSIONS else '.jpg'


def download_images_from_any_website(page_url, folder, update_progress, total_count_callback):
    """Download every image referenced by *page_url* into *folder*.

    The page is loaded in the module-level ``driver``, scrolled until its
    height stops growing, and scanned for ``<img>``, ``<picture>`` and
    ``<div style="...background-image...">`` elements. Each distinct URL found
    is fetched with ``requests`` and written as ``image_<n><ext>``.

    Args:
        page_url: Address of the page to scan; also the base for relative URLs.
        folder: Existing directory that receives the downloaded files.
        update_progress: Called with the 1-based index of each URL after the
            attempt to download it, whether or not it succeeded.
        total_count_callback: Called once with the number of distinct URLs
            found, before any download starts.

    Returns:
        The number of images written to *folder*.
    """
    driver.get(page_url)
    time.sleep(5)  # Give the page more time to load all content

    # Scroll to the bottom repeatedly so lazily loaded content gets requested.
    # NOTE: the loop ends only when the page height stops changing, so a page
    # that keeps growing keeps this loop running.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    elements = driver.find_elements(
        By.XPATH, "//img | //picture | //div[contains(@style, 'background-image')]"
    )
    found = set()
    for element in elements:
        for raw_url in _candidate_urls(element):
            raw_url = raw_url.strip()
            # Inline data: URIs are not fetchable with requests; skip them.
            if raw_url and not raw_url.startswith('data:'):
                found.add(raw_url)

    # Sorting makes the image_<n> numbering reproducible between runs.
    img_urls = sorted(found)
    total = len(img_urls)
    total_count_callback(total)

    count = 0
    for i, url in enumerate(img_urls, start=1):
        try:
            abs_url = urljoin(page_url, url)
            response = requests.get(abs_url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
            if response.status_code == 200:
                filename = os.path.join(folder, f"image_{i}{_image_extension(abs_url)}")
                with open(filename, 'wb') as f:
                    f.write(response.content)
                count += 1
        except Exception as e:
            # Deliberate per-image boundary: one bad URL must not stop the rest.
            print(f"Failed to download {url}: {e}")
        update_progress(i)
    return count

# Part 4: The part to call the function and handle results (without a GUI)
# Ask for the page to scrape; surrounding whitespace from a paste is dropped.
url = input("Enter The Page URL: ").strip()
folder = "Web_Extracted_images"
os.makedirs(folder, exist_ok=True)

try:
    downloaded = download_images_from_any_website(url, folder, lambda x: None, lambda x: None)
    print(f"Downloaded {downloaded} images to the '{folder}' folder.")
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # Always quit the driver to free up resources
    driver.quit()

# Part 5: Zip and download the folder.
# Kept in its own try/finally so the image folder is removed even when
# zipping or the browser download fails.
try:
    print("Zipping images...")
    shutil.make_archive(folder, 'zip', folder)
    print("Download will start shortly.")
    files.download(f"{folder}.zip")
finally:
    # Part 6: Delete the extracted images folder
    print("Cleaning up local files...")
    if os.path.exists(folder):
        shutil.rmtree(folder)
        print(f"The '{folder}' folder has been deleted.")
    else:
        print(f"The '{folder}' folder does not exist.")

0% [Working]            Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (108.138.128.112)] [                                                                               Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
                                                                               Hit:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (108.138.128.112)] [                                                                               Hit:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
0% [Connected to cloud.r-project.org (108.138.128.112)] [Connected to r2u.stat.                                                                               Hit:5 https://cli.github.com/packages stable InRelease
0% [Connected to cloud.r-project.org (108.138.128.112)] [Waiting for headers] [                         

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Cleaning up local files...
The 'Web_Extracted_images' folder has been deleted.


**Excel to Zipped Version**

In [None]:
# ==============================================================
# Part 1: Install Chrome and required libraries
# ==============================================================
# Refresh the apt package index (-qq keeps the output to errors only).
!apt-get update -qq
# Fetch the current stable Chrome .deb from Google (-q silences wget's progress output).
!wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
# Install the package; dpkg does not resolve dependencies itself, so this step may
# report missing ones.
!dpkg -i google-chrome-stable_current_amd64.deb
# Let apt repair the install by pulling in whatever dependencies dpkg left
# unsatisfied (-f), without prompting (-y) and quietly (-qq).
!apt-get install -f -y -qq
# selenium + webdriver-manager drive Chrome; pandas + openpyxl are installed for
# reading the uploaded Excel file.
!pip install selenium webdriver-manager pandas openpyxl

# ==============================================================
# Part 2: Imports & Setup
# ==============================================================
import os
import time
import requests
import pandas as pd
from urllib.parse import urljoin, urlparse
import shutil
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from google.colab import files

# Chrome options: headless mode, container-friendly flags, and a desktop
# browser User-Agent string.
chrome_options = Options()
for _chrome_flag in (
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36",
):
    chrome_options.add_argument(_chrome_flag)
chrome_options.binary_location = '/usr/bin/google-chrome'

# ChromeDriver service built from the path webdriver-manager returns; browser
# instances are created from it later.
service = ChromeService(ChromeDriverManager().install())

# ==============================================================
# Part 3: Scraping function
# ==============================================================
# Matches every url(...) token in an inline style; surrounding quotes are optional.
_CSS_URL_PATTERN = re.compile(r'url\("?\'?(.+?)"?\'?\)')
# Extensions kept as-is; anything else is saved with a '.jpg' suffix.
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.webp')


def _srcset_urls(srcset):
    """Return the URL of each candidate listed in a ``srcset`` attribute value."""
    urls = []
    for candidate in srcset.split(','):
        tokens = candidate.split()
        # A trailing or doubled comma leaves an empty candidate; skip it
        # instead of indexing into an empty token list.
        if tokens:
            urls.append(tokens[0])
    return urls


def _candidate_urls(element):
    """Yield every raw image URL one DOM element exposes (src, srcset, data-src, style)."""
    src = element.get_attribute("src")
    if src:
        yield src

    srcset = element.get_attribute("srcset")
    if srcset:
        yield from _srcset_urls(srcset)

    data_src = element.get_attribute("data-src")
    if data_src:
        yield data_src

    style = element.get_attribute("style")
    if style and 'background-image' in style:
        # A background may list several layers, each with its own url(...).
        for match in _CSS_URL_PATTERN.finditer(style):
            yield match.group(1)


def _image_extension(url):
    """Return the file extension to save *url* under, defaulting to '.jpg'."""
    # Look at the path component only, so dots inside the query string or
    # fragment cannot be mistaken for the extension.
    ext = os.path.splitext(urlparse(url).path)[1]
    return ext if ext.lower() in _IMAGE_EXTENSIONS else '.jpg'


def download_images_from_any_website(page_url, folder, update_progress, total_count_callback):
    """Download every image referenced by *page_url* into *folder*.

    A fresh Chrome instance is started for the call, the page is scrolled
    until its height stops growing, and ``<img>``, ``<picture>`` and
    ``<div style="...background-image...">`` elements are scanned for URLs.
    The browser is closed as soon as the URLs are collected (also when
    loading or scanning fails); the files themselves are fetched with
    ``requests`` and written as ``image_<n><ext>``.

    Args:
        page_url: Address of the page to scan; also the base for relative URLs.
        folder: Existing directory that receives the downloaded files.
        update_progress: Called with the 1-based index of each URL after the
            attempt to download it, whether or not it succeeded.
        total_count_callback: Called once with the number of distinct URLs
            found, before any download starts.

    Returns:
        The number of images written to *folder*.
    """
    driver = webdriver.Chrome(service=service, options=chrome_options)
    try:
        driver.get(page_url)
        time.sleep(5)  # allow page to load

        # Scroll to the bottom repeatedly so lazily loaded content gets requested.
        # NOTE: the loop ends only when the page height stops changing, so a
        # page that keeps growing keeps this loop running.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        elements = driver.find_elements(
            By.XPATH, "//img | //picture | //div[contains(@style, 'background-image')]"
        )
        found = set()
        for element in elements:
            for raw_url in _candidate_urls(element):
                raw_url = raw_url.strip()
                # Inline data: URIs are not fetchable with requests; skip them.
                if raw_url and not raw_url.startswith('data:'):
                    found.add(raw_url)
    finally:
        # Element handles are not needed past this point; quitting here also
        # guarantees the browser is released when anything above raises.
        driver.quit()

    # Sorting makes the image_<n> numbering reproducible between runs.
    img_urls = sorted(found)
    total = len(img_urls)
    total_count_callback(total)

    count = 0
    for i, url in enumerate(img_urls, start=1):
        try:
            abs_url = urljoin(page_url, url)
            response = requests.get(abs_url, timeout=10,
                                    headers={'User-Agent': 'Mozilla/5.0'})
            if response.status_code == 200:
                filename = os.path.join(folder, f"image_{i}{_image_extension(abs_url)}")
                with open(filename, 'wb') as f:
                    f.write(response.content)
                count += 1
        except Exception as e:
            # Deliberate per-image boundary: one bad URL must not stop the rest.
            print(f"❌ Failed to download {url}: {e}")
        update_progress(i)

    return count

# ==============================================================
# Part 4: Main Execution
# ==============================================================
# Ask the user for a spreadsheet whose first column holds the page URLs.
print("📤 Please upload your Excel file (URLs in first column)...")
uploaded = files.upload()

if not uploaded:
    print("❌ No file uploaded. Exiting.")
else:
    # Only the first uploaded file is processed.
    excel_file = next(iter(uploaded))
    print(f"✅ File '{excel_file}' uploaded successfully.")

    try:
        # header=None: the first row is treated as data, not as column names.
        df = pd.read_excel(excel_file, header=None)

        # Group URLs by domain
        domain_urls = {}
        for index, row in df.iterrows():
            url = str(row.iloc[0]).strip()
            if not url or not url.startswith(('http://', 'https://')):
                print(f"⚠️ Skipping invalid URL in row {index+1}: {url}")
                continue
            domain = urlparse(url).netloc
            domain = re.sub(r'[^a-zA-Z0-9]', '_', domain)  # safe folder name
            domain_urls.setdefault(domain, []).append(url)

        if not domain_urls:
            print("⚠️ No valid URLs found in the uploaded file.")

        # Process each domain separately
        for domain, urls in domain_urls.items():
            print(f"\n🌐 Processing domain: {domain}")
            domain_folder = f"Images_{domain}"
            zip_name = f"{domain}.zip"
            os.makedirs(domain_folder, exist_ok=True)

            try:
                # Download images from each URL of this domain
                for i, url in enumerate(urls, start=1):
                    print(f"  🔗 ({i}/{len(urls)}) {url}")
                    subfolder = os.path.join(domain_folder, f"url_{i}")
                    os.makedirs(subfolder, exist_ok=True)
                    try:
                        downloaded = download_images_from_any_website(
                            url, subfolder, lambda x: None, lambda x: None
                        )
                    except Exception as e:
                        # A page that fails to load must not abort the
                        # remaining URLs and domains.
                        print(f"   ❌ Failed to process {url}: {e}")
                        continue
                    print(f"   ✅ {downloaded} images saved to {subfolder}")

                # Zip this domain's images
                shutil.make_archive(domain, 'zip', domain_folder)
                print(f"📦 Created {zip_name}")

                # Download immediately
                print(f"📥 Downloading {zip_name}...")
                files.download(zip_name)
            finally:
                # Cleanup runs even when zipping or downloading fails, so no
                # per-domain folder or archive is left behind.
                shutil.rmtree(domain_folder, ignore_errors=True)
                if os.path.exists(zip_name):
                    os.remove(zip_name)

        print("\n🎉 All domains processed and downloaded separately!")

    except Exception as e:
        print(f"❌ Error: {e}")
