In [3]:
import os
import csv
import shutil
import cloudscraper
import re

# ─── CONFIG ───────────────────────────────────────────────
# Path of the CSV that the processing loop below opens with csv.DictReader.
CSV_FILE = "final.csv"
# Directory under which every per-brand / per-product folder is created.
IMAGE_ROOT = "images"

# ─── Initialize Cloudflare Scraper ────────────────────────
# One module-level scraper object; download_image() issues its GET through it.
scraper = cloudscraper.create_scraper()

# ─── Create image root folder ─────────────────────────────
# exist_ok=True keeps this call from failing when the folder is already there,
# so the cell can be re-run.
os.makedirs(IMAGE_ROOT, exist_ok=True)

# ─── Function to sanitize folder names ─────────────────────
def sanitize_filename(name):
    """Return ``name`` trimmed, with path-unsafe characters replaced by ``_``.

    The characters ``< > : " / \\ | ? *`` are each mapped to an underscore;
    everything else (including inner spaces) is kept as-is.
    """
    forbidden = '<>:"/\\|?*'
    trimmed = name.strip()
    return "".join("_" if ch in forbidden else ch for ch in trimmed)

# ─── Function to download images bypassing Cloudflare ─────
def download_image(url, save_path):
    """Download ``url`` to ``save_path`` through the module-level ``scraper``.

    Nothing is downloaded when the URL does not end in .jpg/.jpeg/.png or when
    ``save_path`` already exists. The body is streamed into ``save_path + ".part"``
    and only renamed to ``save_path`` once the transfer finished, so an
    interrupted download can never leave a truncated file that a later run
    would mistake for a finished one.

    Args:
        url: Image URL to fetch.
        save_path: Destination file path.

    Returns:
        None. Progress and failures are reported with ``print``; exceptions are
        caught and reported rather than propagated.
    """
    tmp_path = None
    try:
        if not url.lower().endswith((".jpg", ".jpeg", ".png")):
            print(f"⏩ Skipped non-image URL: {url}")
            return

        if os.path.exists(save_path):
            print(f"⏩ Already exists: {save_path}")
            return

        response = scraper.get(url, stream=True, timeout=15)
        try:
            if response.status_code != 200:
                print(f"❌ Failed: {url} → {response.status_code}")
                return

            tmp_path = save_path + ".part"
            with open(tmp_path, 'wb') as f:
                # iter_content() applies any Content-Encoding (gzip/deflate),
                # which a plain copy of response.raw would not.
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        f.write(chunk)
            os.replace(tmp_path, save_path)
            tmp_path = None  # renamed successfully; nothing left to clean up
        finally:
            # Release the streamed connection on every path.
            response.close()

        print(f"✅ Saved: {save_path}")
    except Exception as e:
        print(f"❌ Error downloading {url}: {e}")
    finally:
        # Best-effort removal of a partial file left by a failed transfer.
        if tmp_path is not None and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass

# ─── Process each row in the CSV ──────────────────────────
with open(CSV_FILE, newline='', encoding='utf-8') as csvfile:
    for row in csv.DictReader(csvfile):
        # One folder per brand/product. Spaces become "_" in the brand but are
        # dropped from the product name.
        # NOTE(review): the two replacements differ — confirm that is intended.
        brand = sanitize_filename(row["Brand"]).replace(" ", "_")
        product = sanitize_filename(row["ProductName"]).replace(" ", "")
        folder_path = os.path.join(IMAGE_ROOT, brand, product)
        os.makedirs(folder_path, exist_ok=True)

        # Each (CSV column, file prefix) pair is handled the same way:
        # split the ';'-separated cell and save every non-blank URL as
        # <prefix>_<position>.jpg, main images first, then dimension images.
        for column, prefix in (("ImageURLs", "main"), ("DimensionImages", "dimension")):
            cell = row.get(column)
            urls = cell.split(";") if cell else []
            for position, raw_url in enumerate(urls, start=1):
                url = raw_url.strip()
                if not url:
                    continue
                save_path = os.path.join(folder_path, f"{prefix}_{position}.jpg")
                download_image(url, save_path)


⏩ Already exists: images\Ditre_Italia\BABEL\main_1.jpg
⏩ Already exists: images\Ditre_Italia\BABEL\main_2.jpg
⏩ Already exists: images\Ditre_Italia\BABEL\main_3.jpg
⏩ Already exists: images\Ditre_Italia\BABEL\main_4.jpg
⏩ Already exists: images\Ditre_Italia\BABEL\main_5.jpg
⏩ Already exists: images\Ditre_Italia\BABEL\main_6.jpg
⏩ Already exists: images\Ditre_Italia\BABEL\main_7.jpg
⏩ Already exists: images\Ditre_Italia\BABEL\main_8.jpg
⏩ Already exists: images\Ditre_Italia\ALTA\main_1.jpg
⏩ Already exists: images\Ditre_Italia\ALTA\main_2.jpg
⏩ Already exists: images\Ditre_Italia\ALTA\main_3.jpg
⏩ Already exists: images\Ditre_Italia\ALTA\main_4.jpg
⏩ Already exists: images\Ditre_Italia\ALTA\main_5.jpg
⏩ Already exists: images\Ditre_Italia\ALTA\main_6.jpg
⏩ Already exists: images\Ditre_Italia\ALTA\main_7.jpg
⏩ Already exists: images\Ditre_Italia\ALTA\main_1.jpg
⏩ Already exists: images\Ditre_Italia\ALTA\main_2.jpg
⏩ Already exists: images\Ditre_Italia\ALTA\main_3.jpg
⏩ Already exists: im

In [7]:
import os
import csv
import shutil
import cloudscraper
import re

# ─── CONFIG ───────────────────────────────────────────────
# Path of the CSV that the main loop opens with csv.DictReader.
CSV_FILE = "scraping_url_final.csv"
# Directory under which the Brand/Product/Category/Type folders are created.
IMAGE_ROOT = "images"
# Passed as the `timeout` argument of scraper.get() in download_image().
REQUEST_TIMEOUT = 15

# ─── Initialize Cloudflare Scraper ────────────────────────
# One module-level scraper object; download_image() issues its GET through it.
scraper = cloudscraper.create_scraper()

# ─── Sanitize folder/file names ───────────────────────────
def sanitize_filename(name):
    """Turn an arbitrary value into a folder/file-name component.

    ``None`` yields ``"Unknown"``. Any other value is converted with ``str``,
    trimmed, and has each of ``< > : " / \\ | ? *`` and every space replaced by
    ``_``. An empty result also yields ``"Unknown"``.
    """
    if name is None:
        return "Unknown"

    # Single-pass mapping: forbidden characters and spaces all become "_".
    to_underscore = str.maketrans({ch: "_" for ch in '<>:"/\\|?* '})
    cleaned = str(name).strip().translate(to_underscore)
    if not cleaned:
        return "Unknown"
    return cleaned

# ─── Helpers ──────────────────────────────────────────────
# URL endings (matched against the lower-cased URL) that download_image() will try to fetch.
ALLOWED_SUFFIXES = (".jpg", ".jpeg", ".png", ".webp")
# Substrings searched for in the lower-cased Content-Type response header;
# download_image() rejects a response whose header contains none of them.
ALLOWED_CT_SUBSTR = ("image/jpeg", "image/jpg", "image/png", "image/webp")

def split_urls(s):
    """Split a ';'- or '|'-separated string into unique, trimmed, non-empty URLs.

    First-seen order is preserved. A falsy input (``None``, ``""``) gives ``[]``.
    """
    if not s:
        return []
    trimmed = (piece.strip() for piece in re.split(r"[;|]", s))
    # dict keys keep insertion order, which gives an order-preserving dedupe.
    return list(dict.fromkeys(piece for piece in trimmed if piece))

# ─── Download image bypassing Cloudflare (SAVE AS .JPG) ───
def download_image(url, save_path):
    """Download ``url`` through the module-level ``scraper`` and store it as a .jpg file.

    The extension of ``save_path`` is always replaced by ``.jpg``. Nothing is
    downloaded when the URL is blank, does not end in one of ALLOWED_SUFFIXES,
    or the target file already exists. A response is rejected unless its status
    is 200 and its Content-Type contains one of ALLOWED_CT_SUBSTR.

    The body is streamed into ``<target>.part`` and renamed onto the target only
    after the transfer completed; a failed transfer removes the partial file.

    Args:
        url: Image URL to fetch (``None`` is treated as blank).
        save_path: Destination path; its extension is forced to ``.jpg``.

    Returns:
        None. Progress and failures are reported with ``print``; exceptions are
        caught and reported rather than propagated.
    """
    tmp_path = None
    try:
        url = (url or "").strip()
        if not url:
            return

        # only attempt if URL ends with an allowed image suffix
        if not url.lower().endswith(ALLOWED_SUFFIXES):
            print(f"⏩ Skipped non-image URL: {url}")
            return

        # Force a .jpg target regardless of the extension passed in.
        base, _ = os.path.splitext(save_path)
        save_path = base + ".jpg"

        if os.path.exists(save_path):
            print(f"⏩ Already exists: {save_path}")
            return

        response = scraper.get(url, stream=True, timeout=REQUEST_TIMEOUT)
        try:
            if response.status_code != 200:
                print(f"❌ Failed: {url} → {response.status_code}")
                return

            # Safety: ensure the server actually returned an image
            content_type = (response.headers.get("Content-Type") or "").lower()
            if not any(ct in content_type for ct in ALLOWED_CT_SUBSTR):
                print(f"❌ Skipped unsupported content: {url} ({content_type})")
                return

            # Atomic write: stream to a side file, then rename into place.
            tmp_path = save_path + ".part"
            with open(tmp_path, 'wb') as f:
                # iter_content() applies any Content-Encoding (gzip/deflate),
                # which a plain copy of response.raw would not.
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        f.write(chunk)
            os.replace(tmp_path, save_path)
            tmp_path = None  # renamed successfully; nothing left to clean up
        finally:
            # Release the streamed connection on every path, including early returns.
            response.close()

        print(f"✅ Saved: {save_path}")

    except Exception as e:
        print(f"❌ Error downloading {url}: {e}")
    finally:
        # Best-effort removal of a partial file left by a failed transfer.
        if tmp_path is not None and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass

# ─── Check if all expected images exist ───────────────────
def all_images_downloaded(folder, num_main, num_dim):
    """Report whether every expected image file is already present in ``folder``.

    The expected files are ``main_1.jpg`` … ``main_<num_main>.jpg`` and
    ``dimension_1.jpg`` … ``dimension_<num_dim>.jpg`` (names compared
    case-insensitively). Each expected name is checked individually rather than
    counting files by prefix, so a gap such as a missing ``main_3.jpg`` is not
    hidden by an unrelated leftover like ``main_4.jpg``.

    Args:
        folder: Directory to inspect. A missing folder is treated as empty.
        num_main: Number of expected ``main_*`` images.
        num_dim: Number of expected ``dimension_*`` images.

    Returns:
        True when all expected files exist (trivially True when both counts are
        zero), otherwise False.
    """
    try:
        present = {entry.lower() for entry in os.listdir(folder)}
    except (FileNotFoundError, NotADirectoryError):
        # No folder means nothing has been downloaded yet.
        present = set()

    def has_all(prefix, expected):
        return all(f"{prefix}_{i}.jpg" in present for i in range(1, expected + 1))

    return has_all("main", num_main) and has_all("dimension", num_dim)

# ─── Main Execution ───────────────────────────────────────
with open(CSV_FILE, newline='', encoding='utf-8') as csvfile:
    for row in csv.DictReader(csvfile):
        # Folder layout: Brand/Product/Category/Type (each part sanitized).
        brand, product, category, type_ = (
            sanitize_filename(row.get(column))
            for column in ("Brand", "ProductName", "Category", "Type")
        )
        folder_path = os.path.join(IMAGE_ROOT, brand, product, category, type_)
        os.makedirs(folder_path, exist_ok=True)

        # URL lists may be ';' or '|' separated; split_urls trims and dedupes.
        image_urls = split_urls(row.get("ImageURLs"))
        dim_urls = split_urls(row.get("DimensionImages"))

        # Skip the whole row when nothing is left to fetch.
        if all_images_downloaded(folder_path, len(image_urls), len(dim_urls)):
            print(f"🔁 Skipped (all images already downloaded): {brand}/{product}/{category}/{type_}")
            continue

        print(f"\n📂 Processing: {brand}/{product}/{category}/{type_}")

        # Main images first, then dimension images; files are named
        # <prefix>_<position>.jpg with positions starting at 1.
        for prefix, urls in (("main", image_urls), ("dimension", dim_urls)):
            for position, url in enumerate(urls, start=1):
                if not url:
                    continue
                save_path = os.path.join(folder_path, f"{prefix}_{position}.jpg")
                download_image(url, save_path)


🔁 Skipped (all images already downloaded): Ditre_Italia/BABEL/Coffee_tables/Round_wood_and_metal_coffee_table
🔁 Skipped (all images already downloaded): Ditre_Italia/ALTA/Armchairs/Fabric_armchair_with_armrests
🔁 Skipped (all images already downloaded): Ditre_Italia/ALTA/Sofas/Fabric_sofa
🔁 Skipped (all images already downloaded): Ditre_Italia/DOON/Tables/Dining_table
🔁 Skipped (all images already downloaded): Ditre_Italia/LUCY/Chairs/Fabric_chair
🔁 Skipped (all images already downloaded): Ditre_Italia/DAILY/Sofas/Fabric_sofa
🔁 Skipped (all images already downloaded): Ditre_Italia/DAILY/Sofas/Leather_sofa
🔁 Skipped (all images already downloaded): Ditre_Italia/LUNA/Armchairs/High-back_leather_armchair
🔁 Skipped (all images already downloaded): Ditre_Italia/LUNA/Armchairs/High-back_fabric_armchair
🔁 Skipped (all images already downloaded): Ditre_Italia/LUNA/Armchairs/Fabric_armchair_with_armrests
🔁 Skipped (all images already downloaded): Ditre_Italia/LUVON/Sofas/Modular_fabric_sofa
🔁 S