In [1]:
import requests
import json
import os
from tqdm import tqdm  # 导入 tqdm 库
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Path of the JSON file that holds the product info read by the cells below.
jsonDir = "/home/ubuntu/crawler/products_info.json"

In [3]:
# Read the whole JSON document as UTF-8 text, then parse it into Python objects.
with open(jsonDir, mode="r", encoding="utf-8") as json_file:
    raw_text = json_file.read()
ReadJson = json.loads(raw_text)

In [4]:
def download_image(product_id, img_url, index,
                   base_dir="/home/ubuntu/crawler/info", timeout=30):
    """Download one image and store it under the product's image directory.

    The file is written to ``{base_dir}/{product_id}/image/{product_id}_{index + 1}.jpg``;
    the directory is created if it does not exist yet.

    Args:
        product_id: Identifier used for both the directory and the file name.
        img_url: URL of the image to fetch.
        index: Zero-based position of the image; the file name uses ``index + 1``.
        base_dir: Root directory under which per-product folders are created.
        timeout: Seconds passed to ``requests.get``. Without a timeout a
            stalled server would block the worker thread indefinitely.

    Returns:
        A human-readable status string: a success message containing the file
        name, or a failure message containing the URL and either the HTTP
        status code or the network error.
    """
    try:
        response = requests.get(img_url, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as bad status codes (as a
        # returned message) so one broken URL cannot abort a whole batch.
        return f"无法下载图片: {img_url}，错误: {exc}"

    if response.status_code != 200:
        return f"无法下载图片: {img_url}，状态码: {response.status_code}"

    file_name = f"{product_id}_{index + 1}.jpg"

    # Per-product image directory; create it on first use.
    image_dir = f"{base_dir}/{product_id}/image"
    os.makedirs(image_dir, exist_ok=True)

    file_path = os.path.join(image_dir, file_name)
    with open(file_path, "wb") as img_file:
        img_file.write(response.content)
    return f"下载成功: {file_name}"

In [5]:
# Flatten the product mapping into one (product_id, image URL, position)
# tuple per image so each tuple can be submitted as a separate download.
download_tasks = []
for product_id, product_info in ReadJson.items():
    image_urls = product_info.get("additionalImageUrls", [])
    download_tasks.extend(
        (product_id, url, position) for position, url in enumerate(image_urls)
    )

In [6]:
# Download every image on a thread pool while showing a progress bar.
# Unsuccessful downloads are kept in `failed_downloads` as (task, message)
# pairs so they can be inspected or retried instead of being silently dropped.
failed_downloads = []
with ThreadPoolExecutor() as executor:
    with tqdm(total=len(download_tasks), desc="下载进度") as pbar:
        future_to_image = {executor.submit(download_image, *task): task for task in download_tasks}

        for future in as_completed(future_to_image):
            task = future_to_image[future]
            try:
                result = future.result()
            except Exception as exc:
                # A worker that raised must not abort the loop: record the
                # failure and keep consuming the remaining futures.
                result = f"无法下载图片: {task[1]}，错误: {exc}"
            # download_image reports success with a message starting "下载成功".
            if not str(result).startswith("下载成功"):
                failed_downloads.append((task, result))
            pbar.update(1)  # advance the bar once per finished task

# Only emit a summary when something went wrong, so a clean run prints
# nothing beyond the progress bar.
if failed_downloads:
    print(f"下载失败: {len(failed_downloads)} / {len(download_tasks)}")

下载进度: 100%|█████████████████████████████| 4190/4190 [03:10<00:00, 21.94it/s]
