In [None]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

In [None]:
def get_search_page_urls(search_keyword, page=1):
    """Build the URL of one page of image search results.

    Args:
        search_keyword: Search term placed in the ``keys`` query parameter.
        page: Value placed in the ``page`` query parameter (default 1).

    Returns:
        The search URL as a string, rooted at the module-level ``base_url``.
    """
    # Local import keeps this function self-contained with respect to the
    # file-level import list.
    from urllib.parse import quote_plus

    # Percent-encode the keyword so spaces, '&', '#', etc. cannot break or
    # truncate the query string. Plain alphanumeric keywords are unchanged.
    encoded_keyword = quote_plus(str(search_keyword))
    return f"{base_url}/index.php?q=search&keys={encoded_keyword}&edit%5Btype%5D%5Bbgimage%5D=on&page={page}"

def get_detail_page_links(soup):
    """Collect absolute detail-page URLs from a parsed search-results page.

    Every ``<img class="bgimage-thumb">`` is inspected; when it sits inside an
    ``<a>`` element that carries an ``href``, that href is resolved against
    the module-level ``base_url`` and added to the result.

    Returns:
        A list of URLs in document order (possibly empty).
    """
    links = []
    thumbnails = soup.find_all("img", attrs={'class': "bgimage-thumb"})
    for thumb in thumbnails:
        anchor = thumb.find_parent("a")
        # Thumbnails that are not wrapped in a link are skipped.
        if not anchor:
            continue
        if not anchor.get("href"):
            continue
        links.append(urljoin(base_url, anchor["href"]))
    return links

def get_high_res_image_url(soup):
    """Return the absolute URL of the first full-size image on a detail page.

    An ``<img>`` qualifies when its ``src`` contains ``images/cache/`` or
    ``images/raw/``. The match is resolved against the module-level
    ``base_url``.

    Returns:
        The image URL, or ``None`` (after printing a notice) when no ``<img>``
        on the page qualifies.
    """
    path_markers = ("images/cache/", "images/raw/")
    for tag in soup.find_all("img"):
        source = tag.get("src", "")
        if any(marker in source for marker in path_markers):
            return urljoin(base_url, source)

    print("未找到高清图片链接")
    return None

def download_image(image_url, save_dir, idx):
    """Download one image into ``save_dir`` under a sequential file name.

    The file is named ``<search_keyword>_<idx as 3 digits>.jpg`` (for example
    ``Aphis_001.jpg``), using the module-level ``search_keyword``. Requests are
    sent with the module-level ``headers``.

    Args:
        image_url: Absolute URL of the image to fetch.
        save_dir: Existing directory the file is written into.
        idx: Integer sequence number used in the file name.

    Returns:
        ``True`` when the file was written completely, ``False`` when any
        error occurred (the error is printed, not raised).
    """
    tmp_path = None
    try:
        image_name = f"{search_keyword}_{idx:03d}.jpg"
        save_path = os.path.join(save_dir, image_name)
        # Stream into a temporary name first so an interrupted transfer never
        # leaves a truncated file under the final ".jpg" name.
        tmp_path = save_path + ".part"
        # The timeout prevents a stalled server from hanging the run forever;
        # the context manager releases the streamed connection on every path.
        with requests.get(image_url, headers=headers, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
        os.replace(tmp_path, save_path)
        print(f"已下载: {image_name}")
        return True

    except Exception as e:
        print(f"下载失败 {image_url}: {e}")
        # Best-effort cleanup of the partial file.
        if tmp_path is not None and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        return False

def main(max_images=1000):
    """Download the full-size image behind each result on the first search page.

    Uses the module-level ``search_keyword``, ``headers`` and ``download_dir``.
    Only the first page of search results is requested.

    Args:
        max_images: Stop once this many images have been downloaded
            (default 1000).
    """
    # Fetch the first page of search results.
    search_url = get_search_page_urls(search_keyword)
    print(f"请求搜索页面: {search_url}")
    resp = requests.get(search_url, headers=headers, timeout=30)
    if resp.status_code != 200:
        print(f"无法访问搜索页面: 状态码 {resp.status_code}")
        return
    soup = BeautifulSoup(resp.text, "lxml")

    # Extract the detail-page links.
    detail_links = get_detail_page_links(soup)
    print(f"找到 {len(detail_links)} 个详细页面链接")

    downloaded_count = 0  # number of images downloaded so far
    for detail_url in detail_links:
        if downloaded_count >= max_images:
            print(f"已下载{max_images}张图片，程序终止。")
            break

        try:
            detail_resp = requests.get(detail_url, headers=headers, timeout=30)
            if detail_resp.status_code != 200:
                print(f"无法访问详细页面: 状态码 {detail_resp.status_code}")
            else:
                detail_soup = BeautifulSoup(detail_resp.text, "lxml")
                high_res_url = get_high_res_image_url(detail_soup)
                if high_res_url:
                    # File names follow the running download count.
                    status = download_image(high_res_url, download_dir, downloaded_count + 1)
                    # Only an explicit False is treated as a failed download;
                    # a helper that returns nothing (None) is counted as
                    # success.
                    if status is not False:
                        downloaded_count += 1
        except Exception as e:
            print(f"处理详细页面失败: {e}")

        # Pause after every detail-page request, including failed ones, to
        # avoid getting blocked.
        time.sleep(1)

In [None]:
# Search term and site root read by the scraping functions.
search_keyword = "Aphis"
base_url = "https://bugguide.net"

# Request headers carrying a browser-like User-Agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
    ),
}

# Images are stored in "<keyword>_images"; create the directory up front.
download_dir = search_keyword + "_images"
os.makedirs(download_dir, exist_ok=True)

def get_search_page_urls(search_keyword, page=1):
    """Build the URL of one page of image search results.

    Args:
        search_keyword: Search term placed in the ``keys`` query parameter.
        page: Value placed in the ``page`` query parameter (default 1).

    Returns:
        The search URL as a string, rooted at the module-level ``base_url``.
    """
    # Local import keeps this function self-contained with respect to the
    # file-level import list.
    from urllib.parse import quote_plus

    # Percent-encode the keyword so spaces, '&', '#', etc. cannot break or
    # truncate the query string. Plain alphanumeric keywords are unchanged.
    encoded_keyword = quote_plus(str(search_keyword))
    return f"{base_url}/index.php?q=search&keys={encoded_keyword}&edit%5Btype%5D%5Bbgimage%5D=on&page={page}"

def get_detail_page_links(soup):
    """Extract the detail-page URLs linked from search-result thumbnails.

    For each ``<img class="bgimage-thumb">`` the nearest enclosing ``<a>`` is
    looked up; anchors with an ``href`` are resolved against the module-level
    ``base_url``.

    Returns:
        A list of absolute URLs in document order (possibly empty).
    """
    enclosing_anchors = (
        img.find_parent("a")
        for img in soup.find_all("img", attrs={'class': "bgimage-thumb"})
    )
    # Thumbnails without an enclosing link, or whose link has no href, are
    # dropped.
    return [
        urljoin(base_url, anchor["href"])
        for anchor in enclosing_anchors
        if anchor and anchor.get("href")
    ]

def get_high_res_image_url(soup):
    """Return the absolute URL of the first full-size image on a detail page.

    An ``<img>`` qualifies when its ``src`` contains ``images/cache/`` or
    ``images/raw/``. The match is resolved against the module-level
    ``base_url``.

    Returns:
        The image URL, or ``None`` (after printing a notice) when no ``<img>``
        on the page qualifies.
    """
    # A leftover debugging pass that iterated the images without using the
    # result has been removed; a single scan is sufficient.
    for img in soup.find_all("img"):
        src = img.get("src", "")
        if "images/cache/" in src or "images/raw/" in src:
            return urljoin(base_url, src)

    print("未找到高清图片链接")
    return None

def download_image(image_url, save_dir, idx):
    """Download one image into ``save_dir`` under a sequential file name.

    The file is named ``aphis_<idx as 3 digits>.jpg`` (for example
    ``aphis_001.jpg``). Requests are sent with the module-level ``headers``.

    Args:
        image_url: Absolute URL of the image to fetch.
        save_dir: Existing directory the file is written into.
        idx: Integer sequence number used in the file name.

    Returns:
        ``True`` when the file was written completely, ``False`` when any
        error occurred (the error is printed, not raised).
    """
    tmp_path = None
    try:
        # NOTE(review): the "aphis_" prefix is hard-coded and does not follow
        # the configured search keyword; kept so existing file names do not
        # change.
        image_name = f"aphis_{idx:03d}.jpg"
        save_path = os.path.join(save_dir, image_name)
        # Stream into a temporary name first so an interrupted transfer never
        # leaves a truncated file under the final ".jpg" name.
        tmp_path = save_path + ".part"
        # The timeout prevents a stalled server from hanging the run forever;
        # the context manager releases the streamed connection on every path.
        with requests.get(image_url, headers=headers, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
        os.replace(tmp_path, save_path)
        print(f"已下载: {image_name}")
        return True

    except Exception as e:
        print(f"下载失败 {image_url}: {e}")
        # Best-effort cleanup of the partial file.
        if tmp_path is not None and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        return False

def main():
    """Download the full-size image behind each result on the first search page.

    Uses the module-level ``search_keyword``, ``headers`` and ``download_dir``.
    Only the first page of search results is requested. Each image is numbered
    by the 1-based position of its detail page in the result list.
    """
    # Fetch the first page of search results.
    search_url = get_search_page_urls(search_keyword)
    print(f"请求搜索页面: {search_url}")
    resp = requests.get(search_url, headers=headers, timeout=30)
    if resp.status_code != 200:
        print(f"无法访问搜索页面: 状态码 {resp.status_code}")
        return
    soup = BeautifulSoup(resp.text, "lxml")

    # Extract the detail-page links.
    detail_links = get_detail_page_links(soup)
    print(f"找到 {len(detail_links)} 个详细页面链接")

    for idx, detail_url in enumerate(detail_links, 1):
        try:
            detail_resp = requests.get(detail_url, headers=headers, timeout=30)
            if detail_resp.status_code != 200:
                print(f"无法访问详细页面: 状态码 {detail_resp.status_code}")
            else:
                detail_soup = BeautifulSoup(detail_resp.text, "lxml")
                high_res_url = get_high_res_image_url(detail_soup)
                if high_res_url:
                    download_image(high_res_url, download_dir, idx)
        except Exception as e:
            print(f"处理详细页面失败: {e}")

        # Pause after every detail-page request, including failed ones, to
        # avoid getting blocked.
        time.sleep(1)


# Run the scraper when this code is executed as the main module.
if __name__ == "__main__":
    main()


In [None]:
# Search term and site root read by the scraping functions.
search_keyword = "midge"
base_url = "https://bugguide.net"

# Request headers carrying a browser-like User-Agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
    ),
}

# Images are stored in "<keyword>_images"; create the directory up front.
download_dir = search_keyword + "_images"
os.makedirs(download_dir, exist_ok=True)

def get_search_page_urls(search_keyword, page=1):
    """Build the URL of one page of image search results.

    Args:
        search_keyword: Search term placed in the ``keys`` query parameter.
        page: Value placed in the ``page`` query parameter (default 1).

    Returns:
        The search URL as a string, rooted at the module-level ``base_url``.
    """
    # Local import keeps this function self-contained with respect to the
    # file-level import list.
    from urllib.parse import quote_plus

    # Percent-encode the keyword so spaces, '&', '#', etc. cannot break or
    # truncate the query string. Plain alphanumeric keywords are unchanged.
    encoded_keyword = quote_plus(str(search_keyword))
    return f"{base_url}/index.php?q=search&keys={encoded_keyword}&edit%5Btype%5D%5Bbgimage%5D=on&page={page}"

def get_detail_page_links(soup):
    """Gather detail-page URLs from the thumbnails of a search-results page.

    Looks at every ``<img class="bgimage-thumb">``, takes the ``href`` of its
    enclosing ``<a>`` (if there is one) and resolves it against the
    module-level ``base_url``.

    Returns:
        A list of absolute URLs in document order (possibly empty).
    """
    results = []
    for image_tag in soup.find_all("img", attrs={'class': "bgimage-thumb"}):
        link_tag = image_tag.find_parent("a")
        # No enclosing link, or a link without an href, yields nothing.
        target = link_tag.get("href") if link_tag else None
        if target:
            results.append(urljoin(base_url, link_tag["href"]))
    return results

def get_high_res_image_url(soup):
    """Find the full-size image on a detail page and return its absolute URL.

    The first ``<img>`` whose ``src`` contains ``images/cache/`` or
    ``images/raw/`` wins; its ``src`` is resolved against the module-level
    ``base_url``.

    Returns:
        The image URL, or ``None`` (after printing a notice) when nothing
        matches.
    """
    sources = (img.get("src", "") for img in soup.find_all("img"))
    for candidate in sources:
        from_cache = "images/cache/" in candidate
        if from_cache or "images/raw/" in candidate:
            return urljoin(base_url, candidate)

    print("未找到高清图片链接")
    return None

def download_image(image_url, save_dir, idx):
    """Download one image into ``save_dir`` under a sequential file name.

    The file is named ``<search_keyword>_<idx as 3 digits>.jpg`` (for example
    ``midge_001.jpg``), using the module-level ``search_keyword``. Requests are
    sent with the module-level ``headers``.

    Args:
        image_url: Absolute URL of the image to fetch.
        save_dir: Existing directory the file is written into.
        idx: Integer sequence number used in the file name.

    Returns:
        ``True`` when the file was written completely, ``False`` when any
        error occurred (the error is printed, not raised).
    """
    tmp_path = None
    try:
        image_name = f"{search_keyword}_{idx:03d}.jpg"
        save_path = os.path.join(save_dir, image_name)
        # Stream into a temporary name first so an interrupted transfer never
        # leaves a truncated file under the final ".jpg" name.
        tmp_path = save_path + ".part"
        # The timeout prevents a stalled server from hanging the run forever;
        # the context manager releases the streamed connection on every path.
        with requests.get(image_url, headers=headers, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
        os.replace(tmp_path, save_path)
        print(f"已下载: {image_name}")
        return True

    except Exception as e:
        print(f"下载失败 {image_url}: {e}")
        # Best-effort cleanup of the partial file.
        if tmp_path is not None and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        return False

def main(max_images=500):
    """Download the full-size image behind each result on the first search page.

    Uses the module-level ``search_keyword``, ``headers`` and ``download_dir``.
    Only the first page of search results is requested.

    Args:
        max_images: Stop once this many images have been downloaded
            (default 500).
    """
    # Fetch the first page of search results.
    search_url = get_search_page_urls(search_keyword)
    print(f"请求搜索页面: {search_url}")
    resp = requests.get(search_url, headers=headers, timeout=30)
    if resp.status_code != 200:
        print(f"无法访问搜索页面: 状态码 {resp.status_code}")
        return
    soup = BeautifulSoup(resp.text, "lxml")

    # Extract the detail-page links.
    detail_links = get_detail_page_links(soup)
    print(f"找到 {len(detail_links)} 个详细页面链接")

    downloaded_count = 0  # number of images downloaded so far
    for detail_url in detail_links:
        if downloaded_count >= max_images:
            print(f"已下载{max_images}张图片，程序终止。")
            break

        try:
            detail_resp = requests.get(detail_url, headers=headers, timeout=30)
            if detail_resp.status_code != 200:
                print(f"无法访问详细页面: 状态码 {detail_resp.status_code}")
            else:
                detail_soup = BeautifulSoup(detail_resp.text, "lxml")
                high_res_url = get_high_res_image_url(detail_soup)
                if high_res_url:
                    # File names follow the running download count.
                    status = download_image(high_res_url, download_dir, downloaded_count + 1)
                    # Only an explicit False is treated as a failed download;
                    # a helper that returns nothing (None) is counted as
                    # success.
                    if status is not False:
                        downloaded_count += 1
        except Exception as e:
            print(f"处理详细页面失败: {e}")

        # Pause after every detail-page request, including failed ones, to
        # avoid getting blocked.
        time.sleep(1)


# Run the scraper when this code is executed as the main module.
if __name__ == "__main__":
    main()


In [None]:
# Search term and site root for this run.
search_keyword = "Armyworm"
base_url = "https://bugguide.net"

# Request headers carrying a browser-like User-Agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
    ),
}

# Images are stored in "<keyword>_images"; create the directory up front.
download_dir = search_keyword + "_images"
os.makedirs(download_dir, exist_ok=True)

# Start the scraper for this keyword when executed as the main module.
if __name__ == "__main__":
    main()


In [None]:
# Search term and site root for this run.
search_keyword = "Wireworm"
base_url = "https://bugguide.net"

# Request headers carrying a browser-like User-Agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
    ),
}

# Images are stored in "<keyword>_images"; create the directory up front.
download_dir = search_keyword + "_images"
os.makedirs(download_dir, exist_ok=True)

# Start the scraper for this keyword when executed as the main module.
if __name__ == "__main__":
    main()


In [None]:
# Search term and site root for this run.
search_keyword = "Sawfly"
base_url = "https://bugguide.net"

# Request headers carrying a browser-like User-Agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
    ),
}

# Images are stored in "<keyword>_images"; create the directory up front.
download_dir = search_keyword + "_images"
os.makedirs(download_dir, exist_ok=True)

# Start the scraper for this keyword when executed as the main module.
if __name__ == "__main__":
    main()


In [None]:
# Search term and site root for this run.
search_keyword = "Thrips"
base_url = "https://bugguide.net"

# Request headers carrying a browser-like User-Agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
    ),
}

# Images are stored in "<keyword>_images"; create the directory up front.
download_dir = search_keyword + "_images"
os.makedirs(download_dir, exist_ok=True)

# Start the scraper for this keyword when executed as the main module.
if __name__ == "__main__":
    main()

In [None]:
# Search term and site root for this run.
search_keyword = "Mite"
base_url = "https://bugguide.net"

# Request headers carrying a browser-like User-Agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
    ),
}

# Images are stored in "<keyword>_images"; create the directory up front.
download_dir = search_keyword + "_images"
os.makedirs(download_dir, exist_ok=True)

# Start the scraper for this keyword when executed as the main module.
if __name__ == "__main__":
    main()