## Download images at https://azurlane.koumakan.jp/

---
### 【0】首先导入必要的包，定义一些变量

In [1]:
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
headers = {"User-Agent": UserAgent}

Ship_Image_URL = "https://azurlane.koumakan.jp/wiki/List_of_Ships_by_Image"
save_folder = "/kaggle/working/20250328"
os.makedirs(save_folder, exist_ok=True)



---
### 【1】先写一个从Ship_Image_URL获取所有Ship的Gallery的绝对网址

In [2]:
def get_ship_links():
    
    main_url = "https://azurlane.koumakan.jp/wiki/List_of_Ships_by_Image"
    base_url = "https://azurlane.koumakan.jp"  # 用于拼接完整 URL
    gall_url = "/Gallery"

    try:
        response = requests.get(main_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        i1 = 0
        i2 = 0
        i3 = 0
        ship_names = []
        ship_links = []
        
        # 找到所有 <div class="azl-shipcard">
        for ship_card in soup.find_all("div", class_="azl-shipcard"):
            i1 += 1
            # 在 ship_card 下找到 <div class="alc-top chuncate"> 里的 <a>
            alc_top = ship_card.find("div", class_="alc-top truncate")
            if alc_top:
                i2 += 1
                a_tag = alc_top.find("a")
                if a_tag and a_tag.get("href"):
                    i3 += 1
                    link = f"{base_url}{a_tag['href']}{gall_url}"
                    ship_links.append(link)
                    ship_names.append(a_tag.get_text(strip=True))

        print(f"共有{i1}个符合特征 <div class='azl-shipcard'>")
        print(f"共有{i2}个符合特征 <div class='azl-shipcard'> && <a>")
        print(f"共有{i3}个符合特征 <div class='azl-shipcard'> && <a> && get('href')")
        return ship_names, ship_links
    
    except Exception as e:
        print(f"x 提取舰船链接失败: {e}")
        return [], []



---
### 【2】然后实现从某Ship的Gallery得到所有Skin的wiki下载链接
- 注：提取full_url与image_name时可能会有重复项（且两者会同时与前面提取的元素重复），一个折中的办法是消除重复项，如下函数所示

In [3]:
# # 旧版本，无禁用词，对Dafault&Censored嵌套无法处理
# def get_image_links(url, base_name=None):  # todo
#     # 确定基础名称
#     if base_name is None:
#         parsed_url = urlparse(url)
#         path_segments = [p for p in parsed_url.path.split('/') if p]
#         base_name = path_segments[-2] if len(path_segments) >= 2 else 'Unknown'
    
#     # 发送HTTP请求
#     response = requests.get(url)
#     response.raise_for_status()
    
#     # 解析HTML内容
#     soup = BeautifulSoup(response.content, 'html.parser')
    
#     # 初始化结果列表
#     url_list = []
#     name_list = []
    
#     # 用于提取根名的正则表达式
#     pattern = re.compile(r'^tabber-tab-(.+?)-\d+$')
    
#     # 查找所有article标签
#     for article in soup.find_all('article', class_='tabber__panel'):
#         # 提取aria-labelledby属性
#         aria_label = article.get('aria-labelledby', '')
        
#         # 使用正则匹配根名
#         match = pattern.match(aria_label)
#         if not match:
#             continue
#         root_name = match.group(1)
        
#         # 处理两种类型的div
#         for div_type in ['shipskin-chibi', 'shipskin-image']:
#             div = article.find('div', class_=div_type)
#             if not div:
#                 continue
                
#             # 提取链接
#             link = div.find('a')
#             if not link or 'href' not in link.attrs:
#                 continue
                
#             # 构建完整URL
#             full_url = urljoin(url, link['href'])
            
#             # 构建图片名称
#             suffix = '_Chibi' if div_type == 'shipskin-chibi' else ''
#             image_name = f"{base_name}_{root_name}{suffix}"
            
#             # 添加到结果列表
#             name_list.append(image_name)
#             url_list.append(full_url)
    
#     return name_list, url_list


def get_image_links(url, base_name=None):
    # 获取基础名称
    if base_name is None:
        parsed = urlparse(url)
        path_segments = [p for p in parsed.path.split('/') if p]
        base_name = path_segments[-2] if len(path_segments) >= 2 else 'Unknown'
    
    # 获取并解析网页
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    
    image_urls = []
    image_names = []
    last_valid_root = None  # 用于处理Default逻辑
    
    # 遍历所有article标签
    for article in soup.find_all('article', class_='tabber__panel'):
        aria_label = article.get('aria-labelledby', '')
        
        # 跳过包含禁用词的标签
        if any(banned in aria_label for banned in ['Without_BG', 'Censored', 'EN']):
            continue
            
        # 解析aria-labelledby属性
        parts = aria_label.split('-')
        if len(parts) < 4 or not parts[-1].isdigit():
            continue  # 不符合格式要求
            
        root_part = '-'.join(parts[2:-1])  # 提取子图名称部分
        k_value = parts[-1]  # 提取k值
        
        # 处理HTML实体转义（如&#039;转'）
        root_part = BeautifulSoup(root_part, 'html.parser').get_text()
        
        # 处理Default逻辑
        if root_part == "Default":
            if k_value == "0":
                current_root = "Default"
                last_valid_root = current_root  # 记录但不强制使用
            else:
                current_root = last_valid_root or "Default"  # 优先使用上次有效名称
        else:
            current_root = root_part
            last_valid_root = current_root  # 更新有效名称
        
        # 处理两种图片容器
        for div_class in ['shipskin-chibi', 'shipskin-image']:
            container = article.find('div', class_=div_class)
            if not container:
                continue
                
            # 提取图片链接
            link = container.find('a', href=True)
            if not link:
                continue
                
            # 构建完整URL
            full_url = urljoin(url, link['href'])
            
            # 构建图片名称
            suffix = '_Chibi' if div_class == 'shipskin-chibi' else ''
            image_name = f"{base_name}_{current_root}{suffix}"
            
            # 清理特殊字符（保留字母数字、下划线、连字符、问号）
            image_name = re.sub(r'[^\w\-?]', '_', image_name)  # 替换非法字符
            image_name = re.sub(r'_+', '_', image_name)        # 合并连续下划线

            if (full_url not in image_urls) and (image_name not in image_names):  # 两者全不相同时满足
                image_names.append(image_name)
                image_urls.append(full_url)
    
    return image_names, image_urls


# 测试函数功能：
gallery_url = "https://azurlane.koumakan.jp/wiki/Helena/Gallery"  # Helena: 6 x 2
gallery_url = "https://azurlane.koumakan.jp/wiki/Belfast/Gallery"  # Belfast: 9 x 2
links, names = get_image_links(gallery_url)
print(f"找到 {len(links)} 个子图， {len(names)} 个名称\n")

for name, link in zip(names, links):
    print(name, link)


找到 18 个子图， 18 个名称

https://azurlane.koumakan.jp/wiki/File:BelfastChibi.png Belfast_Default_Chibi
https://azurlane.koumakan.jp/wiki/File:Belfast.png Belfast_Default
https://azurlane.koumakan.jp/wiki/File:BelfastWeddingChibi.png Belfast_Wedding_Chibi
https://azurlane.koumakan.jp/wiki/File:BelfastWedding.png Belfast_Wedding
https://azurlane.koumakan.jp/wiki/File:BelfastSpringChibi.png Belfast_Iridescent_Rosa_Chibi
https://azurlane.koumakan.jp/wiki/File:BelfastSpring.png Belfast_Iridescent_Rosa
https://azurlane.koumakan.jp/wiki/File:BelfastBlurayChibi.png Belfast_Serene_Steel_Chibi
https://azurlane.koumakan.jp/wiki/File:BelfastBluray.png Belfast_Serene_Steel
https://azurlane.koumakan.jp/wiki/File:BelfastPartyChibi.png Belfast_The_Noble_Attendant_Chibi
https://azurlane.koumakan.jp/wiki/File:BelfastParty.png Belfast_The_Noble_Attendant
https://azurlane.koumakan.jp/wiki/File:BelfastCasualChibi.png Belfast_Shopping_with_the_Head_Maid_Chibi
https://azurlane.koumakan.jp/wiki/File:BelfastCasual.p

---
### 【3】最后需要写一个函数实现从wiki上下载原图

In [4]:
# # 旧版本，下载时可能会报错：Connection broken: IncompleteRead
# def download_wiki_image_fullsize(file_page_url, save_path):  # todo
#     """
#     下载维基文件页面的原始尺寸图片（兼容最新BeautifulSoup版本）
    
#     参数:
#         file_page_url: 文件页面URL 
#         save_path: 图片保存路径
#     """
#     try:
#         # 1. 获取文件页面
#         response = requests.get(file_page_url, headers=headers)
#         response.raise_for_status()
#         soup = BeautifulSoup(response.text, 'html.parser')
        
#         # 2. 查找原始文件链接（新版写法）
#         full_res_link = None
#         for a_tag in soup.find_all("a"):
#             # 中文维基"原始文件"或英文维基"Original file"
#             if a_tag.string == "原始文件" or a_tag.string == "Original file":
#                 full_res_link = a_tag
#                 break
        
#         if not full_res_link:
#             raise ValueError("找不到原始文件链接")
        
#         # 3. 获取原始图片URL
#         original_img_url = urljoin(file_page_url, full_res_link["href"])
        
#         # 4. 下载并保存
#         os.makedirs(os.path.dirname(save_path), exist_ok=True)
#         img_data = requests.get(original_img_url, headers=headers).content
        
#         with open(save_path, "wb") as f:
#             f.write(img_data)
        
#         print(f"✓ 原始尺寸图片已保存到: {save_path}")
#         return True
    
#     except Exception as e:
#         print(f"× 下载失败: {str(e)}")
#         return False


def download_wiki_image_fullsize(file_page_url, save_path, max_retries=3):
    """
    下载维基文件页面的原始尺寸图片（兼容最新BeautifulSoup版本）
    
    参数:
        file_page_url: 文件页面URL 
        save_path: 图片保存路径
        max_retries: 最大重试次数 (默认3次)
    """
    retry_count = 0
    
    while retry_count < max_retries:
        try:
            # 1. 获取文件页面
            response = requests.get(file_page_url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # 2. 查找原始文件链接（新版写法）
            full_res_link = None
            for a_tag in soup.find_all("a"):
                # 中文维基"原始文件"或英文维基"Original file"
                if a_tag.string and (a_tag.string.strip() == "原始文件" or a_tag.string.strip() == "Original file"):
                    full_res_link = a_tag
                    break
            
            if not full_res_link:
                print(f"ERROR: 找不到原始文件链接: {file_page_url}")
                break
            
            # 3. 获取原始图片URL
            original_img_url = urljoin(file_page_url, full_res_link["href"])
            
            # 4. 下载并保存
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            
            # 使用流式下载大文件
            with requests.get(original_img_url, headers=headers, stream=True, timeout=30) as img_response:
                img_response.raise_for_status()
                
                # 检查内容长度
                total_size = int(img_response.headers.get('content-length', 0))
                downloaded_size = 0
                
                with open(save_path, "wb") as f:
                    for chunk in img_response.iter_content(chunk_size=8192):
                        if chunk:  # 过滤掉保持连接的新块
                            f.write(chunk)
                            downloaded_size += len(chunk)
                
                # 验证下载完整性
                if total_size > 0 and downloaded_size != total_size:
                    raise IOError(f"WARNING: 下载不完整 (预期: {total_size}, 实际: {downloaded_size} 字节)")
            
            print(f"✓ 原始尺寸图片已保存到: {save_path}")
            return True
            
        except requests.exceptions.RequestException as e:
            retry_count += 1
            print(f"× 下载失败 (尝试 {retry_count}/{max_retries}): {str(e)}")
            if retry_count < max_retries:
                time.sleep(0.1 * 2 ** retry_count)  # 指数退避
            else:
                print(f"× 达到最大重试次数，放弃下载: {file_page_url}")
                return False
                
        except Exception as e:
            print(f"× 发生错误: {str(e)}")
            return False
            

# 使用示例
download_wiki_image_fullsize(
    "https://azurlane.koumakan.jp/wiki/File:Specialized_Bulin_Custom_MKIII.png",
    save_folder + "/test_3/Bulin_MKIII.png"
)
download_wiki_image_fullsize(
    "https://azurlane.koumakan.jp/wiki/File:HelenaPartyChibi.png",
    save_folder + "/test_3/HelenaPartyChibi.png"
)


✓ 原始尺寸图片已保存到: /kaggle/working/20250328/test_3/Bulin_MKIII.png
× 找不到原始文件链接: https://azurlane.koumakan.jp/wiki/File:HelenaPartyChibi.png


---
### 【4】函数已经定义完成，开始准备下载：

#### 【4.1】get_ship_links函数得到所有Ship的Gallery路径

In [5]:
ship_names, ship_links = get_ship_links()
print(f"√ 找到 {len(ship_names)} 个Ship链接")
# print(ship_links)

# 输出前几个提取的Ship Gallery URL看看效果~
for i in range(10):
    print(f"Ship: {ship_names[i]}: {ship_links[i]}")


共有873个符合特征 <div class='azl-shipcard'>
共有873个符合特征 <div class='azl-shipcard'> && <a>
共有873个符合特征 <div class='azl-shipcard'> && <a> && get('href')
✅ 找到 873 个Ship链接
🚢 Universal Bulin: https://azurlane.koumakan.jp/wiki/Universal_Bulin/Gallery
🚢 Prototype Bulin MKII: https://azurlane.koumakan.jp/wiki/Prototype_Bulin_MKII/Gallery
🚢 Specialized Bulin Custom MKIII: https://azurlane.koumakan.jp/wiki/Specialized_Bulin_Custom_MKIII/Gallery
🚢 Dewey: https://azurlane.koumakan.jp/wiki/Dewey/Gallery
🚢 Cassin: https://azurlane.koumakan.jp/wiki/Cassin/Gallery
🚢 Downes: https://azurlane.koumakan.jp/wiki/Downes/Gallery
🚢 Gridley: https://azurlane.koumakan.jp/wiki/Gridley/Gallery
🚢 Craven: https://azurlane.koumakan.jp/wiki/Craven/Gallery
🚢 McCall: https://azurlane.koumakan.jp/wiki/McCall/Gallery
🚢 Maury: https://azurlane.koumakan.jp/wiki/Maury/Gallery


#### 【4.2】get_image_links函数得到Ship的所有Skin的wiki下载路径

- 包括shipskin-chibi和shipskin-image
- 不含Without_BG，Censored和EN

In [6]:
image_info_dict = dict()

# 再提取前几个Ship的所有Skin的wiki URL看看效果~
for i in range(5):
    image_names, image_urls = get_image_links(url=ship_links[i])
    image_info_dict[ship_names[i]] = list(zip(image_names, image_urls))

import yaml
print(yaml.dump(image_info_dict, allow_unicode=True, default_flow_style=False))

Cassin:
- !!python/tuple
  - Cassin_Default_Chibi
  - https://azurlane.koumakan.jp/wiki/File:CassinChibi.png
- !!python/tuple
  - Cassin_Default
  - https://azurlane.koumakan.jp/wiki/File:Cassin.png
- !!python/tuple
  - Cassin_Retrofit_Chibi
  - https://azurlane.koumakan.jp/wiki/File:CassinKaiChibi.png
- !!python/tuple
  - Cassin_Retrofit
  - https://azurlane.koumakan.jp/wiki/File:CassinKai.png
- !!python/tuple
  - Cassin_Shopping_Carte_Blanche_Chibi
  - https://azurlane.koumakan.jp/wiki/File:CassinSchoolChibi.png
- !!python/tuple
  - Cassin_Shopping_Carte_Blanche
  - https://azurlane.koumakan.jp/wiki/File:CassinSchool.png
Dewey:
- !!python/tuple
  - Dewey_Default_Chibi
  - https://azurlane.koumakan.jp/wiki/File:DeweyChibi.png
- !!python/tuple
  - Dewey_Default
  - https://azurlane.koumakan.jp/wiki/File:Dewey.png
- !!python/tuple
  - Dewey_Summer_Longing_Chibi
  - https://azurlane.koumakan.jp/wiki/File:DeweySummerChibi.png
- !!python/tuple
  - Dewey_Summer_Longing
  - https://azurlane.

#### 【4.3】再加上download_wiki_image_fullsize下载函数就可以下载啦~
- 这里就不再尝试了

In [7]:
# idx = 0
# idx_max = 10

# for key in image_info_dict.keys():
#     idx += 1
#     dl_ship_dir = save_folder + f"test_4/{key}"
    
#     for skin_tuple in image_info_dict[key]:
#         skin_name, skin_url = skin_tuple
#         download_wiki_image_fullsize(file_page_url=skin_url, save_path=dl_ship_dir + '/' + skin_name)
    
#     if idx >= idx_max:
#         break



---
### 【5】最后，借助两个大循环，与前面得到的ship_links和ship_names，完整版的代码如下

In [8]:
fig_folder = save_folder + "/figs"

ship_num = len(ship_links)

image_info_dict = dict()

for i in range(ship_num):
    image_names, image_urls = get_image_links(url=ship_links[i])
    image_info_dict[ship_names[i]] = list(zip(image_names, image_urls))

for key in image_info_dict.keys():
    dl_ship_dir = fig_folder + f"/{key}"
    
    for skin_tuple in image_info_dict[key]:
        skin_name, skin_url = skin_tuple
        download_wiki_image_fullsize(file_page_url=skin_url, save_path=dl_ship_dir + '/' + skin_name + '.png')

    # print(f"Complete downloading images for ship: {image_info_dict[key]}")



× 找不到原始文件链接: https://azurlane.koumakan.jp/wiki/File:Universal_BulinChibi.png
✓ 原始尺寸图片已保存到: /kaggle/working/20250328/figs/Universal Bulin/Universal_Bulin_Default.png
× 找不到原始文件链接: https://azurlane.koumakan.jp/wiki/File:Universal_BulinEventChibi.png
✓ 原始尺寸图片已保存到: /kaggle/working/20250328/figs/Universal Bulin/Universal_Bulin_Universal_Enhanced_Rigging_Experimental_α.png
× 找不到原始文件链接: https://azurlane.koumakan.jp/wiki/File:Prototype_Bulin_MKIIChibi.png
✓ 原始尺寸图片已保存到: /kaggle/working/20250328/figs/Prototype Bulin MKII/Prototype_Bulin_MKII_Default.png
× 找不到原始文件链接: https://azurlane.koumakan.jp/wiki/File:Prototype_Bulin_MKIIEventChibi.png
✓ 原始尺寸图片已保存到: /kaggle/working/20250328/figs/Prototype Bulin MKII/Prototype_Bulin_MKII_Prototype_Advanced_Weaponry_Prop_.png
× 找不到原始文件链接: https://azurlane.koumakan.jp/wiki/File:Specialized_Bulin_Custom_MKIIIChibi.png
✓ 原始尺寸图片已保存到: /kaggle/working/20250328/figs/Specialized Bulin Custom MKIII/Specialized_Bulin_Custom_MKIII_Default.png
× 找不到原始文件链接: https://azurlane.