In [3]:
!pip install requests
!pip install beautifulsoup4



In [4]:
import os
import re
import time
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

In [17]:
# --- Configuration ---------------------------------------------------------
BASE_URL = "https://diatoms.org"
SPECIES_LIST_URL = "https://diatoms.org/species"

# Every downloaded image is written (flat) into this folder.
SAVE_FOLDER = 'diatom_dataset'
os.makedirs(SAVE_FOLDER, exist_ok=True)

# (connect, read) timeouts in seconds. requests has no default timeout, so
# without this a stalled connection would block the cell indefinitely.
REQUEST_TIMEOUT = (10, 30)

# Desktop-Chrome User-Agent sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# --- Fetch the species index page ------------------------------------------
# `response` is consumed by the following cell, which parses the genus list.
try:
    response = requests.get(SPECIES_LIST_URL, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    print("웹사이트 접속 성공")

except requests.exceptions.RequestException as e:
    print(f"웹사이트 접속 실패: {e}")
    # Re-raise rather than call exit(): inside a notebook exit() asks the kernel
    # to shut down, while an exception just stops this cell (and a "Run All")
    # with the cause visible, so later cells never run without `response`.
    raise

웹사이트 접속 성공


In [18]:
# --- Crawl: genus index -> genus pages -> species pages -> images -----------

# Network settings for this crawl (defined here so the cell runs on its own).
REQUEST_TIMEOUT = (10, 30)  # (connect, read) seconds; requests has no default timeout
REQUEST_DELAY = 1           # pause in seconds after each page request

# Path separators and other characters that common filesystems reject in names.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _safe_filename(name):
    """Return ``name`` with filesystem-unsafe characters replaced by ``_``."""
    return _UNSAFE_FILENAME_CHARS.sub('_', name).strip()


def _fetch(session, url):
    """GET ``url`` through ``session`` and return the response.

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts,
            or an HTTP error status (via ``raise_for_status``).
    """
    page = session.get(url, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    return page


def _unique_species(genus_soup):
    """Return ordered ``(name, absolute_url)`` pairs for the species on a genus page.

    Links without visible text are skipped, and each species URL is reported
    only once even when the page links to it several times.
    """
    seen_urls = set()
    species = []
    for link in genus_soup.select('.taxa-grid a[href*="/species/"]'):
        name = link.text.strip()
        if not name:
            continue
        url = urljoin(BASE_URL, link['href'])
        if url in seen_urls:
            continue
        seen_urls.add(url)
        species.append((name, url))
    return species


def _download_species_images(session, species_name, species_url):
    """Save every ``.image-set img`` found on one species page into SAVE_FOLDER.

    A failure on a single image is reported and skipped so the remaining
    images of the species are still downloaded.

    Raises:
        requests.exceptions.RequestException: if the species page itself
            cannot be fetched.
    """
    species_soup = BeautifulSoup(_fetch(session, species_url).text, 'html.parser')
    image_tags = species_soup.select('.image-set img')

    if not image_tags:
        print("    -> 이미지를 찾지 못했습니다.")
        return

    print(f"    -> {len(image_tags)}개의 이미지 다운로드 시작...")
    for image_tag in image_tags:
        img_src = image_tag.get('src')
        if not img_src:
            continue

        img_full_url = urljoin(BASE_URL, img_src)

        # File name pattern: <species name>_<original file name>.
        # Only the URL path is used so a query string never ends up in the name.
        original_filename = os.path.basename(urlsplit(img_full_url).path)
        save_path = os.path.join(
            SAVE_FOLDER, _safe_filename(f"{species_name}_{original_filename}")
        )

        try:
            img_response = _fetch(session, img_full_url)
            with open(save_path, 'wb') as file:
                file.write(img_response.content)
        except (requests.exceptions.RequestException, OSError) as e:
            print(f"      -> 이미지 저장 실패 ({img_full_url}): {e}")
            continue
        print(f"      -> 저장 완료: {save_path}")


soup = BeautifulSoup(response.text, 'html.parser')

genus_links = soup.select('#genera-list h2 a')

print(f"\n총 {len(genus_links)}개의 속(Genus)을 찾았습니다.")

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
}

# One Session for the whole crawl: headers are set once and connections reused.
with requests.Session() as session:
    session.headers.update(headers)

    for genus_link in genus_links:
        relative_path = genus_link.get('href', '')

        # Only anchors that point at a genus page are followed.
        if '/genera' not in relative_path:
            continue

        genus_name = genus_link.text.strip()
        full_url = urljoin(BASE_URL, relative_path)

        print(f"이름: {genus_name}, 링크: {relative_path}")

        try:
            genus_page_response = _fetch(session, full_url)
        except requests.exceptions.RequestException as e:
            print(f"페이지에 접속 실패: {e}")
            continue
        finally:
            # Pause after every genus request, including failed ones.
            time.sleep(REQUEST_DELAY)

        genus_soup = BeautifulSoup(genus_page_response.text, 'html.parser')
        species = _unique_species(genus_soup)

        print(f" -> 총 {len(species)}개의 종(Species)을 찾았습니다.")

        for species_name, species_full_url in species:
            print(f"  종 이름: {species_name}, 링크: {species_full_url}")

            try:
                _download_species_images(session, species_name, species_full_url)
            except requests.exceptions.RequestException as e:
                print(f"  -> 종 페이지({species_name}) 처리 중 에러: {e}")
            finally:
                # Pause after every species page, whether or not it had images.
                time.sleep(REQUEST_DELAY)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
    -> 7개의 이미지 다운로드 시작...
      -> 저장 완료: diatom_dataset/Mastogloia elliptica_Maselliptica1.jpg
      -> 저장 완료: diatom_dataset/Mastogloia elliptica_Maselliptica2.jpg
      -> 저장 완료: diatom_dataset/Mastogloia elliptica_Maselliptica3.jpg
      -> 저장 완료: diatom_dataset/Mastogloia elliptica_Maselliptica4.jpg
      -> 저장 완료: diatom_dataset/Mastogloia elliptica_Maselliptica4L.jpg
      -> 저장 완료: diatom_dataset/Mastogloia elliptica_Maselliptica5.jpg
      -> 저장 완료: diatom_dataset/Mastogloia elliptica_Maselliptica6.jpg
  종 이름: Mastogloia floridensis, 링크: https://diatoms.org/species/301254/mastogloia-floridensis
    -> 12개의 이미지 다운로드 시작...
      -> 저장 완료: diatom_dataset/Mastogloia floridensis_Mastogloia-floridensis-LM3.jpg
      -> 저장 완료: diatom_dataset/Mastogloia floridensis_Mastogloia-floridensis-LM4.jpg
      -> 저장 완료: diatom_dataset/Mastogloia floridensis_Mastogloia-floridensis-LM7.jpg
      -> 저장 완료: diatom_dataset/Mastogloia floridensis_Mas

In [13]:
!rm -rf diatom_dataset

In [19]:
import shutil
from google.colab import files

# Pack the scraped images into a single zip file and hand it to the browser
# through Colab's download helper.
folder_to_zip = 'diatom_dataset'
output_filename = 'diatom_dataset_archive'

# make_archive appends the '.zip' extension to the base name on its own.
archive_name = f'{output_filename}.zip'

shutil.make_archive(base_name=output_filename, format='zip', root_dir=folder_to_zip)
files.download(archive_name)

print(f"'{archive_name}' 파일 다운로드")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

'diatom_dataset_archive.zip' 파일 다운로드
