In [5]:
import os
import threading
import time
from urllib.parse import quote

import requests
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [7]:
# Crawl settings
SEARCH_KEYWORD = "문상훈"        # base search term sent to Pinterest
NUMBER_OF_IMAGES = 20_000        # overall image target, split evenly between threads
THREAD_COUNT = 3                 # crawler threads started per search variant
SAVE_DIR = "pinterest_images"    # output directory (relative path)

In [10]:
# Search query variants to crawl. Only the bare keyword is enabled; add one of
# the disabled variants below to the list to crawl it as well:
#   f"{SEARCH_KEYWORD} 밈"
#   f"{SEARCH_KEYWORD} 짤"
search_variants = [SEARCH_KEYWORD]


In [11]:
# 크롤링 함수 정의 


# Create the output directory. exist_ok=True keeps the cell safe to re-run and
# avoids the race between a separate os.path.exists() check and the creation.
os.makedirs(SAVE_DIR, exist_ok=True)
    

def download_image(url, save_path, retries=3):
    """Download ``url`` to ``save_path``, making at most ``retries`` attempts.

    The response body is streamed into a temporary ``<save_path>.part`` file
    that is moved into place only after the whole body has been written, so a
    failed attempt never leaves a truncated image at ``save_path``.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file path.
        retries: Maximum number of attempts. Values <= 0 make no request.

    Returns:
        True if the image was saved; False if every attempt failed (non-200
        status, request error, or file-system error).
    """
    partial_path = f"{save_path}.part"
    attempts_left = retries
    while attempts_left > 0:
        attempts_left -= 1
        try:
            # Using the response as a context manager closes it on every path,
            # which releases the streamed connection instead of leaking it.
            with requests.get(url, stream=True, timeout=10) as response:
                if response.status_code != 200:
                    continue
                with open(partial_path, 'wb') as file:
                    for chunk in response.iter_content(1024):
                        file.write(chunk)
            # Publish the file only once it is complete.
            os.replace(partial_path, save_path)
            return True
        except (requests.RequestException, OSError):
            # Best-effort cleanup of a half-written temp file before retrying;
            # it may not exist if the failure happened before open().
            try:
                os.remove(partial_path)
            except OSError:
                pass
    return False

# 스크롤해서 크롤링 
def scroll_and_collect_images(search_query, thread_index, progress):
    """Scroll a Pinterest search page, collect image URLs, then download them.

    Opens a Chrome session on the search results for ``search_query``, presses
    END repeatedly so the page loads more pins, and gathers the ``src`` of
    every ``<img>`` element until this worker's share of the target
    (``NUMBER_OF_IMAGES // THREAD_COUNT``) or the scroll limit is reached.
    Every collected URL is then saved into ``SAVE_DIR`` via ``download_image``.

    Args:
        search_query: Search text; percent-encoded before it is put in the URL.
        thread_index: Index of this worker. Used in file names, log messages,
            and as this worker's slot in ``progress``.
        progress: Shared list of per-worker download counts, updated in place.

    Returns:
        None. The Chrome session is always closed, even on error.

    NOTE(review): workers started with the same query open the same results
    page, so they probably collect overlapping URLs and download duplicates
    under different file names — confirm and consider partitioning the work.
    """
    driver = None
    try:
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

        # Open the search results page (no login is performed). The query is
        # percent-encoded so spaces, '&', '#', etc. cannot corrupt the URL.
        encoded_query = quote(search_query, safe="")
        driver.get(f"https://kr.pinterest.com/search/pins/?q={encoded_query}&rs=typed")

        image_urls = set()
        target_count = NUMBER_OF_IMAGES // THREAD_COUNT  # this worker's share
        scroll_pause_time = 1
        retries = 5
        wait = WebDriverWait(driver, 10)
        scroll_attempts = 0
        max_scroll_attempts = 50

        while len(image_urls) < target_count:
            # The limit is checked (and the attempt counted) before the try
            # block so that a StaleElementReferenceException `continue` can
            # never bypass it and spin forever.
            if scroll_attempts >= max_scroll_attempts:
                print(f"Thread {thread_index}: 스크롤 한계에 도달하여 크롤링을 종료합니다.")
                break
            scroll_attempts += 1

            try:
                driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
                time.sleep(scroll_pause_time)

                image_elements = wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, "img")))
                for img_element in image_elements:
                    img_url = img_element.get_attribute("src")
                    if img_url and img_url.startswith("https"):
                        image_urls.add(img_url)  # the set de-duplicates
                        if len(image_urls) >= target_count:
                            break
            except StaleElementReferenceException:
                # The page re-rendered while we were reading it; try again.
                continue
            except Exception as e:
                retries -= 1
                if retries == 0:
                    print(f"Failed to scroll in thread {thread_index} after multiple retries: {e}")
                    break
                else:
                    time.sleep(1)

        # Save the collected images.
        file_prefix = search_query.replace(' ', '_')
        for idx, img_url in enumerate(image_urls):
            file_name = f"{file_prefix}_{thread_index}_{idx}.jpg"
            save_path = os.path.join(SAVE_DIR, file_name)
            if download_image(img_url, save_path):
                progress[thread_index] += 1
                total_downloaded = sum(progress)
                if total_downloaded % 100 == 0:
                    print(f"Total: {total_downloaded}/{NUMBER_OF_IMAGES} images downloaded.")
                    print(f"Progress: {total_downloaded / NUMBER_OF_IMAGES * 100:.2f}% completed.")
    finally:
        if driver is not None:
            driver.quit()
            
            
            
def start_crawling():
    """Start the crawler threads for every search variant and wait for them.

    For each entry in ``search_variants`` this starts ``THREAD_COUNT`` threads
    running ``scroll_and_collect_images``. Every thread gets its own index
    into the shared ``progress`` list, so no two threads increment the same
    counter even when several variants are enabled.
    """
    threads = []
    # One progress slot per thread across all variants.
    progress = [0] * (THREAD_COUNT * len(search_variants))

    for variant_index, search_variant in enumerate(search_variants):
        for i in range(THREAD_COUNT):
            # Unique per thread; equals `i` for the first variant.
            worker_index = variant_index * THREAD_COUNT + i
            thread = threading.Thread(
                target=scroll_and_collect_images,
                args=(search_variant, worker_index, progress)
            )
            threads.append(thread)
            thread.start()

    for thread in threads:
        thread.join()

    # join() does not re-raise exceptions from worker threads, so this message
    # is printed even if some workers failed.
    print(f"크롤링이 완료되었습니다. 이미지는 '{SAVE_DIR}' 폴더에 저장되었습니다.")

In [12]:
# Entry point. Exceptions raised inside worker threads are not propagated to
# this thread, so these handlers only see errors raised by start_crawling()
# itself.
if __name__ == "__main__":
    try:
        start_crawling()
    except ModuleNotFoundError as e:
        # Report the module that is actually missing rather than assuming it
        # is always webdriver_manager.
        print(f"Module '{e.name}' not found. Please install the package that provides it and re-run.")
    except Exception as e:
        print(f"An error occurred: {e}")

Exception in thread Thread-25 (scroll_and_collect_images):
Traceback (most recent call last):
  File "/Users/sanbyeol/miniforge3/envs/PJ2_env/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/Users/sanbyeol/miniforge3/envs/PJ2_env/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 772, in run_closure
    _threading_Thread_run(self)
  File "/Users/sanbyeol/miniforge3/envs/PJ2_env/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/3g/tlt8qmcx1nj7kqtx38v8dw380000gn/T/ipykernel_3298/651402953.py", line 32, in scroll_and_collect_images
  File "/Users/sanbyeol/miniforge3/envs/PJ2_env/lib/python3.10/site-packages/selenium/webdriver/chrome/webdriver.py", line 47, in __init__
    super().__init__(
  File "/Users/sanbyeol/miniforge3/envs/PJ2_env/lib/python3.10/site-packages/selenium/webdriver/chromium/webdriver.py", line 58, in __init__
    self.service.start()
  File "/Users/sanbyeol/minifor

Thread 0: 스크롤 한계에 도달하여 크롤링을 종료합니다.
Total: 100/20000 images downloaded.
Progress: 0.50% completed.
Total: 200/20000 images downloaded.
Progress: 1.00% completed.


Service process refused to terminate gracefully with SIGTERM, escalating to SIGKILL.
Traceback (most recent call last):
  File "/Users/sanbyeol/miniforge3/envs/PJ2_env/lib/python3.10/site-packages/selenium/webdriver/common/service.py", line 179, in _terminate_process
    self.process.wait(60)
  File "/Users/sanbyeol/miniforge3/envs/PJ2_env/lib/python3.10/subprocess.py", line 1209, in wait
    return self._wait(timeout=timeout)
  File "/Users/sanbyeol/miniforge3/envs/PJ2_env/lib/python3.10/subprocess.py", line 1951, in _wait
    raise TimeoutExpired(self.args, timeout)
subprocess.TimeoutExpired: Command '['/Users/sanbyeol/.wdm/drivers/chromedriver/mac64/142.0.7444.59/chromedriver-mac-arm64/chromedriver', '--port=50303']' timed out after 60 seconds


크롤링이 완료되었습니다. 이미지는 'pinterest_images' 폴더에 저장되었습니다.
