In [None]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc
from pixiv_crawler import download_imgs_from_artwork_id
import threading

In [None]:
def scroll_down(driver, n_times, pixels_per_step=1000, pause=0.3):
    """Scroll the current page down in several separate steps.

    Args:
        driver: Selenium WebDriver whose current page is scrolled.
        n_times: Number of scroll steps to perform.
        pixels_per_step: Vertical distance of every step, in pixels.
        pause: Seconds to sleep after every step.
    """
    actions = ActionChains(driver)

    for _ in range(n_times):
        # Each step is sent to the browser immediately, then we pause before
        # issuing the next one.
        actions.scroll_by_amount(0, pixels_per_step).perform()
        time.sleep(pause)

def login(driver, email, password):
    """Type the credentials into the login form and submit it with ENTER.

    The form fields are located through their placeholder text, so the page
    shown by `driver` must already display the login form.

    Args:
        driver: Selenium WebDriver currently showing the login form.
        email: Text typed into the e-mail / pixiv ID field.
        password: Text typed into the password field.
    """
    email_selector = "input[placeholder='E-mail address or pixiv ID']"
    password_selector = "input[placeholder='Password']"

    driver.find_element(By.CSS_SELECTOR, email_selector).send_keys(email)
    time.sleep(1)

    password_box = driver.find_element(By.CSS_SELECTOR, password_selector)
    password_box.send_keys(password)
    time.sleep(1)

    # Pressing ENTER inside the password field submits the form.
    password_box.send_keys(Keys.ENTER)

def get_artwork_ids(driver):
    """Collect the `data-gtm-value` of every artwork link on the current page.

    Looks at all `<a>` elements whose href contains '/en/artworks/' and keeps
    the `data-gtm-value` attribute of those that have one (presumably the
    artwork ID). Values are returned in the order the links are found;
    repeated values are not removed.

    Args:
        driver: Selenium WebDriver showing the page to inspect.

    Returns:
        list: The attribute values that were not None.
    """
    artwork_links = driver.find_elements(By.XPATH, "//a[contains(@href, '/en/artworks/')]")

    # Read the attribute once per link, then drop the links that lack it.
    gtm_values = (link.get_attribute("data-gtm-value") for link in artwork_links)
    return [value for value in gtm_values if value is not None]

def read_account_info(path):
    """Read `key = value` pairs from a text file into a dict.

    Every non-blank line must contain an `=`. The text before the first `=`
    is the key and everything after it is the value, so values may themselves
    contain `=` (e.g. a password). Keys and values are stripped of
    surrounding whitespace. If a key appears more than once, the last
    occurrence wins.

    Args:
        path: Path of the file to read.

    Returns:
        dict: Mapping of key to value, both strings.

    Raises:
        ValueError: If a non-blank line contains no `=`.
    """
    info = {}

    with open(path) as file:
        for line_number, line in enumerate(file, start=1):
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at the end.
                continue

            # Split on the FIRST "=" only so the value is kept intact.
            key, separator, value = line.partition("=")
            if not separator:
                # The line content is deliberately left out of the message:
                # this file holds credentials.
                raise ValueError(
                    f"{path}: line {line_number} is not of the form 'key = value'"
                )

            info[key.strip()] = value.strip()

    return info

# Crawl

In [None]:
# Load the login credentials and the profile nickname from the local
# key = value file; all three keys must be present in it.
account_info = read_account_info("account.txt")
email, password, nickname = (
    account_info[key] for key in ("email", "password", "nickname")
)

In [None]:
url = ""  # address of the page whose artworks should be collected

# Fail fast on the unfilled placeholder instead of launching a browser first.
if not url:
    raise ValueError("Set `url` to the page to crawl before running this cell.")

# set up Chrome WebDriver
# headless mode is inconsistent
driver = uc.Chrome(use_subprocess=False)

try:
    driver.maximize_window()

    driver.get(url)

    # LOGIN
    # find and click the login link
    time.sleep(1)
    login_link = driver.find_element(By.XPATH, "//a[contains(@href, '/login.php')]")
    login_link.click()
    time.sleep(1)

    # "Success" only means the form was submitted; the wait for the profile
    # avatar further down is what would time out if the session is not
    # logged in.
    print("Logging in...", end="")
    login(driver, email, password)
    print("Success")

    # reload the original page since login redirects us to the first page
    time.sleep(2)  # brief pause before proceeding
    driver.get(url)

    # RETRIEVE ALL ARTWORKS IDs ON THIS PAGE
    # wait (up to 7 seconds) until the profile avatar is present
    # NOTE: a nickname containing a single quote would break this XPath literal.
    WebDriverWait(driver, 7).until(
        EC.presence_of_element_located((By.XPATH, f"//div[@title='{nickname}']"))
    )
    print("Finding all artworks IDs...")

    # scroll down in 10 steps so that the artworks' elements get loaded;
    # raise n_times if the page is longer than that
    scroll_down(driver, n_times=10)  # more human-like than using JS

    # extract artwork IDs from the loaded page
    artwork_ids = get_artwork_ids(driver)

    print(f"Found {len(artwork_ids)} artworks:")
    print(artwork_ids)
finally:
    # Always close the browser, even when one of the steps above raises;
    # otherwise every failed run leaves a Chrome instance behind.
    driver.quit()

In [None]:
# Optional: uncomment and fill in to download a hand-picked list of IDs
# instead of the `artwork_ids` produced by the crawl cell.
# artwork_ids = 
n_threads = 8  # number of worker threads to split the artwork IDs across
folder_path = ""  # passed to download_imgs_from_artwork_id as its second argument (presumably the output folder)

# Shared by all worker threads: the failure entries returned by the
# downloader, and the lock held while appending to that list.
failed_imgs = []
failed_imgs_lock = threading.Lock()

def download_with_thread(artwork_ids_part):
    """Download every artwork in `artwork_ids_part` and collect the failures.

    Intended as the target of one worker thread. Each ID is handed to
    `download_imgs_from_artwork_id` together with the module-level
    `folder_path`; whatever failures it returns are appended to the shared
    `failed_imgs` list while holding `failed_imgs_lock`.

    An exception raised for one artwork is printed and the loop moves on to
    the next ID, so a single bad artwork cannot end the thread and silently
    skip the rest of its chunk.

    Args:
        artwork_ids_part: Iterable of artwork IDs this worker is responsible for.
    """
    for artwork_id in artwork_ids_part:
        try:
            _failed_imgs = download_imgs_from_artwork_id(artwork_id, folder_path)
        except Exception as error:
            # Report and keep going; the remaining IDs still need downloading.
            print(f"Artwork {artwork_id} could not be downloaded: {error!r}")
            continue

        if _failed_imgs:
            # The list is shared between threads, so extend it under the lock.
            with failed_imgs_lock:
                failed_imgs.extend(_failed_imgs)

# A non-positive thread count would create no chunks at all, and the cell
# would print "DONE" without downloading anything.
if n_threads < 1:
    raise ValueError(f"n_threads must be at least 1, got {n_threads}")

# Drop repeated IDs (keeping first-seen order) so that no artwork is
# downloaded twice, possibly by two threads at the same time.
unique_artwork_ids = list(dict.fromkeys(artwork_ids))

# Deal the IDs round-robin into n_threads chunks: chunk i receives the
# elements at positions i, i + n_threads, i + 2 * n_threads, ...
artwork_chunks = [unique_artwork_ids[i::n_threads] for i in range(n_threads)]

# create one thread per non-empty chunk
threads = []
for chunk in artwork_chunks:
    if not chunk:
        continue  # fewer IDs than threads: nothing for this worker to do
    thread = threading.Thread(target=download_with_thread, args=(chunk,))
    threads.append(thread)
    thread.start()

# wait for all threads to complete
for thread in threads:
    thread.join()

# report any failures
if failed_imgs:
    # Every entry is expected to start with an integer followed by "_"
    # (presumably the artwork ID); only that integer is kept, de-duplicated.
    # `failed_imgs` is deliberately rebound to the resulting set, as before.
    failed_imgs = {int(failed_img.split("_")[0]) for failed_img in failed_imgs}
    print(f"Failed images: {failed_imgs}")

print("DONE")