In [None]:
import os
import time
import subprocess
import random
import requests
import pyautogui

import undetected_chromedriver as uc

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException

# HTTP headers sent with every image request made by download_img.
# NOTE(review): presumably the image host rejects requests that lack a
# pixiv.net Referer — confirm before changing or removing it.
HEADERS = {
    "Referer": "https://www.pixiv.net/",
    "User-Agent": "Mozilla/5.0"
}

# Helper functions

In [None]:
def scroll_down(driver, n_times, scroll_px=5000, pause=0.3):
    """
    Scrolls down the current page using the Selenium WebDriver.

    Parameters:
        driver : selenium.webdriver
            Selenium WebDriver instance currently viewing the page.
        n_times : int
            The number of scroll steps to perform.
        scroll_px : int, default=5000
            Vertical distance, in pixels, covered by each scroll step.
        pause : float, default=0.3
            Seconds to wait after each scroll step before the next one.
    """
    actions = ActionChains(driver)

    for _ in range(n_times):
        # The step is deliberately large so that each scroll effectively
        # reaches the end of the page as it is currently loaded.
        actions.scroll_by_amount(0, scroll_px).perform()
        time.sleep(pause)

def login(driver, email, password):
    """
    Fills in and submits the Pixiv login form.

    Parameters:
        driver : selenium.webdriver
            Selenium WebDriver instance currently on the Pixiv login page.
        email : string
            The email address or Pixiv ID typed into the first field.
        password : string
            The password typed into the second field.
    """
    # (CSS selector of the input, text to type), in the order they are filled in
    form_fields = (
        ("input[placeholder='E-mail address or pixiv ID']", email),
        ("input[placeholder='Password']", password),
    )

    for selector, text in form_fields:
        field = driver.find_element(By.CSS_SELECTOR, selector)
        field.send_keys(text)
        time.sleep(1)

    # `field` is now the password input; pressing ENTER there submits the form
    field.send_keys(Keys.ENTER)

def read_account_info(path):
    """
    Reads account login information from a file.

    Parameters:
        path : string
            The file path to a text file containing login information in key=value format.

    Returns:
        dict
            A dictionary where each key is the text before the first "=" on a line
            and the value is everything after it, both stripped of surrounding
            whitespace. Lines that contain no "=" (e.g. blank lines) are ignored.
    """
    info = {}

    with open(path) as file:
        for line in file:
            # Split on the FIRST "=" only, so values that themselves contain "="
            # (a password, for instance) are kept intact.
            key, separator, value = line.partition("=")

            if not separator:
                # blank line or a line without "=": nothing to record
                continue

            info[key.strip()] = value.strip()

    return info

In [None]:
def get_artwork_ids(driver):
    """
    Extracts artwork IDs from the current Pixiv page using the Selenium WebDriver.

    Parameters:
        driver : selenium.webdriver
            Selenium WebDriver instance currently pointing to a Pixiv page with artworks.

    Returns:
        list (int)
            The artwork IDs found on the page, in document order.
    """
    ids = []

    artworks = driver.find_elements(By.XPATH, "//a[contains(@href, '/en/artworks/')]")

    for artwork in artworks:
        # Only anchors carrying a "data-gtm-value" attribute contribute an ID;
        # other links to an artwork page are skipped.
        raw_id = artwork.get_attribute("data-gtm-value")
        if raw_id is not None:
            ids.append(int(raw_id))

    return ids
    
def get_img_urls_from_artwork_id(driver, artwork_id, has_gui=False):
    """
    Retrieves all image URLs from a Pixiv artwork page given its artwork ID.

    Parameters:
        driver : selenium.webdriver
            Selenium WebDriver instance used to navigate and scrape the page.
        artwork_id : int or str
            The ID of the artwork to retrieve image URLs from.
        has_gui : bool, default=False
            Not implemented (currently unused), indicates if the driver is running with a GUI.

    Returns:
        tuple (bool, list)
            Bool indicates images found (True) or animated/ugoira (False), and list contains the URLs of the images.
    """
    # NOTE(review): the CSS class names below look build-generated and will
    # presumably stop matching when the site's front end changes — verify.
    img_class = "sc-e1dc2ae6-1 fUQgzA"
    multi_page_marker_xpath = '//div[@class="sc-9222a8f6-2 kufPoS"]'
    page_counter_xpath = '//div[@class="sc-b5e6ab10-0 krtyqW"]'

    artwork_url = f"https://www.pixiv.net/en/artworks/{artwork_id}"
    driver.get(artwork_url)

    # give the page time to render before querying it
    time.sleep(2)

    # A <canvas> on the page marks an animated/ugoira work; we don't download this
    if driver.find_elements(By.TAG_NAME, "canvas"):
        return False, []

    # The marker div is only present on artworks with more than one page
    has_multiple_pages = bool(driver.find_elements(By.XPATH, multi_page_marker_xpath))

    # find number of pages
    if has_multiple_pages:
        n_pages_div = driver.find_elements(By.XPATH, page_counter_xpath)[0]
        span = n_pages_div.find_element(By.TAG_NAME, 'span')
        # the counter text is split on "/" and its last part is the page total
        n_pages = int(span.text.strip().split("/")[-1])
    else:
        n_pages = 1

    # Everything before the first "_" in the first image's src is shared by
    # all pages; the per-page suffix is appended below.
    img_element = driver.find_element(By.XPATH, f'//img[(@class="{img_class}")]')
    base_img_url = img_element.get_attribute("src").split("_")[0]

    img_urls = [f"{base_img_url}_p{page}_master1200.jpg" for page in range(n_pages)]

    return True, img_urls

def get_img_and_pg_num(img_url):
    """
    Parses a Pixiv image URL into its artwork ID and page number.

    Parameters:
        img_url : string
            The direct URL of the Pixiv image.

    Returns:
        tuple (int, int)
            The artwork ID and the page number, both as integers.
    """
    # Expected URL layout:
    # https://i.pximg.net/img-master/img/{yyyy}/{mm}/{dd}/{hh}/{mm}/{ss}/{artwork_id}_p{page_number}_master1200.jpg
    # -> last path segment, split on "_": ["{artwork_id}", "p{page_number}", ...]
    name_parts = img_url.rsplit("/", 1)[-1].split("_")

    # drop the leading "p" of the second part before converting it
    return int(name_parts[0]), int(name_parts[1][1:])

def download_img(save_path, url, timeout=10):
    """
    Fetches an image over HTTP and writes it to disk.

    Parameters:
        save_path : string
            The file path where the image will be saved.
        url : string
            The URL of the image.
        timeout : int, default=10
            The maximum seconds to wait for a response.

    Returns:
        bool
            True if the image was fetched (HTTP 200) and written, False if the
            request failed or returned any other status code.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.exceptions.RequestException:
        # connection problems, timeouts, etc. count as a failed download
        return False

    if response.status_code != 200:
        return False

    with open(save_path, "wb") as out_file:
        out_file.write(response.content)

    return True

# Crawl

In [None]:
# Folder that receives the downloaded images
save_path = "C:/Users/PC MY TU/Desktop/temp_artworks"

# Login details are read from a key=value text file next to the notebook
account_info = read_account_info("account.txt")
email, password, nickname = (
    account_info[field] for field in ("email", "password", "nickname")
)

### Collect image URLs

In [None]:
# Numeric Pixiv user ID of the artist whose artworks will be crawled.
# Replace None with the ID before running the cells below.
artist_id = None

if artist_id is None:
    raise ValueError("artist_id is not set: assign the numeric Pixiv user ID of the artist to crawl.")

In [None]:
current_page = 1

# construct artist page; the literal "current_page" stays in the URL as a
# placeholder that is swapped for the page number on every iteration
artist_page = "https://www.pixiv.net/en/users/artist_id/artworks/?p=current_page"
artist_page = artist_page.replace("artist_id", str(artist_id))
current_artist_page = artist_page.replace("current_page", str(current_page))

artwork_ids = []
animated_artwork_ids = []
img_urls = []

# set up Chrome WebDriver
driver = uc.Chrome(use_subprocess=False)

# try/finally guarantees the browser is closed even if a step below raises
try:
    driver.maximize_window()

    login_page = "https://accounts.pixiv.net/login"
    driver.get(login_page)

    # log in
    print("Log in ... START")
    login(driver, email, password)
    print("Log in ... DONE")

    time.sleep(2)

    # find all artwork's ids
    print("Find all artwork's ids ... START")
    has_next_page = True

    while has_next_page:
        print(f"Page: {current_page}")

        driver.get(current_artist_page)
        time.sleep(1)
        # scroll so that lazily loaded artworks are present before scraping
        scroll_down(driver, n_times=10)

        current_page_artwork_ids = get_artwork_ids(driver)

        print(f"Found {len(current_page_artwork_ids)} artworks: {current_page_artwork_ids}")

        artwork_ids.extend(current_page_artwork_ids)

        # check if next page exists
        # find previous and next button; the last match is the "next" button
        btns = driver.find_elements(By.XPATH, '//a[contains(@class, "sc-ddbdb82a-2 jnCvtc sc-ddbdb82a-1")]')
        # A next page is available only when the "next" button has no hidden
        # attribute. If no pagination buttons are found at all, treat this as
        # the last page instead of failing with an IndexError.
        has_next_page = bool(btns) and btns[-1].get_attribute("hidden") is None

        current_page += 1
        current_artist_page = artist_page.replace("current_page", str(current_page))

        time.sleep(1)

    print("Find all artwork's ids ... DONE\nCollecting image URLs")

    # collect image urls; animated works are only recorded, not downloaded
    for artwork_id in artwork_ids:
        is_image, artwork_img_urls = get_img_urls_from_artwork_id(driver, artwork_id)

        if is_image:
            img_urls.extend(artwork_img_urls)
        else:
            animated_artwork_ids.append(artwork_id)
finally:
    driver.quit()

# print crawled information
print(f"Found a total of {len(artwork_ids)} artworks, which includes:")

print(f"\t{len(animated_artwork_ids)} animated/ugoira artworks:")
# list the animated artwork IDs a few per row to keep the output readable
n_ids_per_row = 5
for row_start in range(0, len(animated_artwork_ids), n_ids_per_row):
    row = animated_artwork_ids[row_start:row_start + n_ids_per_row]
    row_str = ", ".join(str(_id) for _id in row)
    print(f"\t\t{row_str}")

print(f"\t{len(img_urls)} image URLs:")
for url in img_urls:
    print("\t", "\t", url)

### Download

In [None]:
# URLs whose download did not succeed, reported at the end
failed_download_urls = []

os.makedirs(save_path, exist_ok=True)

total = len(img_urls)

for position, url in enumerate(img_urls, start=1):
    # name the file after the artwork ID and page number encoded in the URL
    artwork_id, page_number = get_img_and_pg_num(url)
    file_name = f"{artwork_id}_p{page_number}.jpg"
    file_path = os.path.join(save_path, file_name)

    print(f"[{position}/{total}], downloading {file_name}\t\t", end="")

    if download_img(file_path, url, timeout=10):
        print("ok")
    else:
        print("FAILED")
        failed_download_urls.append(url)

    # randomly pause (or not) between requests
    time.sleep(random.choice([0, 0.5]))

print("DOWNLOADING DONE")

if failed_download_urls:
    print("\nFAILED:\n")
    print("\n".join(failed_download_urls))