In [4]:
import os
import time
import urllib.request
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
import pickle

# Constants
# Start page passed to fetch_images() by main(); placeholder value to replace.
GETTY_URL = "<URL to scrape from>"
# Local staging directory: images are saved here and removed after upload.
DOWNLOAD_DIR = "downloaded_images"
MAX_IMAGES = 5000  # Scraping stops once this many images were downloaded
BATCH_SIZE = 1000  # DOWNLOAD_DIR is uploaded and emptied every BATCH_SIZE downloads
# Name of the Drive folder that is looked up (or created) to hold the uploads.
DRIVE_FOLDER_NAME = "blonde_woman"

# Google Drive API setup
# OAuth scopes requested when running the installed-app authorisation flow.
SCOPES = ['https://www.googleapis.com/auth/drive.file']
# Client secrets file read by InstalledAppFlow.from_client_secrets_file().
CLIENT_SECRET_FILE = 'credentials.json'
# Pickle file in which the obtained credentials are cached between runs.
CREDENTIALS_PICKLE = 'token.pickle'

def authenticate_google_drive():
    """Return a Google Drive v3 client, reusing cached credentials if possible.

    Credentials are read from CREDENTIALS_PICKLE when that file exists. When
    they are missing or not valid they are either refreshed (expired and a
    refresh token is present) or obtained through the installed-app OAuth
    flow on a local server; the resulting credentials are then written back
    to CREDENTIALS_PICKLE.

    Returns:
        The client object produced by ``build('drive', 'v3', ...)``.
    """
    credentials = None
    if os.path.exists(CREDENTIALS_PICKLE):
        # NOTE(review): unpickling runs arbitrary code from the file; this is
        # only safe as long as the token file is written solely by this script.
        with open(CREDENTIALS_PICKLE, 'rb') as token_file:
            credentials = pickle.load(token_file)

    needs_authorisation = not credentials or not credentials.valid
    if needs_authorisation:
        refreshable = (
            credentials and credentials.expired and credentials.refresh_token
        )
        if refreshable:
            credentials.refresh(Request())
        else:
            credentials = InstalledAppFlow.from_client_secrets_file(
                CLIENT_SECRET_FILE, SCOPES
            ).run_local_server(port=0)
        # Persist the fresh credentials so the next run can skip the flow.
        with open(CREDENTIALS_PICKLE, 'wb') as token_file:
            pickle.dump(credentials, token_file)

    return build('drive', 'v3', credentials=credentials)

def get_or_create_folder(service, folder_name):
    """Return the ID of the Drive folder named *folder_name*, creating it if absent.

    Args:
        service: Drive API client exposing ``files().list(...)`` and
            ``files().create(...)``, each returning an object with ``execute()``.
        folder_name: Exact folder name to look up. It may contain single
            quotes or backslashes; both are escaped before being placed in
            the search query.

    Returns:
        The ID of the first matching non-trashed folder, or the ID of the
        newly created folder when no match exists.
    """
    # The query language delimits string values with single quotes, so a raw
    # quote or backslash in the name would corrupt the query. Backslashes are
    # escaped first so the escapes added for quotes are not doubled.
    escaped_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")
    query = (
        f"name='{escaped_name}' "
        "and mimeType='application/vnd.google-apps.folder' "
        "and trashed=false"
    )
    results = service.files().list(
        q=query, spaces='drive', fields='files(id, name)'
    ).execute()
    items = results.get('files', [])
    if items:
        return items[0]['id']

    # No existing folder: create one using the original, unescaped name.
    file_metadata = {
        'name': folder_name,
        'mimeType': 'application/vnd.google-apps.folder',
    }
    folder = service.files().create(body=file_metadata, fields='id').execute()
    return folder.get('id')

def upload_to_drive(service, file_path, folder_id):
    """Upload one local file into a Drive folder and return its Drive file ID.

    Args:
        service: Drive API client exposing ``files().create(...).execute()``.
        file_path: Path of the local file; its base name is used as the
            name of the uploaded file.
        folder_id: ID of the Drive folder set as the uploaded file's parent.

    Returns:
        The ``'id'`` field of the create response.
    """
    # Every file is declared as JPEG, matching the ".jpg" names used locally.
    metadata = {
        'name': os.path.basename(file_path),
        'parents': [folder_id],
    }
    payload = MediaFileUpload(file_path, mimetype='image/jpeg')
    create_request = service.files().create(
        body=metadata, media_body=payload, fields='id'
    )
    created = create_request.execute()
    drive_id = created.get('id')
    print(f"Uploaded {file_path} to Google Drive with file ID: {drive_id}")
    return drive_id

def download_image(src, idx):
    """Download *src* into DOWNLOAD_DIR as ``image_<idx+1>.jpg``.

    Failures are reported on stdout rather than raised, so one bad image
    does not stop the scrape. A file left behind by a failed transfer is
    removed so that a truncated image is never picked up by a later upload
    of the directory.

    Args:
        src: URL of the image to fetch.
        idx: Zero-based index of the image; the file name uses ``idx + 1``.

    Returns:
        The path of the saved file, or ``None`` if the download failed.
    """
    filename = os.path.join(DOWNLOAD_DIR, f"image_{idx+1}.jpg")
    try:
        urllib.request.urlretrieve(src, filename)
    except Exception as e:
        print(f"Error downloading image {idx+1}: {e}")
        # An interrupted transfer can leave a partial file; drop it.
        try:
            os.remove(filename)
        except OSError:
            # Nothing was written (or it is already gone): nothing to clean.
            pass
        return None
    print(f"Downloaded {filename}")
    return filename

def _upload_and_clear_downloads(service, folder_id):
    """Upload every file in DOWNLOAD_DIR to Drive, deleting each one afterwards."""
    for name in os.listdir(DOWNLOAD_DIR):
        file_path = os.path.join(DOWNLOAD_DIR, name)
        upload_to_drive(service, file_path, folder_id)
        os.remove(file_path)  # Delete the local file after uploading


def fetch_images(url, max_images, batch_size):
    """Scrape images page by page and upload them to Google Drive in batches.

    Opens *url* in Chrome, downloads the ``data-src``/``src`` of every
    ``<img>`` element found, and follows the "next" pagination button until
    *max_images* images were downloaded or no next button is found. Each
    time the number of downloaded images reaches a multiple of *batch_size*
    the download directory is uploaded to the DRIVE_FOLDER_NAME folder and
    emptied; whatever remains is uploaded at the end.

    The browser is always closed, even when scraping or uploading raises.

    Args:
        url: Address of the first result page.
        max_images: Maximum number of images to download.
        batch_size: Number of downloaded images between two uploads.
    """
    # Set up Selenium options
    options = Options()
    # NOTE(review): recent Selenium releases may ignore this attribute;
    # confirm against the installed version before relying on it.
    options.headless = False  # Set to True if you want to run in headless mode
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    # Initialize WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get(url)
        time.sleep(5)  # Wait for the page to load

        # Ensure download directory exists
        os.makedirs(DOWNLOAD_DIR, exist_ok=True)

        image_count = 0
        page_number = 1
        drive_service = authenticate_google_drive()
        folder_id = get_or_create_folder(drive_service, DRIVE_FOLDER_NAME)

        # NOTE(review): if a page keeps offering a "next" button without
        # yielding new images this loop will not terminate on its own.
        while image_count < max_images:
            print(f"Scraping page {page_number}...")
            # Scroll to the bottom so lazily loaded images get a chance to load
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)

            # Find image elements
            images = driver.find_elements(By.CSS_SELECTOR, 'img')
            print(f"Found {len(images)} images on page {page_number}")

            for image in images:
                if image_count >= max_images:
                    break
                # Best effort: a failure on one image (or on a batch upload)
                # is reported and the scrape continues with the next image.
                try:
                    src = image.get_attribute('data-src') or image.get_attribute('src')
                    if not src:
                        continue
                    if not download_image(src, image_count):
                        continue
                    image_count += 1

                    if image_count % batch_size == 0:
                        print(f"Uploading batch of {batch_size} images to Google Drive...")
                        _upload_and_clear_downloads(drive_service, folder_id)
                except Exception as e:
                    print(f"Error processing image {image_count}: {e}")

            if image_count < max_images:
                try:
                    next_button = driver.find_element(By.CSS_SELECTOR, '[data-testid="pagination-button-next"]')
                    driver.execute_script("arguments[0].click();", next_button)  # Use JavaScript to click the button
                    page_number += 1
                    time.sleep(5)  # Wait for the next page to load
                except NoSuchElementException as e:
                    print("No more pages or error navigating to next page:", e)
                    break

        # Final batch upload
        print("Uploading final batch of images to Google Drive...")
        _upload_and_clear_downloads(drive_service, folder_id)
    finally:
        # Release the browser process no matter how the scrape ended.
        driver.quit()

def main():
    """Run the scrape-and-upload job using the module-level settings."""
    fetch_images(
        url=GETTY_URL,
        max_images=MAX_IMAGES,
        batch_size=BATCH_SIZE,
    )

# Start the job only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


Scraping page 1...
Found 62 images on page 1
Downloaded downloaded_images/image_1.jpg
Downloaded downloaded_images/image_2.jpg
Downloaded downloaded_images/image_3.jpg
Downloaded downloaded_images/image_4.jpg
Downloaded downloaded_images/image_5.jpg
Downloaded downloaded_images/image_6.jpg
Downloaded downloaded_images/image_7.jpg
Downloaded downloaded_images/image_8.jpg
Downloaded downloaded_images/image_9.jpg
Downloaded downloaded_images/image_10.jpg
Downloaded downloaded_images/image_11.jpg
Downloaded downloaded_images/image_12.jpg
Downloaded downloaded_images/image_13.jpg
Downloaded downloaded_images/image_14.jpg
Downloaded downloaded_images/image_15.jpg
Downloaded downloaded_images/image_16.jpg
Downloaded downloaded_images/image_17.jpg
Downloaded downloaded_images/image_18.jpg
Downloaded downloaded_images/image_19.jpg
Downloaded downloaded_images/image_20.jpg
Downloaded downloaded_images/image_21.jpg
Downloaded downloaded_images/image_22.jpg
Downloaded downloaded_images/image_23.jp