In [25]:
!pip install selenium webdriver-manager requests




In [30]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

def _headless_chrome_options():
    """Build the Chrome ``Options`` used by this notebook's browser session."""
    opts = Options()
    for switch in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
    ):
        opts.add_argument(switch)
    return opts


chrome_options = _headless_chrome_options()

# webdriver-manager fetches a chromedriver build and install() hands back its
# path, which is what the Service wrapper needs.
driver = webdriver.Chrome(
    options=chrome_options,
    service=Service(ChromeDriverManager().install()),
)


In [31]:
import os
import time
import requests
from selenium.webdriver.common.by import By

def scrape_getty_images(query, target_count=150, pages=5, delay=2, download_folder="group_fight_images", timeout=30):
    """Save images from Getty Images search-result pages for ``query``.

    Each result page is loaded in the notebook-level Selenium ``driver``. Every
    ``<img>`` whose ``src`` contains ``https://media.gettyimages.com/`` is
    fetched with ``requests`` and written to ``download_folder`` as
    ``<query with underscores>_<n>.jpg``.

    Args:
        query: Search phrase; spaces become ``+`` in the URL and ``_`` in file names.
        target_count: Stop as soon as this many images have been saved.
        pages: Number of result pages to visit, starting at page 1.
        delay: Seconds to sleep after each page load before reading the DOM.
        download_folder: Directory for the saved files; created if missing.
        timeout: Seconds to wait on each image request before giving up on it.

    Returns:
        list[str]: URLs of the images that were actually written to disk, in
        download order. URLs that failed or returned a non-200 status are
        not included.
    """
    if target_count <= 0:
        # Nothing was asked for; do not touch the browser or the file system.
        return []

    os.makedirs(download_folder, exist_ok=True)
    query_url = query.replace(" ", "+")
    file_stem = query.replace(" ", "_")

    seen_urls = set()      # every candidate already attempted, successful or not
    downloaded_urls = []   # only URLs whose bytes reached disk

    for page in range(1, pages + 1):
        url = f"https://www.gettyimages.com/photos/{query_url}?page={page}&sort=best"
        driver.get(url)
        time.sleep(delay)

        # Read all src attributes up front so the element handles are not
        # still being queried while the (slow) downloads are running.
        sources = [img.get_attribute("src") for img in driver.find_elements(By.TAG_NAME, "img")]

        for src in sources:
            if not src or "https://media.gettyimages.com/" not in src or src in seen_urls:
                continue
            seen_urls.add(src)

            try:
                # A timeout keeps one stalled connection from hanging the whole cell.
                response = requests.get(src, timeout=timeout)
                if response.status_code != 200:
                    print(f"Skipped {src}: HTTP {response.status_code}")
                    continue
                file_path = os.path.join(download_folder, f"{file_stem}_{len(downloaded_urls) + 1}.jpg")
                with open(file_path, "wb") as f:
                    f.write(response.content)
            except (requests.RequestException, OSError) as e:
                print(f"Failed to download {src}: {e}")
                continue

            # Count the image only after the file has been written, so the
            # numbering has no gaps and the returned list matches the folder.
            downloaded_urls.append(src)
            print(f"Downloaded {file_path}")

            if len(downloaded_urls) >= target_count:
                print(f"Reached target of {target_count} images")
                return downloaded_urls

    print(f"Total images downloaded: {len(downloaded_urls)}")
    return downloaded_urls


In [32]:
# Run the scraper for the "group fight" search; `results` holds the list of
# image URLs the function returns.
results = scrape_getty_images(
    query="group fight",
    target_count=150,
    pages=10,
    delay=3,
    download_folder="group_fight_images",
)
print(f"Total images downloaded: {len(results)}")


Downloaded group_fight_images/group_fight_1.jpg
Downloaded group_fight_images/group_fight_2.jpg
Downloaded group_fight_images/group_fight_3.jpg
Downloaded group_fight_images/group_fight_4.jpg
Downloaded group_fight_images/group_fight_5.jpg
Downloaded group_fight_images/group_fight_6.jpg
Downloaded group_fight_images/group_fight_7.jpg
Downloaded group_fight_images/group_fight_8.jpg
Downloaded group_fight_images/group_fight_9.jpg
Downloaded group_fight_images/group_fight_10.jpg
Downloaded group_fight_images/group_fight_11.jpg
Downloaded group_fight_images/group_fight_12.jpg
Downloaded group_fight_images/group_fight_13.jpg
Downloaded group_fight_images/group_fight_14.jpg
Downloaded group_fight_images/group_fight_15.jpg
Downloaded group_fight_images/group_fight_16.jpg
Downloaded group_fight_images/group_fight_17.jpg
Downloaded group_fight_images/group_fight_18.jpg
Downloaded group_fight_images/group_fight_19.jpg
Downloaded group_fight_images/group_fight_20.jpg
Downloaded group_fight_images

In [33]:
from google.colab import files
import shutil

# Zip the image folder. make_archive returns the path of the archive it wrote,
# so reuse that instead of typing the file name a second time by hand.
archive_path = shutil.make_archive("group_fight_images", "zip", "group_fight_images")

# Hand the zip to the browser via Colab's download helper
files.download(archive_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>