In [None]:
import functools
import os
import time
from urllib.parse import urljoin, urlsplit

import cv2 as cv
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

def download_image(url, timeout=10):
    """Fetch the raw bytes of the resource at *url*.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the caller
            indefinitely.

    Returns:
        The response body as ``bytes`` when the server answers with HTTP 200,
        otherwise ``None``. Request failures are reported on stdout and also
        yield ``None`` rather than raising.
    """
    try:
        # The context manager releases the connection even when the body of a
        # non-200 streamed response is never read.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code == 200:
                return response.content
            print(f"Failed to download {url}")
            return None
    except requests.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return None

# Extensions handed to cv.imwrite unchanged; any other name gets ".jpg" appended.
_SAVE_EXTENSIONS = frozenset(
    {".jpg", ".jpeg", ".png", ".bmp", ".webp", ".tif", ".tiff"}
)


@functools.lru_cache(maxsize=1)
def _load_face_classifier():
    """Build the frontal-face Haar cascade once and reuse it on later calls."""
    return cv.CascadeClassifier(
        cv.data.haarcascades + 'haarcascade_frontalface_default.xml'
    )


def detect_and_save_first_face(image_data, output_folder, file_name):
    """Crop the first detected face out of an encoded image and save it.

    Args:
        image_data: Encoded image bytes (as returned by a download).
        output_folder: Directory that receives the crop; created if missing.
        file_name: Name of the output file inside *output_folder*. An empty
            name becomes ``face`` and a name without a recognised image
            extension gets ``.jpg`` appended, because ``cv.imwrite`` picks
            its encoder from the extension.

    Returns:
        ``True`` when a face was found and the crop was written, ``False``
        when the image could not be decoded, no face was detected, or the
        file could not be written.
    """
    image_array = np.frombuffer(image_data, np.uint8)
    img = cv.imdecode(image_array, cv.IMREAD_COLOR)

    if img is None:
        print(f"Error: Could not process image {file_name}.")
        return False

    classifier = _load_face_classifier()
    faces = classifier.detectMultiScale(
        img, scaleFactor=1.1, minNeighbors=4, minSize=(30, 30)
    )

    if len(faces) == 0:
        print(f"No faces detected in {file_name}.")
        return False

    os.makedirs(output_folder, exist_ok=True)

    # Pad the detection box (25 px before, 20 px after) and clamp the padded
    # box to the image bounds.
    (x, y, w, h) = faces[0]
    face = img[
        max(0, y - 25):min(y + h + 20, img.shape[0]),
        max(0, x - 25):min(x + w + 20, img.shape[1]),
    ]

    if not file_name:
        file_name = "face"
    if os.path.splitext(file_name)[1].lower() not in _SAVE_EXTENSIONS:
        file_name += ".jpg"

    save_path = os.path.join(output_folder, file_name)
    try:
        written = cv.imwrite(save_path, face)
    except cv.error as e:
        print(f"Error: Could not write {save_path}: {e}")
        return False

    if not written:
        print(f"Error: Could not write {save_path}.")
        return False

    print(f"{save_path} is exported with detected face.")
    return True

def scrape_and_save_images_with_faces(url, output_folder, num_pages=5):
    """Walk the paginated listing at *url* and save one face crop per image.

    For every page from 1 to *num_pages* the page is rendered in headless
    Chrome, each ``<img>`` source is downloaded, and the first detected face
    (if any) is written to *output_folder* under the image's own file name.

    Args:
        url: Base address of the listing; the page number is added as a
            ``page`` query parameter.
        output_folder: Directory that receives the face crops.
        num_pages: Number of listing pages to visit, starting at page 1.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()),
        options=chrome_options,
    )

    # Append to an existing query string instead of starting a second one.
    separator = '&' if '?' in url else '?'

    try:
        for page in range(1, num_pages + 1):
            page_url = f"{url}{separator}page={page}"
            driver.get(page_url)

            # Fixed pause after navigation before reading the rendered page.
            time.sleep(1)

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            for img in soup.find_all('img'):
                img_url = img.get('src')
                if not img_url:
                    continue

                img_url = urljoin(url, img_url)

                # Inline data: URIs and other non-HTTP sources cannot be
                # fetched by the downloader, so skip them quietly.
                if not img_url.startswith(("http://", "https://")):
                    continue

                # Take the name from the URL path only, so a "/" inside the
                # query string cannot leak into the file name.
                img_name = os.path.basename(urlsplit(img_url).path)

                image_data = download_image(img_url)
                if image_data is None:
                    continue

                detect_and_save_first_face(image_data, output_folder, img_name)

            print(f"Completed processing page {page}")
            print('-*-'*20)
            print('\n')
            print('-*-'*20)
    finally:
        # Always shut the browser down, even when a page or image fails.
        driver.quit()

def main():
    """Scrape the hard-coded listing page and store the detected faces.

    The output folder is named after the last path segment of the URL,
    prefixed with ``Extracted_faces`` and a space.
    """
    num_pages_to_scrape = 30
    webpage_url = 'https://www.gettyimages.in/photos/akshay-kumar-photo'

    # Last path segment of the URL identifies the search subject.
    subject = webpage_url.rsplit('/', 1)[-1]
    output_folder = ' '.join(('Extracted_faces', subject))

    scrape_and_save_images_with_faces(
        webpage_url,
        output_folder,
        num_pages=num_pages_to_scrape,
    )


# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()


Failed to download https://www.gettyimages.in/components/global-nav/static/static/GettyHeaderLogo-4c344fa4f9e47c257bea.svg
Failed to download https://www.gettyimages.in/components/global-nav/static/static/UnsplashForBrands-00c7af5aed68b4b7f3f3.svg
