In [2]:
import argparse
import hashlib
import imghdr
from os import makedirs
from os.path import exists, join, splitext
import pickle
import posixpath
import re
import signal
import socket
import threading
import time
from urllib.parse import quote, quote_plus, urlsplit, urlunsplit
from urllib.request import Request, urlopen
from io import BytesIO
import random
import sys
import pickle
import uuid

### Config

In [3]:
socket.setdefaulttimeout(2)

To manage concurrent processes, we need to synchronize these variables in all functions, so we define them as global variables.


In [4]:
output_dir = './Crawled_Imgaes'  # default output dir
seen_links = []
md5_of_images = {}
in_progress_threads = 0

The key is 'User-Agent' and its value is a string that
    represents the user agent for a web browser.

In [5]:
url_header = {
    'User-Agent':'Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0'
}

## Encoding URL

    URL encoding helps to ensure that URLs 
    are properly formatted and can be correctly 
    interpreted by web servers and browsers

In [6]:
def url_encoder(url):
    scheme, netloc, path, query, fragment = list(urlsplit(url))
    
    path = quote(path)
    query = quote_plus(path)
    fragment = quote(fragment)
    
    return urlunsplit((shceme, netloc, path, query, fragment))

In [7]:
def download(number_sema: threading.Semaphore, resource_sema: threading.Semaphore, url: str, output_dir: str, limit: int):
    global seen_links
    global md5_of_images
    global in_progress_threads
    global url_header

    if url in seen_links:
        # print('SKIP: Already checked url, skipping')
        return

    # Acquire the lock for the number_sema semaphore to limit the number of concurrent downloads
    number_sema.acquire()

    in_progress_threads += 1
    
    # Initialize a flag to keep track of whether the resource_sema semaphore has been acquired or not
    acquired_resource_sema = False

    path = urlsplit(url).path

    # Extracts the filename and extension from the URL path
    name, ext = splitext(posixpath.basename(path))
    
    # If there is no file name in the URL path, generate a random unique identifier to use as the file name
    if not name:
        # if path and name are empty (e.g. https://sample.domain/abcd/?query)
        name = str(uuid.uuid4())
    else:
        # Remove any leading or trailing whitespace characters from the file name
        name = name.strip()

    """
     instead of going to 
                         "www.ñandu.com", 
     we go to 
                         "www.nandu.com" 
     instead, by the following code:
    """  
    try:
        url.encode('ascii')
    except UnicodeEncodeError:  # the url contains non-ascii characters
        url = url_encoder(url)

     

    try:
        request = Request(url, None, url_header)
        image = urlopen(request).read()
        
        # Identify the type of the image stored as bytes in a BytesIO object using imghdr module
        # and assign the detected image type to the image_type variable.
        imgtype = imghdr.what(BytesIO(image), image)
        
        if not imgtype:
            # print('SKIP: Invalid image, not saving ' + name)
            return

        # Attach a file extension based on an image header
        if not ext:
            ext = 'jpg' if imgtype == 'jpeg' else imgtype

        filename = name + '.' + ext

        if hashlib.md5(image).hexdigest() in md5_of_images:
            # print('SKIP: Image is a duplicate, not saving ' + filename)
            return

        
        # Check if file with same name already exists in output directory
        i = 0
        while exists(join(output_dir, filename)):
            # If file exists, increment index and update filename
            i += 1
            filename = str(name) + "-" + str(index) + "." + str(ext)

        # Updating the dictionary and adding name of the file which is created
        md5_of_images[hashlib.md5(image).hexdigest()] = filename

        resource_sema.acquire()
        acquired_resource_sema = True

        # Stop crawling if we achieved the number of images we wanted.
        # and also stop crawling if there is no limit for crawling.
        if limit is not None and len(seen_links) >= limit:
            return

        with open(join(output_dir, filename), 'wb') as file:
            file.write(image)

        print(" OK : " + filename)
        seen_links.append(url)
    except Exception as e:
        print("FAIL: " + name, str(e))
    finally:
        number_sema.release()
        if acquired_resource_sema:
            resource_sema.release()
        in_progress_threads -= 1

In [8]:

def images_searched_by_keyword(number_sema: threading.Semaphore, resource_sema: threading.Semaphore, keyword: str,
                              output_dir: str, filters: str, limit: int):
    global seen_links
    global md5_of_images
    global in_progress_threads
    global url_header
    current = 1
    last = ''
    while True:
        time.sleep(0.1)

        request_url = 'https://www.bing.com/images/async?q=' + quote_plus(keyword) + '&first=' + str(
            current) + '&count=35&qft=' + ('' if filters is None else filters)
        request = Request(request_url, None, headers=url_header)
        response = urlopen(request)
        html = response.read().decode('utf8')
        links = re.findall('murl&quot;:&quot;(.*?)&quot;', html)
        try:
            if links[-1] == last:
                return
            for index, link in enumerate(links):
                if limit is not None and len(seen_links) >= limit:
                    exit(0)
                t = threading.Thread(target=download, args=(number_sema, resource_sema, link, output_dir, limit))
                t.start()
                current += 1
            last = links[-1]
        except IndexError:
            print('FAIL: No search results for "{0}"'.format(keyword))
            return

In [9]:
def backup_history(*args):
    global output_dir
    global seen_links
    global md5_of_images
    global in_progress_threads
    global url_header
    download_history = open(join(output_dir, 'download_history.pickle'), 'wb')
    pickle.dump(seen_links, download_history)
    copied_md5_of_images = dict(
        md5_of_images)  # We are working with the copy, because length of input variable for pickle must not be changed during dumping
    pickle.dump(copied_md5_of_images, download_history)
    download_history.close()
    print('history_dumped')
    if args:
        exit(0)

In [10]:
def main():
    global output_dir
    global seen_links
    global md5_of_images
    global url_header
    
    # Define default values
    number_of_threads = 20
    large_image_filter = "+filterui:imagesize-large"
    filters = ''
    limit = None
    
    # Get input values from user
    search_string = input("Enter the sentence you want to have that images: ")
    
    # Set up file paths and directories
    if not exists(output_dir):
        makedirs(output_dir)
    output_sub_dir = join(output_dir, search_string.strip().replace(' ', '_'))
    if not exists(output_sub_dir):
        makedirs(output_sub_dir)
        
    # Load previous download history
    try:
        download_history = open(join(output_dir, 'download_history.pickle'), 'rb')
        seen_links = pickle.load(download_history)
        md5_of_images = pickle.load(download_history)
        download_history.close()
    except (OSError, IOError):
        seen_links = []
        
    # Set up semaphores for threading
    number_sema = threading.BoundedSemaphore(number_of_threads)
    resource_sema = threading.Semaphore()
    
    # Search for images using the specified keyword and filters
    images_searched_by_keyword(number_sema, resource_sema, search_string, output_sub_dir, filters, limit)
    
    # Save download history
    backup_history()

In [11]:
if __name__ == "__main__":
    main()

Enter the sentence you want to have that images: dog
 OK : Dog-Pictures..jpg
 OK : Australian-Cattle-Dog-Photo1..jpg
 OK : 06E96929-A7D8-4892-B2FE-721C9843B91B..jpeg
FAIL: mE2slJVUHtmPqV6cK8X3P6Kb_qcxfSuSw5KDoPJLfo4 HTTP Error 403: Forbidden
 OK : 04-dog-breeds-dalmation..jpg
 OK : West_Highland_White_Terrier_xd00of..jpeg
 OK : grey-6..jpg
 OK : scientists-have-figured-out-how-dogs-make-us-fall-in-love-with-them..jpg
 OK : 769157..jpg
 OK : dogs-044..jpg
 OK : beagle-RolfKopfle-Photolibrary-Getty-135631212-56a26b1d3df78cf772756667..jpg
 OK : 99a8a89f-fd3b-4a4c-8258-c187850a455a-GettyImages-516766620..jpg
 OK : RQbVOd..jpg
 OK : dog-1463218026uIC..jpg
 OK : family+dogs-473268..jpeg
 OK : large-breeds-4-1024x768..jpg
 OK : cute-dog-picture-039-06..jpg
 OK : Of-course-dogs-smile..jpg
 OK : dog-nose..jpg
 OK : yo..jpg
 OK : KbyzmXa..jpg
 OK : embark-feline-friendly-dogs-4-931x1024..jpg
 OK : Pomeranian-Dog-Breed..png
 OK : 630f0ef3f6f3126ca11f19f4a9b85243..jpg
 OK : types-of-retriever-dogs