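# Download images from Google to use them in a CNN (Multicore)

Note: the parallel work in this notebook is done with `multiprocessing.dummy.Pool`, which is a pool of threads rather than processes.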

In [1]:
from multiprocessing.dummy import Pool
from selenium import webdriver
from bs4 import BeautifulSoup
from PIL import Image
import urllib.request
import pyprind
import time
import io
import os

In [2]:
# Google image-search URL template; `{}` is replaced by the search term.
url = 'https://www.google.de/search?tbm=isch&q={}'

# Search terms. The first word of each term becomes the label that is
# written into the file name of every image found for it.
queries = [
    'apple fruit', 'avocado', 'aubergine', 'banana', 'butter', 'broccoli',
    'cherry', 'carrot', 'cucumber', 'durian', 'egg', 'fig', 'fennel',
    'grape', 'kiwi', 'leek', 'lemon', 'mango', 'melon', 'meat', 'noodles',
    'olive', 'pea', 'potato', 'rice', 'raspberry fruit', 'strawberry',
    'zucchini',
]

# Output folder for the downloaded images. `exist_ok=True` keeps this cell
# re-runnable: a plain `os.mkdir` raises FileExistsError on the second run.
os.makedirs('Images', exist_ok=True)

In [3]:
def fire(queries, scroll_seconds=0):
    """Collect image URLs from Google image search for each query.

    Starts one Firefox instance, loads the result page for every query in
    turn and gathers the address of each ``<img>`` found inside the
    ``<div id="rg_s">`` element of the page.

    Args:
        queries: Iterable of search strings. The first space-separated
            word of each string is stored as the label of its images.
        scroll_seconds: How long (in seconds) to keep scrolling to the
            bottom of the page before reading it. The default of 0 does
            no scrolling at all, so only the initially rendered page is
            parsed.

    Returns:
        A list of ``[image_url, label]`` pairs, in page order.
    """
    # Local import so the function does not rely on `urllib.parse` being
    # pulled in implicitly by `import urllib.request`.
    from urllib.parse import quote_plus

    driver = webdriver.Firefox()
    images = []
    try:
        for query in queries:
            food = query.split(' ')[0]
            # Escape the term so spaces, '&', '#', ... cannot break the URL.
            driver.get(url.format(quote_plus(query)))

            scroll_deadline = time.time() + scroll_seconds
            while time.time() < scroll_deadline:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            bs = BeautifulSoup(driver.page_source, 'lxml')
            container = bs.find('div', {'id': 'rg_s'})
            if container is None:
                # The expected container is missing from this page; skip the
                # query instead of failing the whole chunk.
                print('No result container found for query {!r}'.format(query))
                continue

            for elem in container.find_all('img'):
                # Prefer `src`, fall back to `data-src`; ignore tags with neither.
                src = elem.get('src') or elem.get('data-src')
                if src:
                    images.append([src, food])
    finally:
        # Always shut the browser down, even when a page load or parse fails.
        driver.quit()
    return images


def download(i, timeout=30):
    """Fetch image number ``i``, resize it to 150x100 and save it as a JPEG.

    Reads entry ``i`` of the module-level ``images`` list (a
    ``[url, label]`` pair), stores the result as ``Images/<label>_<i>.jpg``
    and advances the module-level progress bar ``bar``.

    Args:
        i: Index into ``images``; also used as the numeric suffix of the
            output file name.
        timeout: Seconds to wait on the network before giving up, so a
            stalled connection cannot block a worker indefinitely.

    Raises:
        Any exception raised while fetching, decoding or saving the image
        is propagated to the caller.
    """
    source = images[i][0]
    label = images[i][1]
    try:
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(source, timeout=timeout) as response:
            payload = response.read()
        image = Image.open(io.BytesIO(payload))
        resized_image = image.resize((150, 100))
        # JPEG has no alpha channel, hence the conversion to RGB before saving.
        resized_image.convert('RGB').save('Images/{}_{}.jpg'.format(label, i))
    finally:
        # Count the item as processed whether or not it succeeded, so the
        # progress bar still reaches 100 %.
        bar.update()

In [4]:
# Number of queries handled by each worker (each worker runs `fire` once).
n = 7
chunks = [queries[x:x+n] for x in range(0, len(queries), n)]

start1 = time.time()

# One worker thread per chunk. The `with` block shuts the pool down once all
# results have been fetched; `max(1, ...)` avoids the ValueError that a pool
# size of 0 raises when `queries` is empty.
with Pool(max(1, len(chunks))) as pool:
    futures = [pool.apply_async(fire, [chunk]) for chunk in chunks]
    # `.get()` blocks until the worker is done and re-raises its exception.
    data = [future.get() for future in futures]

end1 = time.time()

In [5]:
# Merge the per-worker result lists into one flat list of [url, label] pairs.
images = []
for chunk_result in data:
    images.extend(chunk_result)

print('Images: {}'.format(len(images)))

Images: 2800


In [6]:
start2 = time.time()

# `download` reports progress through this module-level bar.
bar = pyprind.ProgPercent(len(images))

# (index, exception) for every image that could not be processed.
failed = []

# The `with` block shuts the pool down after all tasks have been awaited.
with Pool(10) as pool:
    futures = [pool.apply_async(download, [i]) for i in range(len(images))]
    for i, future in enumerate(futures):
        try:
            future.get()
        except Exception as error:
            # One broken image must not abort the remaining downloads.
            failed.append((i, error))

end2 = time.time()

if failed:
    print('\n{} of {} images could not be downloaded'.format(len(failed), len(images)))

[100 %] Time elapsed: 00:01:45 | ETA: 00:00:00[ 99 %] Time elapsed: 00:01:45 | ETA: 00:00:00[ 94 %] Time elapsed: 00:01:39 | ETA: 00:00:05[ 68 %] Time elapsed: 00:01:09 | ETA: 00:00:32[ 64 %] Time elapsed: 00:01:05 | ETA: 00:00:35
Total time elapsed: 00:01:45


In [7]:
def report_timing(title, duration, count):
    """Print the total duration of a stage and its per-image extrapolations.

    Args:
        title: Heading printed before the figures.
        duration: Elapsed time of the stage in seconds.
        count: Number of images the stage handled.
    """
    print(title)
    print('Duration:\t{}s for {} images'.format(duration, count))
    if count == 0:
        # Nothing to average over; avoid a ZeroDivisionError.
        print('No images collected, so per-image figures are unavailable')
        return
    per_image = duration / count
    print('1 Image:\t{}s'.format(per_image))
    # Seconds for one million images, converted to hours.
    print('1M Images:\t{}h'.format(per_image*1000000/60/60))


report_timing('Getting the urls', end1-start1, len(images))
report_timing('\nDownloading and resizing the images', end2-start2, len(images))

Getting the urls
Duration:	54.145533084869385s for 2800 images
1 Image:	0.019337690387453352s
1M Images:	5.371580663181487h

Downloading and resizing the images
Duration:	105.33815503120422s for 2800 images
1 Image:	0.03762076965400151s
1M Images:	10.450213792778197h
