In [7]:
import os
import re
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
from requests.exceptions import ConnectionError, ConnectTimeout
from random import randint
from GImageScraper import GoogleImageScraper
from patch import webdriver_executable
import mimetypes
from joblib import Parallel, delayed
import urllib3
from tqdm import tqdm

# Silence the InsecureRequestWarning that urllib3 would otherwise emit for
# every request made with certificate verification turned off (dl_image
# calls requests.get with verify=False).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
# Location of the ChromeDriver binary; handed to GoogleImageScraper and to
# webdriver.Chrome further down in the notebook.
webdriver_path = "./webdriver/chromedriver.exe"
# Root folder for all downloads. The trailing slash matters: the helpers below
# build paths by plain string concatenation (image_path + fish).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
image_path = 'd://fish_scraper/'

In [3]:
def dl_image(fish, image_path, url, timeout=30):
    """ Downloads one image and stores it in the folder of the given fish.
        The file is written to "{image_path}{fish}/" and is named with the
        first three characters of `fish`, a random five-digit number and the
        extension derived from the response's Content-Type header. Only
        ".jpg" and ".png" files are written; a response with any other
        recognised type, or with an HTTP error status, is skipped.
        Parameters:
        ---------------
        fish         name of the fish; sub-folder name and file-name prefix
        image_path   root download folder, including the trailing separator
        url          address of the image to fetch
        timeout      seconds passed to requests.get as `timeout`; None waits
                     indefinitely (the behaviour before this parameter existed)
        Raises:
        ---------------
        requests.exceptions.RequestException (e.g. ConnectionError, Timeout)
        when the request itself fails.
    """
    # verify=False: TLS certificate errors are deliberately ignored here.
    r = requests.get(url, allow_redirects=True, verify=False, timeout=timeout)
    if not r.ok:
        # 4xx/5xx bodies are error pages, not images.
        return

    # The header may be absent, and may carry parameters such as
    # "image/jpeg; charset=UTF-8" which mimetypes does not understand.
    content_type = r.headers.get('content-type', '')
    mime_type = content_type.split(';', 1)[0].strip().lower()
    extension = mimetypes.guess_extension(mime_type) if mime_type else None
    if extension is None:
        # Unknown or missing type: keep the original fallback of treating it as JPEG.
        extension = ".jpg"
    elif extension in (".jpeg", ".jpe"):
        # Some MIME tables map image/jpeg to these spellings.
        extension = ".jpg"
    if extension not in (".jpg", ".png"):
        return

    target_dir = f"{image_path}{fish}"
    os.makedirs(target_dir, exist_ok=True)

    # Random ids collide fairly quickly with a few hundred files per folder;
    # redraw a bounded number of times instead of silently overwriting.
    for _ in range(1000):
        fileid = randint(10000, 99999)
        target = f"{target_dir}/{fish[:3]}{fileid}{extension}"
        if not os.path.exists(target):
            break

    with open(target, 'wb') as handle:
        handle.write(r.content)

def scrape_fishbase(fish, image_path):
    """ Downloads the FishBase photos of one fish into "{image_path}{fish}".
        Fetches the summary page for `fish`, takes the first number found in
        the element whose title is "English" (presumably the FishBase species
        ID — confirm against the page markup), then downloads every distinct
        300px-wide thumbnail listed on the thumbnail summary page for that ID.
        If no number can be found, a message is printed and nothing is
        downloaded.
        Parameters:
        ---------------
        fish         fish name as used in the FishBase summary URL
        image_path   root download folder, including the trailing separator
    """
    os.makedirs(image_path + fish, exist_ok=True)

    def get_url(image):
        """Turn an <img> tag's relative src into an absolute FishBase URL."""
        url = image['src'].replace("%2F", "/")
        # The first two characters are dropped — presumably a leading "..".
        return "https://www.fishbase.de" + url[2:]

    html_page = requests.get(f'http://fishbase.de/summary/{fish}')
    soup = bs(html_page.content, 'html.parser')
    txt = soup.find(title="English")
    # str(None) is "None", which has no digits, so a missing element and an
    # element without a number both end up here as "no match".
    match = re.search('[0-9]+', str(txt))
    if match is None:
        print(f"Fishbase: no ID found for {fish}, skipping")
        return
    fish_id = match[0]

    html_page = requests.get(f'https://www.fishbase.de/photos/thumbnailssummary.php?ID={fish_id}')
    soup = bs(html_page.content, 'html.parser')
    image_urls = soup.find_all("img", width="300")
    # A set removes thumbnails that appear more than once on the page.
    urls = {get_url(image) for image in image_urls}
    for url in urls:
        dl_image(fish, image_path, url)
    print(f"Fishbase done scraping for {fish}")
    

In [4]:
def scrape_google(fish, image_path=image_path):
    """ Collects Google image URLs for one fish via GoogleImageScraper.
        Parameters:
        ---------------
        fish         search term handed to the scraper
        image_path   download folder handed to the scraper; defaults to the
                     value the module-level image_path had when this function
                     was defined
        Returns whatever GoogleImageScraper.find_image_urls() returns.
    """
    scraper_args = (
        webdriver_path,
        image_path,
        fish,
        200,            # number_of_images
        True,           # headless
        (100, 100),     # min_resolution
        (1400, 1200),   # max_resolution
    )
    return GoogleImageScraper(*scraper_args).find_image_urls()

def get_files(urls, fish=None):
    """ Downloads every URL in `urls` into the folder of one fish.
        A failed connection is reported and skipped so the remaining URLs
        are still attempted.
        Parameters:
        ---------------
        urls         sized collection of image URLs
        fish         name of the fish the images belong to. When omitted, the
                     module-level variable `fish` is used, which is what this
                     function silently relied on before the parameter existed.
        Raises:
        ---------------
        NameError when `fish` is omitted and no global `fish` exists.
    """
    if fish is None:
        # Legacy fallback: pick up whatever a surrounding loop left in `fish`.
        try:
            fish = globals()["fish"]
        except KeyError:
            raise NameError(
                "name 'fish' is not defined; pass fish=... to get_files"
            ) from None

    total = len(urls)
    for i, url in enumerate(urls, start=1):
        print(f"Round {i} of {total}")
        try:
            dl_image(fish, image_path, url)
        except ConnectionError as e:
            print('Ignoring Exception', e)

In [None]:
def get_fish(fish):
    """ Scrapes FishBase and Google for one fish and downloads the images.
        A FishBase connect timeout is reported and ignored. Each Google URL is
        downloaded independently: a connection error on one URL is reported
        and the remaining URLs are still attempted (same handling as
        get_files).
        Parameters:
        ---------------
        fish         fish name, used as search term and folder name
    """
    print(fish)
    try:
        scrape_fishbase(fish, image_path)
    except ConnectTimeout as e:
        print("Fishbase unresponsive. Ignoring", e)

    urls = scrape_google(fish, image_path)
    # Plain loop instead of a list comprehension: the calls are made for their
    # side effect only, and one unreachable host must not abort the batch.
    for url in urls:
        try:
            dl_image(fish, image_path, url)
        except ConnectionError as e:
            print('Ignoring Exception', e)
    

# Read the 'Taxonomy' column and turn each name into a hyphenated slug.
fish_list = pd.read_csv('data/Fish_list.csv')['Taxonomy'].tolist()
fish_string = [name.replace(" ", "-") for name in fish_list]

# Scrape the first four fish, one worker per fish.
Parallel(n_jobs=4)(delayed(get_fish)(fish) for fish in fish_string[:4])

In [None]:
# Load the fish names from the 'Taxonomy' column and replace spaces with
# hyphens (same preparation as in the parallel cell above).
fish_list = pd.read_csv('data/Fish_list.csv')
fish_list = list(fish_list['Taxonomy'])
fish_string = [string.replace(" ", "-") for string in fish_list]


# u[k] holds the Google image URLs found for fish_string[k]; later cells index
# into it and pair it with fish_string[:3].
u = []
# NOTE: the loop variable `fish` stays bound to the last name after the loop,
# and later cells read it (e.g. `search_key = fish`).
for fish in fish_string[:3]:
    print(fish)
    scrape_fishbase(fish, image_path)
    urls = scrape_google(fish, image_path)
    u.append(urls)
    # Download step disabled here; the URLs are only collected.
    #get_files(urls)

In [None]:
# Show, for each position in u, how many URLs were collected.
for i, found in enumerate(u):
    print(i, len(found))

In [None]:
# Materialise the (fish, urls) pairs first: tqdm cannot ask a bare zip() for
# its length, so without this the outer bar shows no total or percentage.
fish_and_links = list(zip(fish_string[:3], u))
for fish, link_urls in tqdm(fish_and_links, desc="Fish"):
    for url in tqdm(link_urls, desc="Links"):
        dl_image(fish, image_path, url)

In [None]:
# Download every collected URL for the first three fish. u[k] is paired with
# fish_string[k]; a connection error on one URL is printed and skipped so the
# remaining URLs are still attempted.
for fish, link_urls in zip(fish_string[:3], u):
    for url in link_urls:
        try:
            dl_image(fish, image_path, url)
        except ConnectionError as e:
            print('Ignoring Exception', e)

In [None]:
def store_many_hdf5(images, labels):
    """ Stores an array of images and their labels in one new HDF5 file.
        The file is created as hdf5_dir / "{N}_many.h5" (N = len(images)) and
        receives two datasets, "images" and "meta", both written with the
        HDF5 type h5py.h5t.STD_U8BE.
        Parameters:
        ---------------
        images       images array, e.g. (N, 32, 32, 3), to be stored
        labels       labels array, e.g. (N, 1), to be stored
    """
    # NOTE(review): h5py, np and hdf5_dir are not imported or defined in the
    # visible notebook cells — they must exist before this is called.
    num_images = len(images)

    # The context manager closes the file even if a dataset write fails.
    with h5py.File(hdf5_dir / f"{num_images}_many.h5", "w") as file:
        file.create_dataset(
            "images", np.shape(images), h5py.h5t.STD_U8BE, data=images
        )
        file.create_dataset(
            "meta", np.shape(labels), h5py.h5t.STD_U8BE, data=labels
        )

In [None]:
import re, urllib.parse
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(executable_path=webdriver_path)

# NOTE(review): no driver.get(...) is issued before the lookup below, so on a
# freshly started browser the loop has nothing to iterate over — confirm which
# results page was meant to be loaded first.
# try/finally guarantees the browser is shut down even if a lookup raises.
try:
    # find_element(s)(By.CSS_SELECTOR, ...) replaces the find_*_by_css_selector
    # helpers, which newer Selenium releases no longer provide.
    for result in driver.find_elements(By.CSS_SELECTOR, '.js-images-link'):
        img = result.find_element(By.CSS_SELECTOR, '.js-images-link a img')
        title = img.get_attribute('alt')
        link = result.find_element(By.CSS_SELECTOR, '.js-images-link a').get_attribute('href')
        thumbnail_encoded = img.get_attribute('src')
        # https://regex101.com/r/4pgG5m/1
        match_thumbnail_urls = ''.join(re.findall(r'https\:\/\/external\-content\.duckduckgo\.com\/iu\/\?u\=(.*)&f=1', thumbnail_encoded))
        # https://www.kite.com/python/answers/how-to-decode-a-utf-8-url-in-python
        thumbnail = urllib.parse.unquote(match_thumbnail_urls).replace('&h=160', '')
        image = result.get_attribute('data-id')

        print(f'{title}\n{link}\n{thumbnail}\n{image}\n')
finally:
    driver.quit()


In [None]:
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium import webdriver
import time

options = Options()
options.add_argument('--headless')

driver = webdriver.Chrome(webdriver_path, options=options)

indx=1
# NOTE: `fish` is whatever an earlier loop left bound to that name.
search_key = fish
image_urls = []
class_names = ["n3VNCb"]
# The search URL does not depend on the loop index, so build it once.
url = "https://www.google.com/search?q=%s&source=lnms&tbm=isch&sa=X&ved=2ahUKEwie44_AnqLpAhUhBWMBHUFGD90Q_AUoAXoECBUQAw&biw=1920&bih=947"%(search_key)

# try/finally so the headless browser is always shut down.
try:
    for i in range(1,20):
        driver.get(url)
        time.sleep(3)
        # find_elements returns an empty list instead of raising when the
        # i-th grid cell has no thumbnail; such cells are skipped.
        thumbnails = driver.find_elements(By.XPATH, value=f'//*[@id="islrg"]/div[1]/div[{i}]/a[1]/div[1]/img')
        if not thumbnails:
            continue

        thumbnails[0].click()
        time.sleep(1)

        # Use the first class name that yields any elements; stays empty when
        # none does (previously an IndexError).
        images = []
        for class_name in class_names:
            images = driver.find_elements(by=By.CLASS_NAME, value=class_name)
            if images:
                break

        for image in images:
            # Keep only links that contain "http" and are not Google's
            # "encrypted" thumbnails; src may be missing altogether.
            src_link = image.get_attribute("src")
            if src_link and "http" in src_link and "encrypted" not in src_link:
                image_urls.append(src_link)
finally:
    driver.quit()

In [None]:
#xx = driver.find_elements(by=By.CLASS_NAME, value='n3VNCb')
#src_link = xx[0].get_attribute("src")

In [None]:
# Display the name currently bound to `fish` (left over from an earlier loop).
fish

In [None]:
# dl_image expects (fish, image_path, url); the earlier call passed
# (url, fish) and therefore always failed with a missing-argument TypeError.
for url in image_urls:
    dl_image(fish, image_path, url)

In [None]:
import mimetypes

# Manual check: fetch one known image and print the file extension that
# mimetypes derives from the response's Content-Type header (the same lookup
# dl_image performs).
r = requests.get("https://www.ikea.com/dk/da/images/products/jaettestor-tojdyr-elefant-gra__0877889_pe633605_s5.jpg", allow_redirects=True)
content_type = r.headers['content-type']
extension = mimetypes.guess_extension(content_type)
print(extension)

In [None]:
# Number of URLs collected in the first entry of u.
len(u[0])

In [None]:
# Number of URLs collected in the second entry of u.
len(u[1])

In [None]:
# Number of URLs collected in the third entry of u.
len(u[2])