In [2]:
import os
import re
import requests
import numpy as np

from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from urllib.parse import urlparse
from PIL import Image

from tqdm import tqdm


In [51]:
class URLCrawler():
    """Collect ``<img>`` URLs from search pages and save the images per category.

    For every base URL given to :meth:`download` and every category, the page
    ``<base URL><category>/`` is fetched, the ``src`` of each ``<img>`` tag is
    collected, and each image is written to ``<path>/<category>/``.

    Args:
        categories: A category name, or a list/tuple of category names. Each
            name is used both as the URL suffix and as the sub-directory name.
        path: Root directory that receives one sub-directory per category.
        timeout: Seconds to wait on each HTTP request before giving up.
    """

    # A URL path segment is treated as a ready-made file name when it contains
    # a digit directly followed by a known image extension.
    IMAGE_NAME_PATTERN = re.compile(r'\d\.(jpg|png|jpeg)')

    def __init__(self, categories, path='./Data/', timeout=30) -> None:
        # Accept tuples as well as lists; anything else is one single category.
        if isinstance(categories, (list, tuple)):
            self.categories = list(categories)
        else:
            self.categories = [categories]
        self.path = path
        self.timeout = timeout
        self.makeDirectory()

    def checkURL(self, requested_url):
        """Return ``requested_url``, prefixed with ``https://`` if it has no scheme."""
        if not urlparse(requested_url).scheme:
            requested_url = "https://" + requested_url
        return requested_url

    def requestAndParse(self, requested_url):
        """Fetch a page and parse it with BeautifulSoup.

        Returns:
            The parsed ``BeautifulSoup`` object, or ``None`` when the request or
            the parsing failed (the error is printed, not raised).
        """
        requested_url = self.checkURL(requested_url)
        # Browser-like headers sent with the page request.
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                                 'AppleWebKit/537.11 (KHTML, like Gecko) '
                                 'Chrome/23.0.1271.64 Safari/537.11',
                   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                   'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                   'Accept-Encoding': 'none',
                   'Accept-Language': 'en-US,en;q=0.8',
                   'Connection': 'keep-alive'}
        try:
            request_obj = Request(url=requested_url, headers=headers)
            # The context manager closes the HTTP response once it is parsed.
            with urlopen(request_obj, timeout=self.timeout) as page_html:
                return BeautifulSoup(page_html, "html.parser")
        except Exception as e:
            # Best-effort crawl: report the failure and let the caller move on.
            print(e)
            return None

    def crawlImageURL(self, soupObject):
        """Return the distinct ``src`` values of all ``<img>`` tags, in page order.

        Args:
            soupObject: Parsed page, or ``None`` (e.g. after a failed fetch).

        Returns:
            A list of unique image URLs; empty when ``soupObject`` is ``None``.
            ``<img>`` tags without a ``src`` attribute are ignored.
        """
        if soupObject is None:
            return []
        sources = (img.get('src') for img in soupObject.find_all('img'))
        # dict.fromkeys de-duplicates while keeping first-seen order.
        return list(dict.fromkeys(src for src in sources if src))

    def makeDirectory(self):
        """Create ``<path>/<category>`` for every category that is missing."""
        for name in self.categories:
            path = os.path.join(self.path, name)
            if not os.path.exists(path):
                # makedirs also creates the root directory when it is absent.
                os.makedirs(path)
                print('{} IS CREATED!!!'.format(name))
            else:
                print('{} IS EXIST!!!'.format(path))

    def _filenameFromURL(self, url, category):
        """Pick the file name for ``url``: its own image name, else a random one."""
        # Only the path is inspected, so a trailing query string cannot hide
        # (or replace) the real file name.
        for segment in reversed(urlparse(url).path.split('/')):
            if self.IMAGE_NAME_PATTERN.search(segment):
                return segment
        return '{}_{}_{}.jpg'.format(category,
                                     np.random.randint(0, 100000),
                                     np.random.randint(0, 100000))

    def downloadImgWithURL(self, urlCollection, category):
        """Download every image URL into ``<path>/<category>/``.

        Protocol-relative URLs (``//host/...``) are fetched over HTTPS. URLs
        that are not HTTP(S) (``data:`` URIs, relative paths) are skipped, and
        a failed request is printed and skipped instead of aborting the run.
        Only responses with status code 200 are written to disk.
        """
        target_dir = os.path.join(self.path, category)
        os.makedirs(target_dir, exist_ok=True)

        for url in urlCollection:
            if url.startswith('//'):
                url = 'https:' + url
            if urlparse(url).scheme not in ('http', 'https'):
                continue

            try:
                response = requests.get(url, timeout=self.timeout)
            except requests.RequestException as e:
                print(e)
                continue

            if response.status_code == 200:
                filename = self._filenameFromURL(url, category)
                with open(os.path.join(target_dir, filename), 'wb') as image_file:
                    image_file.write(response.content)

    def download(self, website):
        """Crawl every category on every given site and save the images found.

        Args:
            website: A base URL string, or a list/tuple of base URLs. The
                category name and a trailing ``/`` are appended to each.
        """
        if isinstance(website, (list, tuple)):
            urlList = list(website)
        else:
            urlList = [website]

        for web in urlList:
            print('CRAWLING AT ' + web)
            for category in tqdm(self.categories):
                print('CRAWLING IMAGE OF ' + category.upper())
                soupObject = self.requestAndParse(web + category + '/')
                if soupObject is None:
                    # The fetch failed and was already reported.
                    continue
                imgCollection = self.crawlImageURL(soupObject)
                self.downloadImgWithURL(imgCollection, category)



In [54]:
# Base search URLs; the crawler is handed each of these together with the
# category names below.
url_resources = [
    'https://www.pexels.com/search/',
    'https://unsplash.com/s/photos/',
]

# Categories to crawl.
animal_list = [
    'cat',
    'lion',
    'leopard',
    'tiger',
    'jaguar',
    'sphynx',
    'dog',
    'wolf',
    'husky',
    'corgi',
    'pug',
]

# Build the crawler for these categories, then crawl every resource.
crawling = URLCrawler(animal_list)
crawling.download(url_resources)

cat IS CREATED!!!
lion IS CREATED!!!
leopard IS CREATED!!!
tiger IS CREATED!!!
jaguar IS CREATED!!!
sphynx IS CREATED!!!
dog IS CREATED!!!
wolf IS CREATED!!!
husky IS CREATED!!!
corgi IS CREATED!!!
pug IS CREATED!!!
CRAWILING AT https://www.pexels.com/search/


  0%|          | 0/11 [00:00<?, ?it/s]

CRAWLING IMAGE OF CAT


  9%|▉         | 1/11 [00:10<01:45, 10.55s/it]

CRAWLING IMAGE OF LION


 18%|█▊        | 2/11 [00:17<01:14,  8.30s/it]

CRAWLING IMAGE OF LEOPARD


 27%|██▋       | 3/11 [00:24<01:03,  7.98s/it]

CRAWLING IMAGE OF TIGER


 36%|███▋      | 4/11 [00:35<01:03,  9.01s/it]

CRAWLING IMAGE OF JAGUAR


 45%|████▌     | 5/11 [00:42<00:50,  8.36s/it]

CRAWLING IMAGE OF SPHYNX


 55%|█████▍    | 6/11 [00:49<00:39,  7.94s/it]

CRAWLING IMAGE OF DOG


 64%|██████▎   | 7/11 [00:59<00:34,  8.56s/it]

CRAWLING IMAGE OF WOLF


 73%|███████▎  | 8/11 [01:09<00:26,  8.90s/it]

CRAWLING IMAGE OF HUSKY


 82%|████████▏ | 9/11 [01:15<00:16,  8.19s/it]

CRAWLING IMAGE OF CORGI


 91%|█████████ | 10/11 [01:22<00:07,  7.76s/it]

CRAWLING IMAGE OF PUG


100%|██████████| 11/11 [01:30<00:00,  8.21s/it]


CRAWILING AT https://unsplash.com/s/photos/


  0%|          | 0/11 [00:00<?, ?it/s]

CRAWLING IMAGE OF CAT


  9%|▉         | 1/11 [00:16<02:43, 16.37s/it]

CRAWLING IMAGE OF LION


 18%|█▊        | 2/11 [00:44<03:28, 23.12s/it]

CRAWLING IMAGE OF LEOPARD


 27%|██▋       | 3/11 [01:15<03:34, 26.81s/it]

CRAWLING IMAGE OF TIGER


 36%|███▋      | 4/11 [01:38<02:58, 25.48s/it]

CRAWLING IMAGE OF JAGUAR


 45%|████▌     | 5/11 [02:06<02:37, 26.32s/it]

CRAWLING IMAGE OF SPHYNX


 55%|█████▍    | 6/11 [02:42<02:27, 29.51s/it]

CRAWLING IMAGE OF DOG


 64%|██████▎   | 7/11 [03:07<01:52, 28.21s/it]

CRAWLING IMAGE OF WOLF


 73%|███████▎  | 8/11 [03:37<01:25, 28.51s/it]

CRAWLING IMAGE OF HUSKY


 82%|████████▏ | 9/11 [04:02<00:55, 27.60s/it]

CRAWLING IMAGE OF CORGI


 91%|█████████ | 10/11 [04:29<00:27, 27.47s/it]

CRAWLING IMAGE OF PUG


100%|██████████| 11/11 [05:00<00:00, 27.35s/it]
