In [1]:
import requests
from bs4 import BeautifulSoup
import os
from PIL import Image
import time

In [2]:
def scrape_vectorstock(directory, start_page, end_page):
    """Save the images found on a range of VectorStock owl-sketch result pages.

    Images are written to ``<directory>/Page_<start>-<end>/`` and named
    ``VectorStock_Page_<page>_Image_<index>.<format>``, where ``<format>`` is
    the format name Pillow reports for the downloaded image.

    Parameters
    ----------
    directory : str
        Output directory. It is created, together with any missing parent
        directories, when it does not exist yet.
    start_page, end_page : int
        First and last result page to scrape (both inclusive). When
        ``start_page`` is greater than ``end_page`` no page is requested and
        only the directories are created.

    Returns
    -------
    None
    """
    url = 'https://www.vectorstock.com/royalty-free-vectors/owl-sketch-vectors'
    request_timeout = 30  # seconds; without a timeout requests may wait forever
    delay = 0.1           # pause before every HTTP request

    print('Checking if {} exists.'.format(directory))
    if not os.path.isdir(directory):
        print('Creating {}.'.format(directory))
        # makedirs instead of mkdir: nested targets such as '../data/raw/x'
        # must not fail just because an intermediate directory is missing.
        os.makedirs(directory, exist_ok=True)
        print('{} created.'.format(directory))
    else:
        print('{} exists.'.format(directory))

    directory_tail = 'Page_{:02d}-{:02d}'.format(start_page, end_page)
    subdirectory = os.path.join(directory, directory_tail)
    if not os.path.isdir(subdirectory):
        print('Creating subdirectory {}.'.format(subdirectory))
        os.makedirs(subdirectory, exist_ok=True)
        print('{} subdirectory created.'.format(subdirectory))
    else:
        print('Subdirectory {} exists.'.format(subdirectory))

    for page in range(start_page, end_page + 1):
        time.sleep(delay)
        print('Scraping page {}'.format(page))
        response = requests.get(url + '-page_{}'.format(page), timeout=request_timeout)
        # Name the parser explicitly so the parse tree does not depend on
        # which optional parsers happen to be installed.
        soup = BeautifulSoup(response.text, 'html.parser')
        # <img> tags without a usable src cannot be downloaded; skip them
        # instead of failing with KeyError.
        image_urls = [tag['src'] for tag in soup.find_all('img') if tag.get('src')]

        for index, image_url in enumerate(image_urls, start=1):
            time.sleep(delay)
            print('Saving image {} of {} to {}.'.format(index, len(image_urls), subdirectory))
            # The context manager releases the streamed connection once the
            # image has been written to disk.
            with requests.get(image_url, stream=True, timeout=request_timeout) as image_response:
                # Let urllib3 undo any Content-Encoding before Pillow reads the bytes.
                image_response.raw.decode_content = True
                img = Image.open(image_response.raw)
                img_name = 'VectorStock_Page_{:02d}_Image_{:03d}.{}'.format(page, index, img.format)
                img.save(os.path.join(subdirectory, img_name))

In [3]:
def scrape_adobe(directory, start_page, end_page, subcategory_url_string, subcategory_title):
    """Save the images found on a range of Adobe Stock "owl <subcategory>" search pages.

    For every ``<img>`` tag the ``data-lazy`` attribute is preferred as the
    image URL; otherwise ``src`` is used when it ends with ``.jpg``. Tags
    matching neither rule are ignored. Images are written to
    ``<directory>/Page_<start>-<end>/`` and named
    ``AdobeStock<subcategory_title>_Page_<page>_Image_<index>.<format>``,
    where ``<format>`` is the format name Pillow reports for the image.

    Parameters
    ----------
    directory : str
        Output directory. It is created, together with any missing parent
        directories, when it does not exist yet.
    start_page, end_page : int
        First and last search page to scrape (both inclusive). When
        ``start_page`` is greater than ``end_page`` no page is requested and
        only the directories are created.
    subcategory_url_string : str
        Text appended to ``owl+`` in the search query of the URL.
    subcategory_title : str
        Label inserted into every saved file name.

    Returns
    -------
    None
    """
    url = 'https://stock.adobe.com/search/images?&k=owl+{}'.format(subcategory_url_string)
    request_timeout = 30  # seconds; without a timeout requests may wait forever
    delay = 0.1           # pause before every HTTP request

    print('Checking if {} exists.'.format(directory))
    if not os.path.isdir(directory):
        print('Creating {}.'.format(directory))
        # makedirs instead of mkdir: nested targets such as '../data/raw/x'
        # must not fail just because an intermediate directory is missing.
        os.makedirs(directory, exist_ok=True)
        print('{} created.'.format(directory))
    else:
        print('{} exists.'.format(directory))

    directory_tail = 'Page_{:03d}-{:03d}'.format(start_page, end_page)
    subdirectory = os.path.join(directory, directory_tail)
    if not os.path.isdir(subdirectory):
        print('Creating subdirectory {}.'.format(subdirectory))
        os.makedirs(subdirectory, exist_ok=True)
        print('{} subdirectory created.'.format(subdirectory))
    else:
        print('Subdirectory {} exists.'.format(subdirectory))

    for page in range(start_page, end_page + 1):
        time.sleep(delay)
        print('Scraping page {}'.format(page))
        response = requests.get(url + '&search_page={}'.format(page), timeout=request_timeout)
        # Name the parser explicitly so the parse tree does not depend on
        # which optional parsers happen to be installed.
        soup = BeautifulSoup(response.text, 'html.parser')

        # Resolve one download URL per usable tag. Using .get() means a tag
        # that has neither data-lazy nor src is skipped rather than raising
        # KeyError while the page is being filtered.
        image_urls = []
        for tag in soup.find_all('img'):
            lazy_url = tag.get('data-lazy')
            src_url = tag.get('src', '')
            if lazy_url:
                image_urls.append(lazy_url)
            elif src_url.endswith('.jpg'):
                image_urls.append(src_url)

        for index, image_url in enumerate(image_urls, start=1):
            time.sleep(delay)
            # The context manager releases the streamed connection once the
            # image has been written to disk.
            with requests.get(image_url, stream=True, timeout=request_timeout) as image_response:
                # Let urllib3 undo any Content-Encoding before Pillow reads the bytes.
                image_response.raw.decode_content = True
                img = Image.open(image_response.raw)
                print('Saving image {} of {} to {}.'.format(index, len(image_urls), subdirectory))
                img_name = 'AdobeStock{}_Page_{:03d}_Image_{:03d}.{}'.format(
                    subcategory_title, page, index, img.format)
                img.save(os.path.join(subdirectory, img_name))

In [4]:
def scrape_fineartamerica(directory, start_page, end_page, subcategory_url_string, subcategory_title):
    """Save the artwork images found on a range of Fine Art America owl listing pages.

    Only ``<img>`` tags that carry a ``data-src`` attribute containing
    ``artworkimages`` are downloaded. Images are written to
    ``<directory>/Page_<start>-<end>/`` and named
    ``FineArtAmerica<subcategory_title>_Page_<page>_Image_<index>.<format>``,
    where ``<format>`` is the format name Pillow reports for the image.

    Parameters
    ----------
    directory : str
        Output directory. It is created, together with any missing parent
        directories, when it does not exist yet.
    start_page, end_page : int
        First and last listing page to scrape (both inclusive). When
        ``start_page`` is greater than ``end_page`` no page is requested and
        only the directories are created.
    subcategory_url_string : str
        Path segment placed between ``/art/`` and ``/owl`` in the URL.
    subcategory_title : str
        Label inserted into every saved file name.

    Returns
    -------
    None
    """
    url = 'https://fineartamerica.com/art/{}/owl'.format(subcategory_url_string)
    request_timeout = 30  # seconds; without a timeout requests may wait forever
    delay = 0.1           # pause before every HTTP request

    print('Checking if {} exists.'.format(directory))
    if not os.path.isdir(directory):
        print('Creating {}.'.format(directory))
        # makedirs instead of mkdir: nested targets such as '../data/raw/x'
        # must not fail just because an intermediate directory is missing.
        os.makedirs(directory, exist_ok=True)
        print('{} created.'.format(directory))
    else:
        print('{} exists.'.format(directory))

    directory_tail = 'Page_{:03d}-{:03d}'.format(start_page, end_page)
    subdirectory = os.path.join(directory, directory_tail)
    if not os.path.isdir(subdirectory):
        print('Creating subdirectory {}.'.format(subdirectory))
        os.makedirs(subdirectory, exist_ok=True)
        print('{} subdirectory created.'.format(subdirectory))
    else:
        print('Subdirectory {} exists.'.format(subdirectory))

    for page in range(start_page, end_page + 1):
        time.sleep(delay)
        print('Scraping page {}'.format(page))
        response = requests.get(url + '?page={}'.format(page), timeout=request_timeout)
        # Name the parser explicitly so the parse tree does not depend on
        # which optional parsers happen to be installed.
        soup = BeautifulSoup(response.text, 'html.parser')
        # Keep only tags whose data-src points at an artwork image.
        image_urls = [tag['data-src'] for tag in soup.find_all('img')
                      if 'artworkimages' in tag.get('data-src', '')]

        for index, image_url in enumerate(image_urls, start=1):
            time.sleep(delay)
            # The context manager releases the streamed connection once the
            # image has been written to disk.
            with requests.get(image_url, stream=True, timeout=request_timeout) as image_response:
                # Let urllib3 undo any Content-Encoding before Pillow reads the bytes.
                image_response.raw.decode_content = True
                img = Image.open(image_response.raw)
                print('Saving image {} of {} to {}.'.format(index, len(image_urls), subdirectory))
                img_name = 'FineArtAmerica{}_Page_{:03d}_Image_{:03d}.{}'.format(
                    subcategory_title, page, index, img.format)
                img.save(os.path.join(subdirectory, img_name))

In [5]:
# directory = '../data/raw/vectorstock'
# scrape_vectorstock(directory, 1, 21)

In [6]:
# directory = '../data/raw/vectorstock'
# scrape_vectorstock(directory, 22, 41)

In [7]:
# directory = '../data/raw/adobe_sketch'
# subcategory = 'sketch'
# subcategory_title = 'Sketch'
# scrape_adobe(directory, 1, 50, subcategory, subcategory_title)

In [8]:
# directory = '../data/raw/adobe_sketch'
# subcategory = 'sketch'
# subcategory_title = 'Sketch'
# scrape_adobe(directory, 51, 100, subcategory, subcategory_title)

In [9]:
# directory = '../data/raw/adobe_drawings'
# subcategory = 'drawing'
# subcategory_title = 'Drawings'
# scrape_adobe(directory, 1, 50, subcategory, subcategory_title)

In [10]:
# directory = '../data/raw/adobe_drawings'
# subcategory = 'drawing'
# subcategory_title = 'Drawings'
# scrape_adobe(directory, 51, 100, subcategory, subcategory_title)

In [11]:
# directory = '../data/raw/fineartamerica_drawings'
# subcategory_url = 'drawings'
# subcategory_title = 'Drawings'
# scrape_fineartamerica(directory, 1, 35, subcategory_url, subcategory_title)

In [12]:
# directory = '../data/raw/fineartamerica_digital'
# subcategory_url = 'digital+art'
# subcategory_title = 'Digital'
# scrape_fineartamerica(directory, 1, 35, subcategory_url, subcategory_title)

In [13]:
# directory = '../data/raw/fineartamerica_paintings'
# subcategory_url = 'paintings'
# subcategory_title = 'Paintings'
# scrape_fineartamerica(directory, 1, 35, subcategory_url, subcategory_title)