In [25]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
from pathlib import Path
from time import sleep
import pandas as pd

In [26]:
# restrict the search to the taxonomy group "Animalia"
def filter_animals(filters):
    """Select the 'Animalia' entry of the 'Taxonomy' filter.

    Clicks the element whose text is 'Taxonomy', pauses two seconds, clicks
    the element whose text is 'Animalia', and finally clicks the 'Taxonomy'
    element a second time (to collapse the section again).

    Args:
        filters: Selenium element used to run the XPath lookups.

    Returns:
        None.
    """
    taxonomy_header = filters.find_element_by_xpath("//*[text()='Taxonomy']")
    taxonomy_header.click()
    # fixed pause before the 'Animalia' entry is looked up
    sleep(2)
    filters.find_element_by_xpath("//*[text()='Animalia']").click()
    # second click on the header collapses the section
    taxonomy_header.click()

### Why filter by Country Legends?
Source: https://www.iucnredlist.org/resources/summary-statistics

Important note: the figures presented in Tables 5 and 6 differ from the default setting for country searches on the website. The default search includes ALL occurrences within each country (i.e., including introduced species, vagrant records, etc.). To modify country searches on the website to match the tables below, use the Country Legends filters on the Advanced Search page to show species tagged as 'Extant', 'Extant & Reintroduced', 'Extinct', 'Extinct & Reintroduced', 'Possibly Extinct', and 'Possibly Extinct & Reintroduced'.

In [27]:
# tick the requested entries of the "Country Legends" filter
def filter_country_legends(filters, filter_items):
    """Click every requested entry inside the 'Country Legends' filter.

    Clicks the element whose text is 'Country Legends', pauses two seconds,
    then looks up and clicks one element per entry of ``filter_items`` (exact
    text match), and finally clicks the 'Country Legends' element again (to
    collapse the section).

    Args:
        filters: Selenium element used to run the XPath lookups.
        filter_items: Iterable of label texts to click, in order.

    Returns:
        None.
    """
    section_header = filters.find_element_by_xpath("//*[text()='Country Legends']")
    section_header.click()
    # fixed pause before the individual entries are looked up
    sleep(2)
    for label in filter_items:
        # each entry is located by its exact text and clicked right away
        filters.find_element_by_xpath("//*[text()='{}']".format(label)).click()
    # second click on the header collapses the section
    section_header.click()

In [28]:
# collect all region entries of the "Land Regions" section
# (click=True is needed on the first call)
def get_regions(filters, click=False):
    """Return the list items found under the 'Land Regions' section.

    Args:
        filters: Selenium element used to run the XPath lookup.
        click: When true, the 'Land Regions' element is clicked before the
            items are collected.

    Returns:
        The elements with class 'filter__list__item' found below the parent
        of the 'Land Regions' element.
    """
    header = filters.find_element_by_xpath("//*[text()='Land Regions']")
    if click:
        header.click()
    # ".." steps up to the parent element, which holds the region entries
    return header.find_element_by_xpath("..").find_elements_by_class_name('filter__list__item')

In [29]:
# collect the labels of all countries of the currently opened region
def get_countries(filters):
    """Return the ``label`` elements of the country entries.

    Starting from the parent of the 'Land Regions' element, the first
    'filter__list__item' is taken and the 'filter__list__item' elements nested
    inside it are collected. The first nested item is skipped (presumably it
    is not a country entry - TODO confirm against the page markup).

    Args:
        filters: Selenium element used to run the XPath lookup.

    Returns:
        List with one ``label`` element per remaining nested item.
    """
    header = filters.find_element_by_xpath("//*[text()='Land Regions']")
    # ".." steps up to the parent element of the section header
    section = header.find_element_by_xpath("..")
    outer_item = section.find_element_by_class_name('filter__list__item')
    nested_items = outer_item.find_elements_by_class_name('filter__list__item')
    return [entry.find_element_by_tag_name('label') for entry in nested_items[1:]]

In [30]:
# click the "Land Regions" button to return to all filters
def click_return_regions(filters):
    """Click the second element whose text is 'Land Regions'.

    Args:
        filters: Selenium element used to run the XPath lookup.

    Returns:
        None.

    Raises:
        IndexError: If fewer than two matching elements are found.
    """
    matches = filters.find_elements_by_xpath("//*[text()='Land Regions']")
    # index 1: the second match is the one that gets clicked
    matches[1].click()

In [31]:
# click a filter entry of "Land Regions"
# offset is needed to click the arrow on the right instead of the checkbox
def click_filter(driver, element, offset=False):
    """Move the pointer onto ``element`` and click.

    Args:
        driver: Selenium driver handed to ``ActionChains``.
        element: Element to move the pointer to.
        offset: When true, the pointer is shifted 115 px to the right of the
            element before clicking; otherwise it is not shifted.

    Returns:
        None.
    """
    x_offset = 115 if offset else 0
    chain = ActionChains(driver).move_to_element(element).move_by_offset(x_offset, 0).click()
    chain.perform()

In [32]:
# click the "show all" button at the bottom of the main content until all species are loaded
def load_all(driver, wait=10):
    """Keep clicking the 'show all' button until it can no longer be clicked.

    The main content element (class 'layout-page__major') is looked up, the
    button with class 'section__link-out' inside it is clicked, and after a
    pause the main content is looked up again. This repeats until the lookup
    or the click fails, which is treated as "everything is loaded".

    Implemented as a loop rather than recursion so that long result lists do
    not run into Python's recursion limit.

    Args:
        driver: Selenium driver showing the result page.
        wait: Seconds to pause after every click so new content can load.

    Returns:
        None.

    Raises:
        Exception: Whatever the very first main-content lookup raises.
            KeyboardInterrupt / SystemExit are never suppressed.
    """
    # the first lookup is not guarded: without the main content there is nothing to load
    main_content = driver.find_element_by_class_name('layout-page__major')
    while True:
        try:
            main_content.find_element_by_class_name('section__link-out').click()
            # todo: find better method to wait for loaded content
            sleep(wait)
            # look the container up again before the next click
            main_content = driver.find_element_by_class_name('layout-page__major')
        except Exception:
            # best effort: a missing or unclickable button ends the loading.
            # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt through.
            return

In [33]:
# extract species info from the html <li> result items
def extract_content(driver):
    """Parse the currently displayed result list into a DataFrame.

    The inner HTML of the element with class 'layout-page__major' is parsed
    with BeautifulSoup; every ``li`` with class 'list-results__item' becomes
    one row. Values are read positionally from the item's child nodes.

    Args:
        driver: Selenium driver showing the result page.

    Returns:
        pandas.DataFrame with the columns 'kingdom_class', 'common_name',
        'scientific_name', 'trend', 'region' and 'threat_level' (no columns
        at all when no result item is found).
    """
    # hand the rendered html of the main content over to beautifulsoup
    main_html = driver.find_element_by_class_name('layout-page__major').get_attribute('innerHTML')
    soup = BeautifulSoup(main_html, 'html.parser')

    rows = [
        {
            'kingdom_class': entry.contents[0].string,
            'common_name': entry.contents[1].text,
            'scientific_name': entry.contents[2].text,
            'trend': entry.contents[3].text,
            'region': entry.contents[4].text,
            # the threat level is taken from the 'title' attribute of the sixth child
            'threat_level': entry.contents[5].get('title'),
        }
        for entry in soup.find_all('li', class_='list-results__item')
    ]
    return pd.DataFrame(rows)

In [34]:
# --- configuration ---
# geckodriver binary, resolved against the current working directory
DRIVER = Path('./geckodriver').absolute()
# search page that is opened in the browser
URL = 'https://www.iucnredlist.org/search/list'
# directory the scraped per-country csv files are written to
OUTPUT_PATH = Path('./data/IUCN/scraped')

# connect to browser with selenium
driver = webdriver.Firefox(executable_path=DRIVER)
driver.get(URL)
# get filters
filters = driver.find_element_by_class_name('filter')
# filter only animals
filter_animals(filters)
# filter for country legends
# NOTE(review): the IUCN note quoted in the markdown cell above also lists 'Extant';
# it is not part of this list - confirm whether that omission is intentional.
country_legends_items = ['Extant & Reintroduced', 'Extinct', 'Extinct & Reintroduced', 'Possibly Extinct', 'Possibly Extinct & Reintroduced']
filter_country_legends(filters, country_legends_items)

In [35]:
# NOTE(review): repeats the OUTPUT_PATH assignment of the setup cell with the same value
OUTPUT_PATH = Path('./data/IUCN/scraped')
# show the configured output directory
print(OUTPUT_PATH)

data/IUCN/scraped


In [36]:
# Scrape the species list of every country, region by region, and store one
# csv file per country in OUTPUT_PATH.

# make sure the output directory exists, otherwise the first to_csv call fails
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# first time with click
regions = get_regions(filters, click=True)
num_regions = len(regions)
# iterate all regions
for i in range(num_regions):
    # the region elements are fetched again on every pass and addressed by index
    regions = get_regions(filters)
    # offset=True: click the arrow on the right instead of the checkbox
    click_filter(driver, regions[i], offset=True)
    sleep(2)

    # iterate over all countries per region
    countries = get_countries(filters)
    num_countries = len(countries)
    for j in range(num_countries):
        # the country labels are fetched again on every pass as well
        countries = get_countries(filters)
        click_filter(driver, countries[j])
        # todo: fix country name
        # label text up to the first '(' is used as the country name
        country_name = countries[j].text.split('(')[0].rstrip()
        # load the whole content
        load_all(driver)
        # extract content
        # todo: find better method to wait for results to be loaded
        sleep(10)
        content = extract_content(driver)
        # save to csv
        # append the extension explicitly: with_suffix() would cut off everything
        # after a dot inside the country name
        file_path = OUTPUT_PATH / f"{country_name.replace(' ', '_')}.csv"
        content.to_csv(file_path, index=False)
        # uncheck country
        click_filter(driver, countries[j])
        sleep(2)

    # get back to all regions
    click_return_regions(filters)
    sleep(2)