In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
from pathlib import Path
from time import sleep
import pandas as pd

In [2]:
# filter for Taxonomy group Animalia
def filter_animals(filters):
    filter_taxonomy = filters.find_element_by_xpath("//*[text()='Taxonomy']")
    filter_taxonomy.click() # click section
    sleep(2)
    filter_animals = filters.find_element_by_xpath("//*[text()='Animalia']")
    filter_animals.click()  # click animals section
    filter_taxonomy.click() # collapse filter

### Why filter country legends ?
https://www.iucnredlist.org/resources/summary-statistics

Important note: the figures presented in Tables 5 and 6 differ from the default setting for country searches on the website. The default search includes ALL occurrences within each country (i.e., including introduced species, vagrant records, etc.). To modify country searches on the website to match the tables below, use the Country Legends filters on the Advanced Search page to show species tagged as 'Extant', 'Extant & Reintroduced', 'Extinct', 'Extinct & Reintroduced', 'Possibly Extinct', and 'Possibly Extinct & Reintroduced'.

In [3]:
# filter country legends
def filter_country_legends(filters, filter_items):
    filter_country_legends = filters.find_element_by_xpath("//*[text()='Country Legends']")
    filter_country_legends.click()  # click section
    sleep(2)
    for item in filter_items:
        element = filters.find_element_by_xpath(f"//*[text()='{item}']")
        element.click()   # click items in list
    filter_country_legends.click()  # collapse filter

In [4]:
# get all regions from section "land Regions"
# click is needed for first call
def get_regions(filters, click=False):
    land_region = filters.find_element_by_xpath("//*[text()='Land Regions']")
    if click:
        land_region.click()
    # get section (parent element)
    filter_section = land_region.find_element_by_xpath("..")
    # get all regions
    regions = filter_section.find_elements_by_class_name('filter__list__item')
    return regions

In [5]:
# get all countrys from a region
def get_countries(filters):
    filter_region = filters.find_element_by_xpath("//*[text()='Land Regions']")
    # get section (parent element)
    filter_section = filter_region.find_element_by_xpath("..")
    # get all list items
    country_items = filter_section.find_element_by_class_name('filter__list__item')
    country_items = country_items.find_elements_by_class_name('filter__list__item')
    # extract label
    countries = []
    for element in country_items[1:]:
        countries.append(element.find_element_by_tag_name('label'))
    return countries

In [6]:
# click the "Land Regions" button to return to all filters
def click_return_regions(filters):
    # select second element with name "Land Regions"
    filter_region = filters.find_elements_by_xpath("//*[text()='Land Regions']")[1]
    filter_region.click()

In [7]:
# click a filter for "Land Regions"
# offset is needed to click the arrow on the right instead of the checkbox
def click_filter(driver, element, offset=False):
    x_offset = 0
    if offset:
        x_offset = 115
    ac = ActionChains(driver)
    ac.move_to_element(element).move_by_offset(x_offset, 0).click().perform()

In [8]:
# extract species info from html li items
def extract_content(driver):
    results = []
    
    # get main html content
    main_content = driver.find_element_by_class_name('layout-page__major')
    main_html = main_content.get_attribute('innerHTML')
    
    # get html with beautifulsoup
    soup = BeautifulSoup(main_html, 'html.parser')
    species = soup.find_all('li', class_='list-results__item')
    for s in species:
        result = {}
        result['classification'] = s.contents[0].string
        result['name'] = s.contents[2].string
        result['status'] = s.contents[3].string
        result['region'] = s.contents[4].string
        result['level'] = s.contents[5].get('title')
        results.append(result)
    return pd.DataFrame(results)

In [9]:
# access IUCN webpage with selenium
DRIVER = Path('./geckodriver').absolute()
URL = 'https://www.iucnredlist.org/search/list'
driver = webdriver.Firefox(executable_path=DRIVER)
driver.get(URL)
# get filters
filters = driver.find_element_by_class_name('filter')
# filter only animals
filter_animals(filters)
# filter for country legends
country_legends_items = ['Extant & Reintroduced', 'Extinct', 'Extinct & Reintroduced', 'Possibly Extinct', 'Possibly Extinct & Reintroduced']
filter_country_legends(filters, country_legends_items)

In [10]:
results = {}
# first time with click
regions = get_regions(filters, click=True)
num_regions = len(regions)
# iterate all regions
#for i in range(num_regions):
for i in range(2): # only first to for testing
    regions = get_regions(filters)
    click_filter(driver, regions[i], offset=True)
    sleep(2)

    # iterate over all countries per region
    countries = get_countries(filters)
    num_countries = len(countries)
    #for j in range(num_countries):
    for j in range(2): # only first to for testing
        countries = get_countries(filters)
        click_filter(driver, countries[j])
        # todo: fix country name
        country_name = countries[j].text
        # extract content
        # todo: find better method to wait for results to be loaded
        sleep(10)
        # todo: there could be more results than shown on the initial page
        #       click "show more" until all is loaded or maby some other settings
        content = extract_content(driver)
        results[country_name] = content
        # uncheck country
        click_filter(driver, countries[j])
        sleep(2)

    # get back to all regions
    click_return_regions(filters)
    sleep(2)

In [11]:
results

{'Antarctica (2)': Empty DataFrame
 Columns: []
 Index: [],
 'Bouvet Island (2)':     classification                   name      status  region          level
 0  animalia — aves  Macronectes giganteus  Increasing  Global  Least Concern,
 'Anguilla (19)':               classification                  name      status  region  \
 0        animalia — mammalia    Trichechus manatus  Decreasing  Global   
 1  animalia — chondrichthyes     Pristis pectinata  Decreasing  Global   
 2        animalia — reptilia          Anolis pogus  Increasing  Global   
 3            animalia — aves       Sicalis luteola  Increasing  Global   
 4        animalia — mammalia  Mormoops blainvillei     Unknown  Global   
 5        animalia — reptilia   Leiocephalus cuneus        None  Global   
 
                    level  
 0             Vulnerable  
 1  Critically Endangered  
 2        Near Threatened  
 3          Least Concern  
 4          Least Concern  
 5                Extinct  ,
 'Antigua and Barbuda

In [12]:
results['Antigua and Barbuda (28)']

Unnamed: 0,classification,name,status,region,level
0,animalia — mammalia,Trichechus manatus,Decreasing,Global,Vulnerable
1,animalia — reptilia,Iguana delicatissima,Decreasing,Global,Critically Endangered
2,animalia — chondrichthyes,Pristis pectinata,Decreasing,Global,Critically Endangered
3,animalia — mammalia,Neomonachus tropicalis,,Global,Extinct
4,animalia — mammalia,Mormoops blainvillei,Unknown,Global,Least Concern
5,animalia — aves,Athene cunicularia,Decreasing,Global,Least Concern
6,animalia — aves,Egretta rufescens,Decreasing,Global,Near Threatened
7,animalia — reptilia,Copeoglossum redondae,,Global,Extinct
8,animalia — reptilia,Leiocephalus cuneus,,Global,Extinct
