In [65]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
from pathlib import Path
from time import sleep
import pandas as pd

In [66]:
# filter for Taxonomy group Animalia
def filter_animals(filters):
    """Tick the 'Animalia' entry inside the 'Taxonomy' filter section.

    The section header is clicked to open it, the 'Animalia' entry is
    clicked, and the header is clicked a second time to close the section.

    Args:
        filters: object offering ``find_element_by_xpath`` (here the
            Selenium element of the filter panel).
    """
    lookup = filters.find_element_by_xpath

    taxonomy_header = lookup("//*[text()='Taxonomy']")
    taxonomy_header.click()  # open the section
    sleep(2)                 # fixed 2 s pause before the entry is looked up

    animalia_entry = lookup("//*[text()='Animalia']")
    animalia_entry.click()   # select the animals entry

    taxonomy_header.click()  # close the section again

### Why filter by country legends?
https://www.iucnredlist.org/resources/summary-statistics

Important note: the figures presented in Tables 5 and 6 differ from the default setting for country searches on the website. The default search includes ALL occurrences within each country (i.e., including introduced species, vagrant records, etc.). To modify country searches on the website to match the tables below, use the Country Legends filters on the Advanced Search page to show species tagged as 'Extant', 'Extant & Reintroduced', 'Extinct', 'Extinct & Reintroduced', 'Possibly Extinct', and 'Possibly Extinct & Reintroduced'.

In [67]:
# filter country legends
def filter_country_legends(filters, filter_items):
    """Tick the given entries inside the 'Country Legends' filter section.

    The section header is clicked to open it, every entry named in
    ``filter_items`` is looked up by its exact text and clicked in order,
    and the header is clicked once more to close the section.

    Args:
        filters: object offering ``find_element_by_xpath`` (here the
            Selenium element of the filter panel).
        filter_items: iterable of entry labels to click.
    """
    entry_xpath = "//*[text()='{}']"

    section_header = filters.find_element_by_xpath(entry_xpath.format('Country Legends'))
    section_header.click()  # open the section
    sleep(2)                # fixed 2 s pause before the entries are looked up

    for label in filter_items:
        # look up and click one entry at a time, in the order given
        filters.find_element_by_xpath(entry_xpath.format(label)).click()

    section_header.click()  # close the section again

In [68]:
# click filter for a country
def click_country_filter(filters, country_name):
    """Click the filter entry whose text equals ``country_name``.

    Calling it once ticks the country, calling it again un-ticks it (the
    notebook uses it for both).  After the click the function pauses for
    two seconds.

    Args:
        filters: object offering ``find_element_by_xpath`` (here the
            Selenium element of the filter panel).
        country_name: exact label of the country entry.  Names containing
            apostrophes and/or double quotes are supported.
    """
    # XPath 1.0 string literals have no escape character, so the quoting
    # style has to be chosen to fit the name.  Names without an apostrophe
    # produce exactly the same expression as before.
    if "'" not in country_name:
        literal = f"'{country_name}'"
    elif '"' not in country_name:
        literal = f'"{country_name}"'
    else:
        # both quote kinds present: glue single-quoted pieces together with
        # a double-quoted apostrophe in between via concat()
        pieces = (f"'{piece}'" for piece in country_name.split("'"))
        literal = "concat(" + ", \"'\", ".join(pieces) + ")"

    filter_country = filters.find_element_by_xpath(f"//*[text()={literal}]")
    filter_country.click()
    sleep(2)

In [69]:
# click the "Land Regions" button to return to all filters
def click_return_regions(filters):
    """Click the second element whose text is 'Land Regions'.

    Per the notebook, this is the button that leads back from a region's
    country list to the list of all regions.

    Args:
        filters: object offering ``find_elements_by_xpath`` (here the
            Selenium element of the filter panel).
    """
    land_region_matches = filters.find_elements_by_xpath("//*[text()='Land Regions']")
    # index 1 = second match; the first match is not the one that is clicked
    back_button = land_region_matches[1]
    back_button.click()

In [70]:
# click a filter for "Land Regions"
# offset is needed to click the arrow on the right instead of the checkbox
def click_filter(driver, element, offset=False):
    """Move the mouse onto ``element`` and click, optionally shifted right.

    Args:
        driver: Selenium WebDriver the action chain is built for.
        element: element to move the pointer to.
        offset: when truthy the pointer is moved a further 115 px to the
            right before clicking, otherwise it is not shifted.
    """
    horizontal_shift = 115 if offset else 0

    chain = ActionChains(driver)
    chain = chain.move_to_element(element)
    chain = chain.move_by_offset(horizontal_shift, 0)
    chain.click().perform()

In [71]:
# click the "show all" button at the bottom of the main content until all species are loaded
def load_all(driver, wait_seconds=10):
    """Click the "show all" link again and again until it is gone.

    On every pass the main content container ('layout-page__major') is
    looked up, the first element with class 'section__link-out' inside it
    is clicked, and the function waits ``wait_seconds`` before trying
    again.  It returns as soon as no such element is found.

    Args:
        driver: Selenium WebDriver showing the result page.
        wait_seconds: pause after each click, in seconds.  Defaults to 10,
            the value that used to be hard-coded.

    Raises:
        Whatever ``driver.find_element_by_class_name`` raises when the main
        content container itself is missing (unchanged from before).
    """
    while True:
        # look the container up on every pass rather than reusing the
        # handle obtained before the previous click
        main_content = driver.find_element_by_class_name('layout-page__major')

        # the plural finder returns an empty list when nothing matches, so
        # the end of the loading is detected without catching exceptions
        show_all_buttons = main_content.find_elements_by_class_name('section__link-out')
        if not show_all_buttons:
            return

        try:
            show_all_buttons[0].click()
        except Exception as exc:
            # stay best-effort like the previous version, but say so instead
            # of hiding the problem
            print(f'load_all: stopped, could not click "show all": {exc!r}')
            return

        # todo: find better method to wait for loaded content
        sleep(wait_seconds)

In [72]:
# extract species info from html li items
def extract_content(driver):
    """Collect the species entries currently shown into a DataFrame.

    The inner HTML of the main content container ('layout-page__major') is
    parsed with BeautifulSoup and every ``<li class="list-results__item">``
    becomes one row.  The first six children of each item are read
    positionally.

    Args:
        driver: Selenium WebDriver showing the result page.

    Returns:
        pandas.DataFrame with the columns 'kingdom_class', 'common_name',
        'scientific_name', 'trend', 'region' and 'threat_level' (in this
        order).  The columns are present even when no item was found, so a
        CSV written from an empty result still has a header line.

    Raises:
        IndexError: if an item has fewer than six children.
    """
    columns = [
        'kingdom_class', 'common_name', 'scientific_name',
        'trend', 'region', 'threat_level',
    ]

    # get main html content
    main_content = driver.find_element_by_class_name('layout-page__major')
    main_html = main_content.get_attribute('innerHTML')

    # parse the html with beautifulsoup
    soup = BeautifulSoup(main_html, 'html.parser')

    results = []
    for item in soup.find_all('li', class_='list-results__item'):
        children = item.contents
        results.append({
            'kingdom_class': children[0].string,
            'common_name': children[1].text,
            'scientific_name': children[2].text,
            'trend': children[3].text,
            'region': children[4].text,
            # the threat level is taken from the 'title' attribute, not the text
            'threat_level': children[5].get('title'),
        })

    # explicit columns keep the frame's layout stable for an empty result
    return pd.DataFrame(results, columns=columns)

In [76]:
# --- scraper configuration ---------------------------------------------
# search page that is opened in the browser
URL = 'https://www.iucnredlist.org/search/list'
# geckodriver binary in the working directory, as an absolute path
DRIVER = Path('./geckodriver').absolute()
# folder the per-country CSV files are written to
OUTPUT_PATH = Path('./data/IUCN/scraped')

# Countries to scrape, grouped by region.  Both the region keys and the
# country names are used verbatim as the text to look up in the filter panel.
OECD_COUNTRIES = dict([
    # currently disabled:
    # ('Europe', [
    #     'Austria', 'Belgium', 'Czechia', 'Denmark', 'Finland', 'France', 'Germany', 'Greece', 'Hungary',
    #     'Iceland', 'Ireland', 'Italy', 'Luxembourg', 'Netherlands', 'Norway', 'Poland', 'Portugal', 'Slovakia', 'Spain',
    #     'Sweden', 'Switzerland', 'United Kingdom',
    # ]),
    ('Oceania', ['Australia', 'New Zealand']),
    ('North America', ['Canada', 'United States']),
    ('East Asia', ['Japan', 'Korea, Republic of']),
    ('Mesoamerica', ['Mexico']),
])

# start Firefox through selenium and open the search page
driver = webdriver.Firefox(executable_path=DRIVER)
driver.get(URL)
# grab the filter panel; all filter helpers work on this element
filters = driver.find_element_by_class_name('filter')
# filter only animals
filter_animals(filters)
# Filter for country legends.  The IUCN note quoted in this notebook lists
# six tags needed to match Tables 5 and 6; 'Extant' was missing here, which
# would leave out every species that is simply extant in a country.
# NOTE(review): confirm against the live page that 'Extant' is not pre-selected.
country_legends_items = [
    'Extant',
    'Extant & Reintroduced',
    'Extinct',
    'Extinct & Reintroduced',
    'Possibly Extinct',
    'Possibly Extinct & Reintroduced',
]
filter_country_legends(filters, country_legends_items)

In [None]:
# show the region labels configured in OECD_COUNTRIES
# (the original statement had no colon or body and could not be run)
for r in OECD_COUNTRIES.keys():
    print(r)

In [77]:
# open the "Land Regions" filter section
land_region = filters.find_element_by_xpath("//*[text()='Land Regions']")
land_region.click()
# make sure the target folder exists before the first CSV is written
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
# iterate over all configured regions and their countries
for region_name, country_names in OECD_COUNTRIES.items():
    print(f'region: {region_name}')
    # open section for current region (offset: click the arrow, not the checkbox)
    filter_region = filters.find_element_by_xpath(f"//*[text()='{region_name}']")
    click_filter(driver, filter_region, offset=True)

    for country_name in country_names:
        print(f'loading data for country: {country_name}')
        # tick the country
        click_country_filter(filters, country_name)
        # load the whole content
        load_all(driver)
        # extract content
        # todo: find better method to wait for results to be loaded
        sleep(10)
        content = extract_content(driver)
        # save to csv; '.csv' is appended as plain text because with_suffix()
        # would cut a name containing a dot (e.g. 'St._Lucia') at its last dot
        file_path = OUTPUT_PATH / (country_name.replace(' ', '_') + '.csv')
        content.to_csv(file_path, index=False)
        # uncheck country
        click_country_filter(filters, country_name)
    # get back to all regions
    click_return_regions(filters)
    sleep(2)

region: Oceania


KeyError: 'Europe'

In [12]:
# Index-based variant of the scraping loop.
# NOTE(review): get_regions / get_countries are not defined in the visible
# part of this notebook; they are used as-is.

# first time with click
regions = get_regions(filters, click=True)
num_regions = len(regions)
# make sure the target folder exists before the first CSV is written
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
# iterate all regions
#for i in range(num_regions):
for i in range(min(2, num_regions)): # only the first two for testing
    # fetch the region elements again on every pass and use the fresh list
    regions = get_regions(filters)
    click_filter(driver, regions[i], offset=True)
    sleep(2)

    # iterate over all countries per region
    countries = get_countries(filters)
    num_countries = len(countries)
    #for j in range(num_countries):
    for j in range(min(2, num_countries)): # only the first two for testing
        countries = get_countries(filters)
        click_filter(driver, countries[j])
        # todo: fix country name
        # keep the part of the label before the first '(' without trailing whitespace
        country_name = countries[j].text.split('(')[0].rstrip()
        # load the whole content
        load_all(driver)
        # extract content
        # todo: find better method to wait for results to be loaded
        sleep(10)
        content = extract_content(driver)
        # save to csv; '.csv' is appended as plain text because with_suffix()
        # would cut a name containing a dot at its last dot
        file_path = OUTPUT_PATH / (country_name.replace(' ', '_') + '.csv')
        content.to_csv(file_path, index=False)
        # uncheck country
        click_filter(driver, countries[j])
        sleep(2)

    # get back to all regions
    click_return_regions(filters)
    sleep(2)

Austria
Belgium
Czechia
Denmark
Finland
France
Germany
Greece
Hungary
Iceland
Ireland
Italy
Luxembourg
Netherlands
Norway
Poland
Portugal
Slovakia
Spain
Sweden
Switzerland
United Kingdom
Australia
New Zealand
Canada
United States
Japan
Korea, Republic of
Mexico


In [3]:
import pandas as pd
from pathlib import Path

In [4]:
# location of the protected-areas CSV below the local data folder
DATA_PATH = Path('./data')
PROTECTED_AREAS = DATA_PATH.joinpath('OECD', 'PROTECTED_AREAS.csv')

# read the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=PROTECTED_AREAS)

In [12]:
# list the column labels of the table read from PROTECTED_AREAS
df.columns

Index(['COU', 'Country', 'DESIG', 'Designation', 'DOMAIN', 'Domain', 'MEASURE',
       'Measure', 'CALCULATION', 'Calculation method', 'SCOPE', 'Scope', 'YEA',
       'Year', 'Unit Code', 'Unit', 'PowerCode Code', 'PowerCode',
       'Reference Period Code', 'Reference Period', 'Value', 'Flag Codes',
       'Flags'],
      dtype='object')

In [13]:
# preview the first rows of the table
df.head()

Unnamed: 0,COU,Country,DESIG,Designation,DOMAIN,Domain,MEASURE,Measure,CALCULATION,Calculation method,...,Year,Unit Code,Unit,PowerCode Code,PowerCode,Reference Period Code,Reference Period,Value,Flag Codes,Flags
0,AUS,Australia,ALL_INC_POINTS,"All, including data recorded as points",TERRESTRIAL,Terrestrial,PCNT,Percent of total land/EEZ area,NAIVE,Total for category,...,1970,PC,Percentage,0,Units,,,1.08,,
1,AUS,Australia,ALL_INC_POINTS,"All, including data recorded as points",TERRESTRIAL,Terrestrial,PCNT,Percent of total land/EEZ area,NAIVE,Total for category,...,1980,PC,Percentage,0,Units,,,2.81,,
2,AUS,Australia,ALL_INC_POINTS,"All, including data recorded as points",TERRESTRIAL,Terrestrial,PCNT,Percent of total land/EEZ area,NAIVE,Total for category,...,1990,PC,Percentage,0,Units,,,5.06,,
3,AUS,Australia,ALL_INC_POINTS,"All, including data recorded as points",TERRESTRIAL,Terrestrial,PCNT,Percent of total land/EEZ area,NAIVE,Total for category,...,1995,PC,Percentage,0,Units,,,6.11,,
4,AUS,Australia,ALL_INC_POINTS,"All, including data recorded as points",TERRESTRIAL,Terrestrial,PCNT,Percent of total land/EEZ area,NAIVE,Total for category,...,2000,PC,Percentage,0,Units,,,7.06,,


In [10]:
# distinct values of the 'Country' column, in order of first appearance
df['Country'].unique()

array(['Australia', 'Austria', 'Belgium', 'Canada', 'Czech Republic',
       'Denmark', 'Finland', 'France', 'Germany', 'Greece', 'Hungary',
       'Iceland', 'Ireland', 'Italy', 'Japan', 'Korea', 'Luxembourg',
       'Mexico', 'Netherlands', 'New Zealand', 'Norway', 'Poland',
       'Portugal', 'Slovak Republic', 'Spain', 'Sweden', 'Switzerland',
       'United Kingdom', 'United States', 'G7', 'NAFTA', 'American Samoa',
       'Anguilla', 'Argentina', 'Armenia', 'Aruba', 'Azerbaijan',
       'Belarus', 'Bermuda', 'Brazil', 'British Virgin Islands',
       'Bulgaria', 'Cayman Islands', 'Chile',
       "China (People's Republic of)", 'Colombia', 'Cook Islands',
       'Costa Rica', 'Croatia', 'Cyprus', 'Estonia', 'Faeroe Islands',
       'Falkland Islands (Malvinas)', 'French Guiana', 'French Polynesia',
       'Georgia', 'Greenland', 'Guadeloupe', 'Guam', 'India', 'Indonesia',
       'Israel', 'Kazakhstan', 'Kyrgyzstan', 'Latvia', 'Lithuania',
       'Malta', 'Martinique', 'Mayotte', '