# Match unique recommended names from the name server with GBIF

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import asyncio
import aiohttp
import requests
import time

In [2]:
import sys
# ADAPT FILE PATH TO GET FUNCTIONALITIES
sys.path.append('../../../../inbo-pyutils/gbif/gbif_name_match')
from gbif_species_name_match import (extract_species_information, 
                                     extract_gbif_species_names_info, 
                                     extract_gbif_accepted_key)

## Summary

We first extracted **all unique recommended species names used in recorder** (NBN-data, all surveys) with this query: [raw/recommended_nameserver_names.sql](raw/recommended_nameserver_names.sql). The resulting data is stored in [raw/recommended_nameserver_names.tsv](raw/recommended_nameserver_names.tsv). We then match those names with the GBIF backbone taxonomy, which will allow us to query our names on GBIF `acceptedKey`s. The result of this match is stored in [interim/recommended_nameserver_names_with_gbif_match.tsv](interim/recommended_nameserver_names_with_gbif_match.tsv). Since there are so many names, we will match the names with the [GBIF match API](http://api.gbif.org/v1/species/match) in chunks, asynchronously.

In [3]:
# Read the tab-separated export of recommended names; every column is kept as text.
recorder_names = pd.read_csv(
    '../data/raw/recommended_nameserver_names.tsv',
    sep='\t',
    dtype=object
)

In [4]:
recorder_names.head()

Unnamed: 0,RECOMMENDED_TAXON_VERSION_KEY,scientificName,TAXON_GROUP_NAME
0,NHMSYS0000456996,Caylusea,bloemplant
1,NHMSYS0000900079,Listrognathus mactator,insect - vliesvleugelige (Hymenoptera)
2,NBNSYS0100004462,Paratanytarsus dissimilis,insect - vlieg & mug (Diptera)
3,NHMSYS0000603228,Gymnodinium colymbeticum,wier
4,NHMSYS0000875969,Formicinae,insect - vliesvleugelige (Hymenoptera)


In [5]:
len(recorder_names)

144135

## Rename columns

In [6]:
# Raw recorder column name -> name used in the rest of this notebook (prefix `nbn_`).
rename_columns = dict(
    RECOMMENDED_TAXON_VERSION_KEY='nbn_recommendedTaxonVersionKey',
    scientificName='nbn_scientificName',
    TAXON_GROUP_NAME='nbn_taxonGroup',
)

In [7]:
# Apply the nbn_ column names; the renamed frame replaces the raw one.
recorder_names = recorder_names.rename(columns=rename_columns)

## Add kingdom information

GBIF name matching yields better results if kingdom information can be provided. We therefore map the informal group information in `nbn_taxonGroup` (originally `TAXON_GROUP_NAME`) to a kingdom:

In [8]:
# Informal (Dutch) taxon group name -> kingdom name passed to the GBIF match API.
# Groups that cannot be assigned to a kingdom, and missing values, map to None.
# Each group is listed exactly once, one entry per line.
kingdom_mapping = {
    'insect - kever (Coleoptera)': 'Animalia',
    'conifeer': 'Plantae',
    'beenvis (Actinopterygii)': 'Animalia',
    'vogel': 'Animalia',
    'insect - nachtvlinder': 'Animalia',
    'bloemplant': 'Plantae',
    'spin (Araneae)': 'Animalia',
    'insect - vlieg & mug (Diptera)': 'Animalia',
    'diatomee': 'Plantae',
    'mollusk (Mollusca)': 'Animalia',
    'insect - dagvlinder': 'Animalia',
    'insect - kokerjuffer (Trichoptera)': 'Animalia',
    'ringworm': 'Animalia',
    'mos': 'Plantae',
    'insect - wants, cicade, bladluis (Hemiptera)': 'Animalia',
    'mijt (Acari)': 'Animalia',
    'varen': 'Plantae',
    'schaaldier': 'Animalia',
    'landzoogdier': 'Animalia',
    'kranswier': 'Plantae',
    'insect - sprinkhaan & krekel (Orthoptera)': 'Animalia',
    'korstmos': 'Plantae',
    'platworm (Turbellaria)': 'Animalia',
    'insect - libel (Odonata)': 'Animalia',
    'wier': 'Plantae',
    'paardenstaart': 'Plantae',
    'niet toegewezen': None,
    'levermos': 'Plantae',
    'tweestaart (Diplura)': 'Animalia',
    'rondbek (Agnatha)': 'Animalia',
    'duizendpoot': 'Animalia',
    'wolfsklauw': 'Plantae',
    'amfibie': 'Animalia',
    'bacterie': 'Bacteria',
    'reptiel': 'Animalia',
    'insect - vliesvleugelige (Hymenoptera)': 'Animalia',
    'zwam': 'Fungi',
    # NOTE: the double space in the next key is reproduced from the original mapping.
    'kraakbeenvis  (Chondrichthyes)': 'Animalia',
    'rondworm (Nematoda)': 'Animalia',
    'virus': 'Viruses',
    'insect - steenvlieg (Plecoptera)': 'Animalia',
    'zeezoogdier': 'Animalia',
    'ginkgo': 'Plantae',
    'parasitaire nematode (Nematoda)': 'Animalia',
    'zuigworm (Trematoda)': 'Animalia',
    'biesvaren': 'Plantae',
    'neteldier (=cnidarian)': 'Animalia',
    'insect - trips (Thysanoptera)': 'Animalia',
    'oerdiertje (Protozoa)': 'Protozoa',
    'hauwmos': 'Plantae',
    'lintworm (Cestoda)': 'Animalia',
    'manteldiertje (Urochordata)': 'Animalia',
    'stekelhuidige': 'Animalia',
    'ectoparasitaire platworm': 'Animalia',
    'ribkwal (Ctenophora)': 'Animalia',
    'mosdiertje (Bryozoa)': 'Animalia',
    'pseudoschorpioen (Pseudoscorpiones)': 'Animalia',
    'raderdiertje (Rotifera)': 'Animalia',
    'springstaart (Collembola)': 'Animalia',
    'foraminifeer': 'Protozoa',
    'insect - vlo (Siphonaptera)': 'Animalia',
    'fungoide': 'Fungi',
    'buikhaarworm (Gastrotricha)': 'Animalia',
    'miljoenpoot': 'Animalia',
    'slurfworm (Kinorhyncha)': 'Animalia',
    'insect - haft (Ephemeroptera)': 'Animalia',
    'slijmzwam': 'Protozoa',
    'spons (Porifera)': 'Animalia',
    'slurfworm (Echiura)': 'Animalia',
    'zeespin (Pycnogonida)': 'Animalia',
    'spuitworm (Sipuncula)': 'Animalia',
    'insect - gaasvlieg (Neuroptera)': 'Animalia',
    'insect - houtluis (Psocoptera)': 'Animalia',
    'insect - waaiervleugelige (Strepsiptera)': 'Animalia',
    'kelkworm (Entoprocta)': 'Animalia',
    'beerdiertje (Tardigrada)': 'Animalia',
    'brachiopood (Brachiopoda)': 'Animalia',
    'baardworm (Pogonophora)': 'Animalia',
    'snoerworm (Nemertinea)': 'Animalia',
    'insect - oorworm (Dermaptera)': 'Animalia',
    'insect - luis (Phthiraptera)': 'Animalia',
    'kaakworm (Gnathostomulida)': 'Animalia',
    'pauropood': 'Animalia',
    'oerinsect (Protura)': 'Animalia',
    'loricifeer': 'Animalia',
    'insect - schorpioenvlieg (Mecoptera)': 'Animalia',
    'paardenhaarworm (Nematomorpha)': 'Animalia',
    'insect - kakkerlak (Dictyoptera)': 'Animalia',
    'priapulid': 'Animalia',
    'insect - zilvervis (Thysanura)': 'Animalia',
    'eikelworm (Hemichordata)': 'Animalia',
    'hooiwagen (Opiliones)': 'Animalia',
    'middendiertje (Mesozoa)': 'Animalia',
    'pijlworm (Chaetognatha)': 'Animalia',
    'insect - rotsspringer (Archaeognatha)': 'Animalia',
    'dwergpotige (Symphyla)': 'Animalia',
    'insect - kameelhalsvlieg (Raphidioptera)': 'Animalia',
    'hoefijzerworm (Phoronida)': 'Animalia',
    'insect - wandelende tak (Phasmida)': 'Animalia',
    'lipklever (Cycliophora)': 'Animalia',
    'insect - elzenvlieg (Megaloptera)': 'Animalia',
    'schorpioen': 'Animalia',
    'insect - bidsprinkhaan (Mantodea)': 'Animalia',
    'ongedetermineerd': None,
    # Rows without a taxon group at all.
    np.nan: None,
}

In [9]:
# Derive the kingdom from the informal taxon group via `kingdom_mapping`;
# the groups mapped to None there (and missing groups) end up without a kingdom.
recorder_names['nbn_kingdom'] = recorder_names['nbn_taxonGroup'].replace(kingdom_mapping)

In [10]:
recorder_names['nbn_kingdom'].unique()

array(['Plantae', 'Animalia', 'Protozoa', 'Fungi', 'Bacteria', None,
       'Viruses'], dtype=object)

In [11]:
recorder_names.head()

Unnamed: 0,nbn_recommendedTaxonVersionKey,nbn_scientificName,nbn_taxonGroup,nbn_kingdom
0,NHMSYS0000456996,Caylusea,bloemplant,Plantae
1,NHMSYS0000900079,Listrognathus mactator,insect - vliesvleugelige (Hymenoptera),Animalia
2,NBNSYS0100004462,Paratanytarsus dissimilis,insect - vlieg & mug (Diptera),Animalia
3,NHMSYS0000603228,Gymnodinium colymbeticum,wier,Plantae
4,NHMSYS0000875969,Formicinae,insect - vliesvleugelige (Hymenoptera),Animalia


## Asynchronous requesting

As we want to match over 140.000 different names against the GBIF backbone, the requests are sent asynchronously in order to shorten the time needed to retrieve the results:

### Minimal working example

The concept of an asynchronous request to the API, illustrated with a small example:

In [12]:
class Collector():
    """Collect GBIF species-match responses, keyed by a caller-supplied identifier.

    Attributes
    ----------
    data : dict
        Maps the ``idx`` given to `fetch` to the decoded JSON body of each
        request that returned HTTP status 200.
    """

    def __init__(self):
        self.data = {}

    async def fetch(self, session, name, kingdom, idx):
        """Match a single name against the GBIF species match API.

        Parameters
        ----------
        session
            Object whose ``get(url, params=...)`` result can be used with
            ``async with``; the notebook passes an ``aiohttp.ClientSession``.
        name : str
            Scientific name to match.
        kingdom : str or None
            Kingdom hint; left out of the request when it is None.
        idx : hashable
            Key under which a successful response is stored in ``self.data``.

        Returns
        -------
        None
            A 200 response is stored in ``self.data[idx]``; for any other
            status the response reason is printed and nothing is stored.
        """
        # Booleans are sent as lowercase strings and None values are left out:
        # aiohttp's query-string encoding only accepts str, int and float values,
        # and the resulting TypeError would be hidden by `return_exceptions=True`.
        parameters = {'verbose': 'false', 'strict': 'true', 'name': name, 'kingdom': kingdom}
        parameters = {key: value for key, value in parameters.items() if value is not None}
        # Use the session that was passed in rather than a global variable.
        async with session.get('http://api.gbif.org/v1/species/match', params=parameters) as resp:
            if resp.status != 200:
                # `reason` is a plain attribute, not a coroutine, so it must not be awaited.
                print(resp.reason)
            else:
                self.data[idx] = await resp.json()

In [13]:
# Small sample (first 100 names) to try out the asynchronous matching.
testcase = recorder_names.head(100)

In [14]:
# Plain Python lists of the three request inputs, aligned by position.
names, kingdoms, inbo_ids = (
    testcase[column].values.tolist()
    for column in ('nbn_scientificName', 'nbn_kingdom', 'nbn_recommendedTaxonVersionKey')
)

In [15]:
# Send one match request per sample name and block until all of them are done.
# With return_exceptions=True, an exception raised by an individual request is
# returned by gather instead of being raised here.
# NOTE(review): recent aiohttp releases appear to require `async with` for
# ClientSession - confirm against the installed version.
test = Collector()
with aiohttp.ClientSession() as client:
    asyncio.get_event_loop().run_until_complete(
        asyncio.gather(
            *(test.fetch(client, name, kingdom, idx)
              for name, kingdom, idx in zip(names, kingdoms, inbo_ids)),
            return_exceptions=True
        )
    )

Single result:

In [16]:
test.data['BMSSYS0000002618']

{'canonicalName': 'Bolbitius conocephalus',
 'class': 'Agaricomycetes',
 'classKey': 186,
 'confidence': 100,
 'family': 'Bolbitiaceae',
 'familyKey': 8440,
 'genus': 'Galerella',
 'genusKey': 8617918,
 'kingdom': 'Fungi',
 'kingdomKey': 5,
 'matchType': 'EXACT',
 'order': 'Agaricales',
 'orderKey': 1499,
 'phylum': 'Basidiomycota',
 'phylumKey': 34,
 'rank': 'SPECIES',
 'scientificName': 'Bolbitius conocephalus (Bull.) Fr., 1874',
 'species': 'Galerella conocephala',
 'speciesKey': 5449507,
 'status': 'SYNONYM',
 'synonym': True,
 'usageKey': 2529985}

### Chunked version execution

**BEWARE: THIS STEP WILL TAKE TIME.** To skip it, change the cell type to raw text.

Let's try it in chunks of around 1000 requests... in one single batch:

Combining the three batches (no longer necessary for a single batch):

In [17]:
# Load the match results saved by the (skippable) chunked run; all columns as text.
recorder_names_matched = pd.read_csv(
    '../data/interim/recorder_0_till_144135.csv',
    dtype=object
)

In [18]:
recorder_names_matched.head()

Unnamed: 0.1,Unnamed: 0,canonicalName,class,classKey,confidence,family,familyKey,genus,genusKey,kingdom,...,orderKey,phylum,phylumKey,rank,scientificName,species,speciesKey,status,synonym,usageKey
0,BFN0017900000007,Hyloniscus riparius,Malacostraca,229.0,100,Trichoniscidae,5764.0,Hyloniscus,2208506.0,Animalia,...,643.0,Arthropoda,54.0,SPECIES,"Hyloniscus riparius (Koch, 1838)",Hyloniscus riparius,2208537.0,ACCEPTED,False,2208537.0
1,BFN001790000000A,,,,100,,,,,,...,,,,,,,,,False,
2,BFN001790000000B,,,,100,,,,,,...,,,,,,,,,False,
3,BFN001790000000L,,,,100,,,,,,...,,,,,,,,,False,
4,BFN001790000000M,,,,100,,,,,,...,,,,,,,,,False,


In [19]:
len(recorder_names_matched)

144134

### Retrying the missing requests 

Some requests might have failed, resulting in fewer records in `recorder_names_matched` than in the original file (144.135 rows). Let's compare them.

In [20]:
# Left join: keep every recorder name, with GBIF match columns where available.
recorder_names_merge = recorder_names.merge(
    recorder_names_matched,
    how='left',
    left_on='nbn_recommendedTaxonVersionKey',
    right_on='Unnamed: 0'
)

In [21]:
# Names for which the join found no match record (join key from the right side is empty).
missing_matches = recorder_names_merge.loc[
    recorder_names_merge['Unnamed: 0'].isnull(),
    ['nbn_recommendedTaxonVersionKey', 'nbn_scientificName', 'nbn_kingdom']
]

In [22]:
missing_matches.head(10)

Unnamed: 0,nbn_recommendedTaxonVersionKey,nbn_scientificName,nbn_kingdom
133761,NHMSYS0020110590,Taraxacum amarellum,Plantae


In [23]:
# Re-request the names without a match record, one call per name,
# keyed by the taxon version key.
additional_matches = {
    row['nbn_recommendedTaxonVersionKey']: extract_gbif_species_names_info(
        row['nbn_scientificName'], row['nbn_kingdom'])
    for _, row in missing_matches.iterrows()
}

In [24]:
# Turn the retried matches into a frame with one row per taxon version key
# (the dict keys become the index after transposing).
# NOTE(review): presumably each value is a dict of GBIF match fields - the helper is not shown here.
additional_matches_df = pd.DataFrame(additional_matches).transpose()

In [25]:
# Move the taxon version key out of the index into a column named like the
# key column of `recorder_names_matched`.
additional_matches_df = (
    additional_matches_df
    .reset_index()
    .rename(columns={'index' : 'Unnamed: 0'})
)

Add missing records to data frame:

In [26]:
# Retried matches first, followed by the matches loaded from file.
recorder_names_matched = pd.concat([additional_matches_df, recorder_names_matched])

Merge again:

In [27]:
# Repeat the left join, now against the completed set of match records.
recorder_names_merge = recorder_names.merge(
    recorder_names_matched,
    how='left',
    left_on='nbn_recommendedTaxonVersionKey',
    right_on='Unnamed: 0'
)

Number of missing records (should be 0):

In [28]:
len(recorder_names_merge[recorder_names_merge['Unnamed: 0'].isnull()])

0

## Restructure columns (remove, rename, reorder)

In [29]:
recorder_names_merge.columns

Index(['nbn_recommendedTaxonVersionKey', 'nbn_scientificName',
       'nbn_taxonGroup', 'nbn_kingdom', 'Unnamed: 0', 'canonicalName', 'class',
       'classKey', 'confidence', 'family', 'familyKey', 'genus', 'genusKey',
       'kingdom', 'kingdomKey', 'matchType', 'note', 'order', 'orderKey',
       'phylum', 'phylumKey', 'rank', 'scientificName', 'species',
       'speciesKey', 'status', 'synonym', 'usageKey'],
      dtype='object')

In [30]:
# Columns to remove: the join key, a few match details, and for every
# taxonomic rank both the name column and its `<rank>Key` column.
drop_columns = ['Unnamed: 0', 'kingdomKey', 'synonym', 'confidence'] + [
    column
    for rank in ('phylum', 'class', 'order', 'family', 'genus', 'species')
    for column in (rank, rank + 'Key')
]

In [31]:
# Remove the unneeded columns; the trimmed frame replaces the merged one.
recorder_names_merge = recorder_names_merge.drop(drop_columns, axis=1)

In [32]:
# Every remaining GBIF match column gets the prefix `gbifapi_`.
rename_columns = {
    field: 'gbifapi_' + field
    for field in (
        'usageKey',
        'scientificName',
        'canonicalName',
        'status',
        'rank',
        'matchType',
        'note',
        'kingdom',
    )
}

In [33]:
# Apply the gbifapi_ column names; the renamed frame replaces the previous one.
recorder_names_merge = recorder_names_merge.rename(columns=rename_columns)

In [34]:
# Add three empty (NaN) columns that are filled in later.
recorder_names_merge = recorder_names_merge.assign(
    nameMatchValidation=np.nan,
    gbifapi_acceptedKey=np.nan,
    gbifapi_acceptedScientificName=np.nan
)

In [37]:
# Final column order: recorder (nbn_) columns and the validation column first,
# followed by the GBIF (gbifapi_) columns.
reorder_columns = [
    'nbn_recommendedTaxonVersionKey',
    'nameMatchValidation',
    'nbn_scientificName',
    'nbn_taxonGroup',
    'nbn_kingdom',
] + [
    'gbifapi_' + field
    for field in (
        'kingdom',
        'usageKey',
        'scientificName',
        'canonicalName',
        'status',
        'rank',
        'matchType',
        'note',
        'acceptedKey',
        'acceptedScientificName',
    )
]

In [38]:
# Select the columns in their final order; copy so the result is an independent frame.
recorder_names_merge = recorder_names_merge.loc[:, reorder_columns].copy()

In [39]:
recorder_names_merge.head(2)

Unnamed: 0,nbn_recommendedTaxonVersionKey,nameMatchValidation,nbn_scientificName,nbn_taxonGroup,nbn_kingdom,gbifapi_kingdom,gbifapi_usageKey,gbifapi_scientificName,gbifapi_canonicalName,gbifapi_status,gbifapi_rank,gbifapi_matchType,gbifapi_note,gbifapi_acceptedKey,gbifapi_acceptedScientificName
0,NHMSYS0000456996,,Caylusea,bloemplant,Plantae,Plantae,7275943,Caylusea A. St.-Hil.,Caylusea,ACCEPTED,GENUS,EXACT,,,
1,NHMSYS0000900079,,Listrognathus mactator,insect - vliesvleugelige (Hymenoptera),Animalia,Animalia,1306714,"Listrognathus mactator (Thunberg, 1822)",Listrognathus mactator,ACCEPTED,SPECIES,EXACT,,,


## Getting the acceptedKey for synonyms

By default, set `acceptedKey` and `acceptedScientificName` identical to `usageKey` and `scientificName`. These values will be overwritten for synonyms.

In [40]:
# Default: the matched usage is itself the accepted one (overwritten for synonyms below).
recorder_names_merge = recorder_names_merge.assign(
    gbifapi_acceptedKey=recorder_names_merge['gbifapi_usageKey'],
    gbifapi_acceptedScientificName=recorder_names_merge['gbifapi_scientificName']
)

Unprocessed synonyms:

In [42]:
len(recorder_names_merge[(recorder_names_merge['gbifapi_status'] == 'SYNONYM') & (recorder_names_merge['gbifapi_acceptedKey'] == recorder_names_merge['gbifapi_usageKey'])])

12963

Process synonyms:

**BEWARE: THIS STEP WILL TAKE TIME.** To skip it, change the cell type to raw text.

In [80]:
# Look up the accepted taxon for every synonym that has not been processed yet,
# i.e. whose acceptedKey still equals its own usageKey. Selecting those rows up
# front avoids walking through all rows one by one.
unprocessed_synonyms = (
    (recorder_names_merge['gbifapi_status'] == 'SYNONYM')
    & (recorder_names_merge['gbifapi_acceptedKey'] == recorder_names_merge['gbifapi_usageKey'])
)
for idx, row in recorder_names_merge[unprocessed_synonyms].iterrows():
    # The key may be a number or text, possibly float-formatted ('2529985.0');
    # going through float() accepts all of these, where int() alone fails on the last.
    usage_key = str(int(float(row['gbifapi_usageKey'])))
    # The helper's result is indexed as (accepted key, accepted scientific name).
    acceptedInfo = extract_gbif_accepted_key(usage_key)
    recorder_names_merge.loc[idx, 'gbifapi_acceptedKey'] = acceptedInfo[0]
    recorder_names_merge.loc[idx, 'gbifapi_acceptedScientificName'] = acceptedInfo[1]

Unprocessed synonyms (should be 0):

In [81]:
len(recorder_names_merge[(recorder_names_merge['gbifapi_status'] == 'SYNONYM') & (recorder_names_merge['gbifapi_acceptedKey'] == recorder_names_merge['gbifapi_usageKey'])])

0

## Save result

In [87]:
# Write the final table as tab-separated text without the row index.
# `to_csv` has no `dtype` parameter (that option belongs to `read_csv`), so it is not passed.
recorder_names_merge.to_csv(
    '../data/interim/recommended_nameserver_names_with_gbif_match.tsv',
    sep='\t',
    index=False
)