In [1]:
import pandas as pd
import numpy as np

## Read in new species to upload

In [2]:
unmatched_taxa = pd.read_csv('processed_data/taxa_unmatched.tsv', sep='\t')
unmatched_taxa.head()

Unnamed: 0,Phylum,Class,Order,Family,Subfamily,Genus,Scientific Name,tax_count,Species,BOLD Name
0,Arthropoda,Insecta,Diptera,Acroceridae,Acrocerinae,Ogcodes,Ogcodes dispar,1,dispar,Ogcodes dispar
1,Arthropoda,Insecta,Diptera,Acroceridae,Acrocerinae,Turbopsebius,Turbopsebius brunnipennis,1,brunnipennis,Turbopsebius brunnipennis
2,Arthropoda,Insecta,Diptera,Acroceridae,Panopinae,Apsona,Apsona muscaria,1,muscaria,Apsona muscaria
3,Arthropoda,Insecta,Diptera,Acroceridae,Philopotinae,Megalybus,Megalybus pictus,1,pictus,Megalybus pictus
4,Arthropoda,Insecta,Diptera,Anthomyiidae,Anthomyiinae,Lasiomma,Lasiomma collini,1,collini,Lasiomma collini


In [3]:
taxa_to_search = unmatched_taxa[unmatched_taxa['Species'] != 'unidentified']
len(taxa_to_search)

388

## Use GBIF webservice to find authors for these new species

In [7]:
import requests
from unidecode import unidecode

In [8]:
def gbif_author_search(name):
    search_url = 'http://api.gbif.org/v1/species/match'
    params = {'kingdom':'Animalia',
              'name':name,
              'verbose':'true'}
    r = requests.get(search_url, params)
    search_results = r.json()
    confidence = search_results['confidence']
    if (int(confidence) > 90) and (search_results['matchType'] == 'EXACT'):
        gbif_id = search_results['usageKey']
        fetch_url = 'http://api.gbif.org/v1/species/{}'.format(gbif_id)
        r = requests.get(fetch_url)
        fetch_results = r.json()
        authorship = unidecode(fetch_results['authorship'])
        return authorship
    return np.nan

In [9]:
taxa_to_search['gbif_authorship'] = taxa_to_search['BOLD Name'].apply(gbif_author_search)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [10]:
taxa_to_search.head()

Unnamed: 0,Phylum,Class,Order,Family,Subfamily,Genus,Scientific Name,tax_count,Species,BOLD Name,gbif_authorship
0,Arthropoda,Insecta,Diptera,Acroceridae,Acrocerinae,Ogcodes,Ogcodes dispar,1,dispar,Ogcodes dispar,"(Macquart, 1855)"
1,Arthropoda,Insecta,Diptera,Acroceridae,Acrocerinae,Turbopsebius,Turbopsebius brunnipennis,1,brunnipennis,Turbopsebius brunnipennis,"(Sabrosky, 1948)"
2,Arthropoda,Insecta,Diptera,Acroceridae,Panopinae,Apsona,Apsona muscaria,1,muscaria,Apsona muscaria,"Westwood, 1876"
3,Arthropoda,Insecta,Diptera,Acroceridae,Philopotinae,Megalybus,Megalybus pictus,1,pictus,Megalybus pictus,"Philippi, 1865"
4,Arthropoda,Insecta,Diptera,Anthomyiidae,Anthomyiinae,Lasiomma,Lasiomma collini,1,collini,Lasiomma collini,"(Ringdahl, 1929)"


In [11]:
len(taxa_to_search[pd.isnull(taxa_to_search['gbif_authorship'])])

17

In [12]:
taxa_to_search[pd.isnull(taxa_to_search['gbif_authorship'])].head()

Unnamed: 0,Phylum,Class,Order,Family,Subfamily,Genus,Scientific Name,tax_count,Species,BOLD Name,gbif_authorship
76,Arthropoda,Insecta,Diptera,Asilidae,Ommatiinae,Emphysomera,Emphysomera nigra,2,nigra,Emphysomera nigra,
77,Arthropoda,Insecta,Diptera,Asilidae,Ommatiinae,Longibeccus,Longibeccus fuscovittatus,1,fuscovittatus,Longibeccus fuscovittatus,
92,Arthropoda,Insecta,Diptera,Calliphoridae,Ameniinae,Stilbomyella,Stilbomyella nigrocostalis,1,nigrocostalis,Stilbomyella nigrocostalis,
118,Arthropoda,Insecta,Diptera,Calliphoridae,Mesembrinellinae,Souzalopesiella,Souzalopesiella facialis,1,facialis,Souzalopesiella facialis,
127,Arthropoda,Insecta,Diptera,Muscidae,Coenosiinae,Syllimnophora,Syllimnophora atrovottata,1,atrovottata,Syllimnophora atrovottata,


## Export new author names

In [13]:
authors_found = taxa_to_search[pd.notnull(taxa_to_search['gbif_authorship'])]
len(authors_found)

371

In [14]:
authors_found.to_csv('processed_data/taxa_unmatched_authors_found.tsv',
                     index = False, sep='\t')

## Export species whose authors could not be found

In [15]:
authors_lost = taxa_to_search[pd.isnull(taxa_to_search['gbif_authorship'])]
len(authors_lost)

17

In [16]:
authors_lost.to_csv('processed_data/taxa_unmatched_authors_lost.tsv',
                     index = False, sep='\t')