In [7]:
from pathlib import Path
import pandas as pd
from taxon_parser import TaxonParser, UnparsableNameException

In [3]:
# Directory that holds the CSV files read and written by this notebook.
DATA_DIR = Path('./data')

# Load the taxon table from the data directory.
taxa_df = pd.read_csv(DATA_DIR.joinpath('solanaceae.csv'))

In [4]:
# Working sample: the first 100 rows of the taxon table, selected by position.
df = taxa_df.iloc[:100]

In [143]:
# Sample taxon name passed to TaxonParser in the following cell.
s = 'Witheringia solanacea'

In [144]:
# Parse the sample name and print the parsed result, or report why parsing failed.
parser = TaxonParser(s)
try:
    parsed_name = parser.parse()
    print(parsed_name)
except UnparsableNameException as e:
    # Format the exception into the message: concatenating a str with the
    # exception object raises TypeError and would mask the real parse error.
    print(f"this name does not seem to be a valid taxon name: \n{e}")

[NameType.SCIENTIFIC] G:Witheringia S:solanacea R:Rank.SPECIES A:<unknown> BA:<unknown>


In [145]:
def parse_name(row):
    """Split a row's taxon name into genus, specific and infraspecific epithet.

    Args:
        row: A mapping-like row (e.g. a DataFrame row) with a ``'name'`` entry
            holding the taxon name to parse.

    Returns:
        A 3-tuple ``(genus, specificEpithet, infraspecificEpithet)`` read from
        the parsed name, or ``(None, None, None)`` when the name cannot be
        parsed.
    """
    name = row['name']
    parser = TaxonParser(name)
    try:
        parsed_name = parser.parse()
    except UnparsableNameException:
        # Say which name failed so the offending rows can be found afterwards.
        print(f"this name does not seem to be a valid taxon name: {name!r}")
        # Always hand back three values so every row expands to the same
        # number of columns, rather than an implicit bare None.
        return None, None, None
    return parsed_name.genus, parsed_name.specificEpithet, parsed_name.infraspecificEpithet

# Run parse_name over every row and store its three results as new columns.
taxa_df[['genus', 'specificEpithet', 'infraspecificEpithet']] = taxa_df.apply(
    parse_name,
    axis=1,
    result_type="expand",
)




this name does not seem to be a valid taxon name
this name does not seem to be a valid taxon name
this name does not seem to be a valid taxon name
this name does not seem to be a valid taxon name
this name does not seem to be a valid taxon name
this name does not seem to be a valid taxon name
this name does not seem to be a valid taxon name
this name does not seem to be a valid taxon name
this name does not seem to be a valid taxon name
this name does not seem to be a valid taxon name


In [37]:
# Write the taxon table, including the parsed name columns, without the index.
taxa_df.to_csv(DATA_DIR.joinpath('taxa.csv'), index=False)

In [104]:
# Name to look up in the taxon table. An alternative input is kept below for
# manual experiments.
# taxon = 'Mangifera indica Linn.'
taxon = 'Capsicum annuum Alef. var. longum'

In [105]:
# Parse the lookup name; an unparsable name is allowed to raise here.
parser = TaxonParser(taxon)
parsed_name = parser.parse()

# Show the three name parts used for matching in the cells below.
print(
    parsed_name.genus,
    parsed_name.specificEpithet,
    parsed_name.infraspecificEpithet,
)

Capsicum annuum longum


In [106]:
# Start from an all-True mask aligned with taxa_df's index. Without it, `mask`
# is undefined (or silently reused from an earlier cell run) whenever the
# parsed name lacks a genus or a specific epithet.
mask = pd.Series(True, index=taxa_df.index)

# Narrow to rows with the same genus and specific epithet.
if parsed_name.genus and parsed_name.specificEpithet:
    mask &= (taxa_df.genus == parsed_name.genus) & (taxa_df.specificEpithet == parsed_name.specificEpithet)

# Narrow further when the parsed name carries an infraspecific epithet.
if parsed_name.infraspecificEpithet:
    mask &= (taxa_df.infraspecificEpithet == parsed_name.infraspecificEpithet)

result = taxa_df[mask]

In [112]:
# Distinct synonymOf values among the rows matched above.
result.synonymOf.unique()

array([2932944.])

In [113]:
# Looser lookup: same genus, specific epithet sharing its first three letters,
# and an authorship string containing the parsed authorship.
if parsed_name.genus and parsed_name.specificEpithet and parsed_name.hasAuthorship():
    taxon_authorship = parsed_name.authorshipComplete()
    specificEpithetPrefix = parsed_name.specificEpithet[:3]
    mask = (
        taxa_df.genus == parsed_name.genus
    ) & (
        # na=False: rows without a specific epithet count as non-matching
        # instead of putting missing values into the boolean mask.
        taxa_df.specificEpithet.str.startswith(specificEpithetPrefix, na=False)
    ) & (
        # regex=False: authorship strings can contain characters such as
        # '(' , ')' and '.', which must be matched literally, not as a pattern.
        taxa_df.authorship.str.contains(taxon_authorship, case=False, na=False, regex=False)
    )
    if parsed_name.infraspecificEpithet:
        mask &= (taxa_df.infraspecificEpithet == parsed_name.infraspecificEpithet)

    result = taxa_df[mask]
    print(result)
    
    
    

In [82]:
# Debug check: when the parsed name has an authorship, print a marker ('h')
# followed by the complete authorship string.
if parsed_name.hasAuthorship():
    print('h')
    taxon_authorship = parsed_name.authorshipComplete()
    print(taxon_authorship)

In [76]:
# Print the frame when nothing matched; this shows the empty result and its columns.
if result.empty:
    print(result)

# Default to the unfiltered matches so `with_author` is always defined, even
# when there are not several candidates to disambiguate.
with_author = result

if len(result) > 1:
    # Several candidates: keep those whose authorship mentions 'Alef'.
    # na=False treats rows without an authorship as non-matching, keeping
    # missing values out of the boolean mask used for indexing.
    with_author = result[result.authorship.str.contains('Alef', case=False, na=False)]

Empty DataFrame
Columns: [taxonID, parent, synonymOf, name, authorship, taxonRank, status, genus, specificEpithet, infraspecificEpithet]
Index: []


In [67]:
# Display the candidate rows whose authorship contains 'Alef'.
with_author

Unnamed: 0,taxonID,parent,synonymOf,name,authorship,taxonRank,status,genus,specificEpithet,infraspecificEpithet
8516,12079039,,2932944.0,Capsicum annuum longum,(DC.) Alef.,variety,synonym,Capsicum,annuum,longum


In [122]:
def filter_taxon(genus, specific_epithet, infraspecific_epithet, authorship, prefix_specific=False, taxa=None):
    """Select the rows of a taxon table that match the given name parts.

    Args:
        genus: Genus to match exactly.
        specific_epithet: Specific epithet; matched exactly, or by its first
            three characters when ``prefix_specific`` is true.
        infraspecific_epithet: Infraspecific epithet to match exactly; a falsy
            value skips this filter.
        authorship: Text searched for (literally, ignoring case) in the
            ``authorship`` column to choose among several matches; a falsy
            value skips this step.
        prefix_specific: When true, match the specific epithet by prefix
            instead of by equality.
        taxa: Table to search, with ``genus``, ``specificEpithet``,
            ``infraspecificEpithet`` and ``authorship`` columns. Defaults to
            the notebook-level ``taxa_df``.

    Returns:
        A DataFrame with the matching rows (possibly empty). When several rows
        match and at least one of them has a matching authorship, only those
        rows are returned.
    """
    if taxa is None:
        taxa = taxa_df

    mask = (taxa['genus'] == genus)
    if prefix_specific:
        # na=False: rows without a specific epithet never match the prefix.
        mask &= taxa['specificEpithet'].str.startswith(specific_epithet[:3], na=False)
    else:
        mask &= (taxa['specificEpithet'] == specific_epithet)

    if infraspecific_epithet:
        mask &= (taxa['infraspecificEpithet'] == infraspecific_epithet)

    # The former extra filter `infraspecificEpithet == 'FIXME'` for exact
    # searches was a leftover placeholder that made every exact search return
    # nothing; it has been removed.
    result = taxa[mask]

    if len(result) > 1 and authorship:
        # Literal, NaN-safe search: authorship strings may contain '(' , ')'
        # or '.', and some rows may have no authorship at all.
        author_hits = result['authorship'].str.contains(authorship, case=False, na=False, regex=False)
        with_author = result[author_hits]
        # Only narrow down when the authorship actually singles out some rows.
        if not with_author.empty:
            result = with_author

    return result
    

# Exact-epithet lookup of the parsed name with 'Alef' as the author hint.
# Note that this rebinds `taxon` from the name string to the matching rows.
taxon = filter_taxon(
    parsed_name.genus,
    parsed_name.specificEpithet,
    parsed_name.infraspecificEpithet,
    'Alef',
)

In [126]:
# Nothing found by exact epithet: retry, matching the specific epithet by prefix.
if taxon.empty:
    taxon = filter_taxon(
        parsed_name.genus,
        parsed_name.specificEpithet,
        parsed_name.infraspecificEpithet,
        'Alef',
        True,
    )

In [127]:
# Display the rows selected by the taxon lookup.
taxon

Unnamed: 0,taxonID,parent,synonymOf,name,authorship,taxonRank,status,genus,specificEpithet,infraspecificEpithet
8516,12079039,,2932944.0,Capsicum annuum longum,(DC.) Alef.,variety,synonym,Capsicum,annuum,longum


In [128]:
# Load the institutions table from the data directory.
institutions_df = pd.read_csv(DATA_DIR.joinpath('institutions.csv'))

In [129]:
# Display the institutions table (columns: uuid, code, name, country, altCodes).
institutions_df

Unnamed: 0,uuid,code,name,country,altCodes
0,53a694c3-1c30-4aaf-a20f-593d6a791d89,CDZMTU,"Museum of Zoology, Central Department of Zoology",NP,[]
1,3448c872-18f9-4d70-8237-148cd1d87b3f,NHMTU,"Natural History Museum, Tribhuvan University",NP,[]
2,2bf398f7-44e2-4a02-8134-4f96b8c2ca95,IITABJ,The International Institute of Tropical Agricu...,BJ,[]
3,23e0159b-30ed-4968-803c-190d60e11f20,MCCNNUTA,Museo de Universidad Técnica de Ambato,EC,['MCNUTA']
4,57a6e7b4-9fde-4efb-be3b-b800a6dd72f5,QPLS,Herbario Padre Luis Sodiro (QPLS),EC,['BEAEP']
...,...,...,...,...,...
8130,fdc05902-c9e6-4119-b3a7-0624ab82b2c2,URZF,Institut National de la Recherche Agronomique ...,FR,[]
8131,fdee1b94-e933-4a6e-9a85-05cc39a085a6,BPBM,Bernice Pauahi Bishop Museum,US,[]
8132,fefe4a0a-ddf3-499d-b508-e077443dc979,AMIB,Arthropods of Medical Importance Resource Bank,KP,[]
8133,ff9e52d6-7dc2-42d9-a5d4-3352ff31461f,KNWR,"DOI/FWS, Kenai National Wildlife Refuge",US,[]


In [130]:
# Example input record; every value is a string.
data = dict(
    collectorname='Steven R. Hill',
    taxon='Capsicum annuum L. var. longum',
    country_location='South Carolina',
    ISO='US',
    institutionname='Harvard University',
    institutioncode='HAR',
    year='1989',
)

# Unpack the fields used by the lookups below into individual variables.
collector_name, taxon_name = data['collectorname'], data['taxon']
institution_code, institution_name = data['institutioncode'], data['institutionname']
country_iso, country_name = data['ISO'], data['country_location']

In [142]:
def get_institution(institution_code, institution_name, institutions=None):
    """Find an institution by exact code, falling back to a name search.

    Args:
        institution_code: Code compared for equality with the ``code`` column;
            a falsy value skips the code lookup.
        institution_name: Text searched for (literally, ignoring case) in the
            ``name`` column when the code lookup finds nothing; a falsy value
            skips the name lookup.
        institutions: Table to search, with ``code`` and ``name`` columns.
            Defaults to the notebook-level ``institutions_df``.

    Returns:
        The first matching row as a dict, or ``None`` when neither lookup
        finds a row.
    """
    if institutions is None:
        institutions = institutions_df

    if institution_code:
        by_code = institutions[institutions['code'] == institution_code]
        if not by_code.empty:
            return by_code.iloc[0].to_dict()

    if institution_name:
        # Literal, NaN-safe search: names may contain characters such as
        # '(' or ')', and rows without a name must simply not match.
        name_hits = institutions['name'].str.contains(
            institution_name, case=False, na=False, regex=False
        )
        by_name = institutions[name_hits]
        if not by_name.empty:
            # Several institutions can share the text; the first row wins.
            return by_name.iloc[0].to_dict()

    return None

# Look up by the code 'AAH'; the institution name is the fallback search text.
# (The call previously used the misspelled name `get_insitution`, which is not
# defined in this notebook.)
get_institution('AAH', institution_name)

{'uuid': 'c4dce18b-c0eb-4e76-9d46-93582222dc42',
 'code': 'AAH',
 'name': 'Arnold Arboretum, Harvard University',
 'country': 'US',
 'altCodes': '[]'}

In [135]:
# Case-insensitive search for 'Harvard' in the institution names.
# `case` must be passed as a keyword argument: the bare name `case` is not
# defined and raised NameError. na=False keeps rows without a name out of the
# boolean mask.
# NOTE(review): the filtered frame is neither stored nor the cell's final
# expression, so nothing is shown — confirm whether it should be displayed.
if institution_name:
    institutions_df[institutions_df.name.str.contains('Harvard', case=False, na=False)]

In [134]:
# Display the institution name taken from the sample record.
institution_name

'Harvard University'

In [138]:
# Display every institution whose name contains 'harvard', ignoring case.
institutions_df[institutions_df.name.str.contains('harvard', case=False)]

Unnamed: 0,uuid,code,name,country,altCodes
496,219261fd-d235-4d52-acd6-ddc79dc7e870,Harvard University,"Harvard University, The Gray Herbarium",US,[]
520,2f97e96c-6555-4f10-b26f-6a5cd9bd6cba,Harvard University,"Harvard University, Arnold Arboretum of Harvar...",US,[]
524,4db8f93a-eb0e-42b1-b84a-02c052aa0486,Harvard University,"Harvard University, Economic Herbarium of Oake...",US,[]
525,bbc8f84a-40ce-4aff-9dde-c99f45616257,Harvard University,"Harvard University, The Farlow Herbarium",US,[]
550,b9cab401-2515-4017-b086-66ccaadf9370,Harvard University,"Harvard University, Oakes Ames Orchid Herbarium",US,[]
7833,1cfca87c-083e-4d67-8e60-cb7d311f6058,MCZ,"Harvard University, Museum of Comparative Zoology",US,[]
7957,c4dce18b-c0eb-4e76-9d46-93582222dc42,AAH,"Arnold Arboretum, Harvard University",US,[]
8026,1e0a49e1-2be3-4028-911c-47cf84c9aa66,A,Harvard University,US,[]
8056,648a5b5f-0421-44fd-8dd3-82add535bc54,GH,Harvard University,US,[]
8069,6ed3d3c8-e018-427e-bb25-8c5f55d8e480,AMES,Harvard University,US,[]


In [137]:
# Display the institution code taken from the sample record.
institution_code

'HAR'