In [1]:
from taxonerd import TaxoNERD

In [2]:
taxonerd = TaxoNERD(prefer_gpu=False)

In [3]:
#!pip install https://github.com/nleguillarme/taxonerd/releases/download/v1.5.4/en_ner_eco_biobert-1.1.0.tar.gz
#!pip install https://github.com/nleguillarme/taxonerd/releases/download/v1.5.4/en_ner_eco_md-1.1.0.tar.gz
#!pip install https://github.com/nleguillarme/taxonerd/releases/download/v1.5.4/en_ner_eco_md_weak-1.1.0.tar.gz
#!pip install https://github.com/nleguillarme/taxonerd/releases/download/v1.5.4/en_ner_eco_biobert_weak-1.1.0.tar.gz

In [4]:
nlp = taxonerd.load("en_ner_eco_biobert")
nlp.pipe_names

['transformer',
 'tagger',
 'attribute_ruler',
 'lemmatizer',
 'pysbd_sentencizer',
 'parser',
 'ner',
 'taxo_abbrev_detector']

In [5]:
# In this case we have one scientific name
text = """This dataset contains dental microwear surfaces specimens of Gazellospira torticornis from Dafnero, Greece. The material is housed at the Laboratory of Geology and Palaeontology, School of Geology, Aristotle University of Thessaloniki, 54124 Thessaloniki, Greece. More details in the read.me file."""
# No scientific name
test2 = """ A gridded sea surface salinity data set for the Pacific Ocean Lien vers les données
The gridded Sea Surface Salinity (SSS) data set covers the region between 120°E – 70°W and 30°N – 30°S in the Pacific Ocean. It is based on available data collected from 1950 to 2009 mostly from Voluntary Observing Ships, TAO/TRITON moorings and Argo profilers, with complementary hydrocasts, STD, and CTD data collected during research cruises, and subsequently validated. This monthly SSS product is gridded using an objective mapping at the spatial resolution 1° x 1°. It is distributed with its associated error fields. It is an update of the SSS product presented in Delcroix et al (2011)."""
test3 = """Composition de la richesse spécifique de la macrofaune regroupée par groupes trophiques observée par vidéo rotative STAVIRO par station en Nouvelle-Calédonie Composition de la richesse spécifique totale de chaque station observée par vidéo rotative STAVIRO par groupes trophiques (nombre d’espèces par station observé dans un rayon de 10m autour du système STAVIRO)."""
# More than one scientific name
test4="""Dental Microwear textures of extant ruminants from the Bauges Regional Natural Park, France Lien vers les données
This dataset contains dental microwear surfaces disto-buccal facets of the lower second molars of extant wild-sampled specimens of roe deer (Capreolus capreolus), red deer (Cervus elaphus), mouflon (Ovis gmelini) and chamois (Rupicapra rupicapra) from the Bauges Regional Natural Park, France. Material is stored at the Palevoprim Lab, University of Poitiers, France. Details are given in Merceron et al. (2021)."""
taxonerd.find_in_text(text)

Unnamed: 0,offsets,text,sent
T0,LIVB 61 85,Gazellospira torticornis,0


In [6]:
taxonerd.find_in_text(test2)

In [7]:
taxonerd.find_in_text(test3)

In [8]:
taxonerd.find_in_text(test4)

Unnamed: 0,offsets,text,sent
T0,LIVB 36 45,ruminants,0
T1,LIVB 245 253,roe deer,1
T2,LIVB 255 274,Capreolus capreolus,1
T3,LIVB 277 285,red deer,1
T4,LIVB 287 301,Cervus elaphus,1
T5,LIVB 304 311,mouflon,1
T6,LIVB 313 325,Ovis gmelini,1
T7,LIVB 331 338,chamois,1
T8,LIVB 340 359,Rupicapra rupicapra,1


In [9]:
x=taxonerd.find_in_text(test4)

In [10]:
x['text']

T0              ruminants
T1               roe deer
T2    Capreolus capreolus
T3               red deer
T4         Cervus elaphus
T5                mouflon
T6           Ovis gmelini
T7                chamois
T8    Rupicapra rupicapra
Name: text, dtype: object

In [11]:
import pandas as pd
import re
from collections import OrderedDict
df=pd.read_csv('Data.csv',delimiter=',',on_bad_lines="skip")

In [12]:
df.head(1)

Unnamed: 0,Data (link pndb),Title,Description,Species,Gbif link,Where to find the dataset
0,https://pndb.opendatasoft.com/explore/assets/a...,Abondance d'huître plate (Ostrea edulis) obser...,Lien vers les donnéesL'abondance d'huître plat...,Ostrea edulis\r\n,https://www.gbif.org/species/2286060,https://sextant.ifremer.fr/geonetwork/srv/fre/...


This function concatenate the title and the description, after we apply a taxonerd algortithms for find scientific names

In [13]:
pattern = r'^[A-Z][a-z]+ [a-z]+$'   # Taxon name

def extract_entities(row):
    text = f"{row['Title']} {row['Description']}"
    
    doc = taxonerd.find_in_text(text)  
    
    if doc is None or len(doc) == 0:
        return "couldn't find"

    taxons = [
        t for t in doc['text']
        if isinstance(t, str) and re.match(pattern, t)
    ] 
    # Supprimer les doublons 
    unique_taxons = list(OrderedDict.fromkeys(taxons))
    
    if len(unique_taxons) == 0:
        return "couldn't find"
    return taxons

In [14]:
df["Taxon"] = df.apply(extract_entities, axis=1)

In [15]:
df.head(1)

Unnamed: 0,Data (link pndb),Title,Description,Species,Gbif link,Where to find the dataset,Taxon
0,https://pndb.opendatasoft.com/explore/assets/a...,Abondance d'huître plate (Ostrea edulis) obser...,Lien vers les donnéesL'abondance d'huître plat...,Ostrea edulis\r\n,https://www.gbif.org/species/2286060,https://sextant.ifremer.fr/geonetwork/srv/fre/...,"[Ostrea edulis, Ostrea edulis]"


In [16]:
df.head()

Unnamed: 0,Data (link pndb),Title,Description,Species,Gbif link,Where to find the dataset,Taxon
0,https://pndb.opendatasoft.com/explore/assets/a...,Abondance d'huître plate (Ostrea edulis) obser...,Lien vers les donnéesL'abondance d'huître plat...,Ostrea edulis\r\n,https://www.gbif.org/species/2286060,https://sextant.ifremer.fr/geonetwork/srv/fre/...,"[Ostrea edulis, Ostrea edulis]"
1,https://pndb.opendatasoft.com/explore/assets/s...,Suivi de l'ichtyofaune de la Réserve Naturelle...,Lien vers les donnéesLes données concernant le...,"Epinephelus marginatus, Sciaena umbra, Diplodu...","https://www.gbif.org/species/2388507, https://...",https://cat.indores.fr/geonetwork/srv/fre/cata...,"[Epinephelus marginatus, Sciaena umbra, Diplod..."
2,https://pndb.opendatasoft.com/explore/assets/v...,Variabilité des traits et de l'environnement s...,"Trait and environmental (geomorphological, top...",Lyallia kerguelensis,https://www.gbif.org/species/3701799,https://cat.indores.fr/geonetwork/srv/fre/cata...,"[Lyallia kerguelensis, Lyallia kerguelensis, L..."
3,https://pndb.opendatasoft.com/explore/assets/a...,Abondance de petit tacaud (Trisopterus minutus...,Lien vers les donnéesL'abondance de petit taca...,Trisopterus minutus\r\n,https://www.gbif.org/species/2415906,https://sextant.ifremer.fr/geonetwork/srv/fre/...,"[Trisopterus minutus, Trisopterus minutus]"
4,https://pndb.opendatasoft.com/explore/assets/p...,Peuplements benthiques subtidaux de la baie de...,Lien vers les données\r\nJeu de données vector...,"Ophiocomina nigra, Ophiothrix fragilis","https://www.gbif.org/species/2275361, https://...",https://sextant.ifremer.fr/Donnees/Catalogue#/...,couldn't find


In [17]:
df.to_csv("result.csv", index=False)