In [2]:
import typer
import pandas as pd
import spacy
from pathlib import Path
from spacy.matcher import Matcher
from spacy.tokens import Span, Token
from spacy.util import filter_spans
from spacy.tokens import DocBin
from spacy import displacy
# from sklearn.model_selection import train_test_split
import re
import json

from adept.config import unit_registry
from adept.components.registry import ComponentsRegistry
from adept.utils.expand import ExpandSpan
import numpy as np
from adept.components.registry import ComponentsRegistry
from adept.components.sentencizer import Sentencizer
from adept.components.numeric import (NumericDimension, NumericExpand, NumericFraction, NumericMeasurement, NumericRange)
from adept.components.anatomical import AnatomicalEntity
from adept.components.traits import DiscreteTraitsEntity
from adept.components.traits import CustomTraitsEntity
from adept.preprocess import Preprocess
from adept.postprocess import Postproccess
from adept.traits import Traits
from adept.fields import Fields, Field
from adept.pipeline import Pipeline
from adept.config import LOG_DIR, logger, DATA_DIR, PROCESSED_DATA_DIR
from adept.utils.helpers import token_get_ent
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
nlp = spacy.load("en_core_web_trf")
registry = ComponentsRegistry(nlp)
registry.add_components([
    Sentencizer,
    NumericExpand,
    NumericDimension,
    NumericMeasurement,
    NumericRange,
    NumericFraction,    
])

INIT custom_sentencizer
INIT numeric_expand
INIT numeric_dimensions
INIT numeric_measurements
^cm$|^ft$|^m$|^meter$|^metre$|^km$|^kilometer$|^kilometre$|^centimeter$|^centimetre$|^mm$|^millimeter$|^millimetre$|^um$|^micrometer$|^micrometre$|^micron$|^nm$|^nanometer$|^nanometre$|^pm$|^inch$
INIT numeric_range
INIT numeric_fraction


In [39]:
df = pd.read_csv(PROCESSED_DATA_DIR / 'af4b0c64764555e29e5f5a7847683c2f.descriptions.csv')



In [40]:
preprocess = Preprocess()

In [41]:
class EntityParser():
    # ('Who is Shaka Khan?', {
    #     'entities': [(7, 17, 'PERSON')]
    # }),  
    
    # 3-flowered etc.,     
    re_num_cadj = re.compile(r'[0-9]+\-[a-z]+')
    
    ent_labels = ['CARDINAL', 'QUANTITY']
    
    def __call__(self, doc):
        self.sents = []
        self.doc = doc

        for sent in doc.sents:
            
            
            
            self.ents = []
            self.error = False
            
            if self.sent_is_2n(sent):
                continue
                
            # try:
            #     if sent[3].text != 'basal':
            #         continue
            # except:
            #     continue
                
#             print(sent)
#             print(sent._.measurements)

            if sent._.dimensions:
                self.add_entities(sent, sent._.dimensions, 'DIMENSION')
            if sent._.measurements:
                self.add_entities(sent, sent._.measurements, 'MEASUREMENT') 
            if sent._.volume_measurements:
                self.add_entities(sent, sent._.volume_measurements, 'VOLUME')
            
            numeric_ents = [ent for ent in sent.ents if ent.label_ in ['CARDINAL', 'QUANTITY'] and not (ent[0]._.is_measurement or ent[0]._.is_dimension)]

            # Extract 3-flowered
            for m in self.re_num_cadj.finditer(sent.text): 
                span = sent.char_span(m.start(), m.end())

                # No span if the start/end doesn't align with token boundaries
                # Skip it - sent will fail with an existing numeric ent                 
                if not span:
                    continue
                    
                for token in span:
                    if ent := token_get_ent(token, self.ent_labels):
                        if ent in numeric_ents:
                            # Ensure the start chars are the same - otherwise we will have
                            # 3- *7-veined*                             
                            if ent.start_char == m.start():
                                numeric_ents.remove(ent)
                        
                self.add_entity(sent, span, 'NUMERIC_CADJ')
            
            if numeric_ents:
                for ent in numeric_ents[:]:  
                    # Match valid numbers e.g. 5-7 (-9)                     
                    if re.match(r'^[<>0-9\(\)\-\.\s]+$', ent.text):
                        # We're just going to use CARDINAL, rather than CARDINAL and QUANTITY
                        # as there's only two QUANTITY                         
                        self.add_entity(sent, ent, 'CARDINAL')
                        numeric_ents.remove(ent)  
            
            # If we removed all the numeric entities with our own, add it                 
            if not (numeric_ents or self.error) and self.ents:  
                sent_doc = sent.as_doc()
                # Remove any overlapping - e.g. if we have measurement that is also flagged as a dimension                 
                sent_doc.ents = filter_spans(self.ents)
                self.sents.append(sent_doc)
                
        return self.sents
                        
                    
    def add_entities(self, sent, spans, label):
        for span in spans:
            self.add_entity(sent, span, label)
    
    def add_entity(self, sent, span, label): 
        
        start = span.start - sent.start
        end = span.end - sent.start
        
        ent = Span(sent.as_doc(), start, end, label=label)  

        if not self.validate_parentheses(ent.text):
            print('INVALID PARENTHESES: ', ent.text)
            self.error = True
            return
        
        self.ents.append(ent)
        
    @staticmethod
    def validate_parentheses(text):
        return text.count('(') == text.count(')')
    
    @staticmethod
    def sent_is_2n(sent):
        return '2n' in [s.text for s in sent]

        
parser = EntityParser()   

# sample = df.sample(200)

# sample = df[df.taxon == 'Rotala ramosior']

train_data = []

for row in df.itertuples():
    print(row.taxon)    
    text = preprocess(row.description)   

    
    # Optional - to debug
    # print(row.taxon)     
    # displacy.render(sents, style="ent")
    
    doc = nlp(text)
    sents = parser(doc)
    
    for sent in sents:
        entities = [(ent.start_char, ent.end_char, ent.label_) for ent in sent.ents]     
        train_data.append((sent.text, {'entities': entities}))
        

# print(len(train_data))
        

            


Eleocharis palustris
INVALID PARENTHESES:  (1-4 mm
Eleocharis palustris


Error parsing range some 3-fid in styles 2-fid, very rarely some 3-fid.


INVALID PARENTHESES:  2001)
INVALID PARENTHESES:  1968)
Eleocharis palustris
Eleocharis palustris
Leersia hexandra
Phalaris arundinacea
Phalaris arundinacea
Phalaris arundinacea
Leersia oryzoides
Leersia oryzoides
Leersia oryzoides
Alopecurus aequalis
Alopecurus aequalis
Alopecurus aequalis
Ranunculus sceleratus
Ranunculus sceleratus
Ranunculus sceleratus
Ranunculus sceleratus
Potamogeton perfoliatus
Potamogeton perfoliatus
Potamogeton perfoliatus
Potamogeton perfoliatus
Eleocharis acicularis
Eleocharis acicularis
INVALID PARENTHESES:  1975)
INVALID PARENTHESES:  3)
INVALID PARENTHESES:  539)
Bolboschoenus maritimus
Bolboschoenus maritimus
Bolboschoenus maritimus
Myriophyllum verticillatum
Myriophyllum verticillatum
INVALID PARENTHESES:  1998)
Myriophyllum verticillatum
Myriophyllum verticillatum
Arundo donax
Arundo donax
Epilobium hirsutum
Epilobium hirsutum
Epilobium hirsutum
Epilobium hirsutum


Error parsing range 6-l 0 in Sepals 6-l 0 x 2-2.8 mm, apiculate, densely pubescent.
Error parsing range 3-l 0 cm in Capsules 3-l 0 cm long, on pedicels 0.5-2 cm long, villous.


Persicaria lapathifolia
Persicaria lapathifolia
Persicaria lapathifolia
Epilobium parviflorum
Epilobium parviflorum
Epilobium parviflorum
Epilobium parviflorum
Catabrosa aquatica
Catabrosa aquatica
Catabrosa aquatica
INVALID PARENTHESES:  (up to 5 cm
INVALID PARENTHESES:  (1.5-1.7 mm
Potentilla supina
Hippuris vulgaris
Hippuris vulgaris
Hippuris vulgaris
Nuphar pumila
Nuphar pumila
Potamogeton friesii
Potamogeton friesii
Potamogeton friesii
Veronica peregrina
Veronica peregrina
Ischaemum rugosum


Error parsing range one-third in Pedicelled spikelet resembling the sessile and on a pedicel one-third the length of the internode, or much reduced and on a pedicel equalling the internode.


Ischaemum rugosum
Eleocharis acutangula
Najas gracillima
Najas gracillima
Sacciolepis indica
Poa annua
Poa annua
INVALID PARENTHESES:  2)
Poa annua
Epilobium palustre
Epilobium palustre
Epilobium palustre
Epilobium palustre
Beckmannia syzigachne


Error parsing range 5 in Fertile lemma 3-3.5 mm, lanceolate, apiculate, cartilaginous, keel 0, 5 -veined.


Beckmannia syzigachne
Drosera rotundifolia
Drosera rotundifolia
Drosera rotundifolia
Eriocaulon cinereum
Eriocaulon cinereum
Plantago major
Plantago major
Plantago major
Cyperus haspan
Cyperus haspan


Error parsing range ca. 0.3 as long as subtending glume, 3-sided in Nutlet yellowish, whitish, or reddish brown, broadly obovoid, 0.5-0.6(-0.7) mm, ca. 0.3 as long as subtending glume, 3-sided, tuberculate.


Cyperus haspan
Phyllanthus reticulatus
Phyllanthus reticulatus
Paspalum distichum
Paspalum distichum
Brachiaria reptans
Vigna luteola
Schoenoplectus tabernaemontani
Schoenoplectus tabernaemontani
INVALID PARENTHESES:  1995)
Schoenoplectus tabernaemontani
INVALID PARENTHESES:  2)
Persicaria maculosa
Persicaria maculosa
INVALID PARENTHESES:  1977)
INVALID PARENTHESES:  1998)
Persicaria maculosa
Vicia sepium
Vicia sativa
INVALID PARENTHESES:  4)
Acer caudatum
Acer caudatum
Alnus nepalensis
Amorphophallus hayi
Delonix regia
Delonix regia
INVALID PARENTHESES:  ) c. 5 cm
Betula pendula
Betula pendula
Betula pendula
Zostera marina
Zostera marina
Zostera marina
Juncus articulatus
Juncus articulatus
Juncus articulatus
Juncus articulatus
INVALID PARENTHESES:  1962)
Typha angustifolia
INVALID PARENTHESES:  (1-9(-12) cm
Typha angustifolia
Typha angustifolia
Leptochloa panicea
Leptochloa panicea
Ruppia maritima
Ruppia maritima
Ruppia maritima
Ruppia maritima
Nymphaea tetragona
Nymphaea tetragona
Ny

Error parsing range ca. 0.7 as long as subtending glume, 3-sided in Nutlet straw-colored at first but dark brown when mature, narrowly oblong, 1.8-2.2 x ca. 0.5 mm, ca. 0.7 as long as subtending glume, 3-sided, minutely punctate.


INVALID PARENTHESES:  1983)
INVALID PARENTHESES:  1989)
INVALID PARENTHESES:  6(4
INVALID PARENTHESES:  1998)
Cyperus cyperoides
Eleusine indica
Eleusine indica
Centipeda minima
Hedychium coronarium
Hedychium coronarium
Alpinia zerumbet
Alpinia zerumbet
Nerium oleander
Nerium oleander
Spergularia marina
Spergularia marina
Spergularia marina
Carex distans
Carex distans
Cyperus fuscus
Cyperus fuscus
Cyperus fuscus
Cyperus fuscus
Eleocharis uniglumis
Eleocharis uniglumis
INVALID PARENTHESES:  1966)
INVALID PARENTHESES:  1966)
Eleocharis uniglumis
Eleocharis uniglumis
Isolepis setacea
Isolepis setacea
Isolepis setacea
INVALID PARENTHESES:  2)
Isolepis setacea
Polypogon viridis
Polypogon viridis


Error parsing range four-fifths in palea four-fifths to as long as the lemma;


Utricularia minor
Persicaria amphibia
INVALID PARENTHESES:  ) 1-2 mm
Persicaria amphibia
INVALID PARENTHESES:  1968)
INVALID PARENTHESES:  1968)
Persicaria amphibia
Samolus valerandi
Samolus valerandi
Samolus valerandi
INVALID PARENTHESES:  ) 3.5-7 (-20.5) x 1-2 (-4) cm
Ranunculus trichophyllus
INVALID PARENTHESES:  (2-4 cm
Ranunculus trichophyllus
Apium graveolens
Apium graveolens
Apium graveolens
Berula erecta
Berula erecta
Berula erecta
Centaurium pulchellum
Centaurium pulchellum
Centaurium pulchellum
Polygonum argyrocoleon
Polygonum argyrocoleon
Echinochloa frumentacea
Echinochloa frumentacea
Bacopa monnieri
Albizia lebbeck
Albizia lebbeck
Allium schoenoprasum
Allium schoenoprasum
Allium schoenoprasum
Allium schoenoprasum
Avena fatua
Brassica elongata
Brassica elongata
Brassica nigra
Brassica nigra
Brassica nigra
Brassica nigra
Prunus cerasifera
Prunus cerasifera
Lathyrus latifolius
Lathyrus latifolius
Daucus carota
Potamogeton natans
INVALID PARENTHESES:  ) 15-30(-80) cm < 3 mm
Po

Error parsing range half in Inflorescence < 7(-10) flowers forming a compact head, usually placed half-way or lower down the apparent stem.


Juncus filiformis
Juncus filiformis
Juncus alpinoarticulatus
Juncus alpinoarticulatus
Scrophularia umbrosa
Scrophularia umbrosa
Sonchus palustris
Sonchus palustris
Sonchus palustris
INVALID PARENTHESES:  70-90)
Angelica sylvestris
Angelica sylvestris
Pedicularis palustris
Pedicularis palustris
Salix caprea
Eriophorum vaginatum
Eriophorum vaginatum
Eriophorum vaginatum
Carex microglochin
Carex microglochin
Carex microglochin
Salix cinerea
Salix cinerea
INVALID PARENTHESES:  1950)
INVALID PARENTHESES:  1986)
INVALID PARENTHESES:  1999)
INVALID PARENTHESES:  1988)
Salix cinerea
Veronica serpyllifolia
Veronica serpyllifolia
Pinguicula vulgaris
Utricularia intermedia
Utricularia intermedia
Hydrocotyle sibthorpioides
Hydrocotyle sibthorpioides
Acacia auriculiformis
Acacia auriculiformis
Bauhinia purpurea
Bauhinia purpurea
Bauhinia variegata
Bauhinia variegata
Cercis chinensis
Crotalaria micans
Dichrostachys cinerea
Erythrina variegata
Lespedeza cuneata
Lespedeza cuneata
Lespedeza thunbergii


Error parsing range 10 (-12) mm in pedicels up to 10 (-12) mm long in fruit, spreading.


Carex lasiocarpa
Carex lasiocarpa
Carex lasiocarpa
Carex limosa
Carex limosa
Carex limosa
Carex pseudocyperus
Carex pseudocyperus
Carex pseudocyperus
Carex pseudocyperus
INVALID PARENTHESES:  10-40)-140) mm
Carex vesicaria
Carex vesicaria
Carex vesicaria
Eriophorum angustifolium
Eriophorum angustifolium
Eriophorum angustifolium
Schoenoplectus triqueter
Schoenoplectus triqueter
Schoenoplectus triqueter
Schoenoplectus triqueter
Agrostis stolonifera
Agrostis stolonifera
Agrostis stolonifera


Error parsing range two-thirds in palea half to two-thirds the length of the lemma;


Glyceria notata
Glyceria notata
Juncus compressus
Juncus compressus
Juncus compressus
Juncus conglomeratus
INVALID PARENTHESES:  30-100) cm
Mentha spicata
Mentha spicata
Mentha spicata
Stachys palustris
INVALID PARENTHESES:  (<5 mm
Stachys palustris
Stachys palustris
Utricularia vulgaris
Menyanthes trifoliata
Menyanthes trifoliata
Menyanthes trifoliata
Nymphoides peltata
Nymphoides peltata
Parnassia palustris
Parnassia palustris
Parnassia palustris
Parnassia palustris


Error parsing range 0.6-l. 8 cm in Radical leaves broadly cordate, 1-2 cm long, 0.6-l. 8 cm broad, somewhat glaucous, cauline leaf situated below the middle of the scape.


Persicaria hydropiper
Persicaria hydropiper
Persicaria hydropiper
Lysimachia vulgaris
Lysimachia vulgaris
Lysimachia vulgaris
Lysimachia vulgaris


Error parsing range c. 3.8 mm long, ovate-lanceolate, margin glandular streaked, membranous and glandular-ciliolate. in lobes c. 3.8 mm long, ovate-lanceolate, margin glandular streaked, membranous and glandular-ciliolate.
Error parsing range c. 3.8 mm long, ovate-lanceolate, margin glandular streaked, membranous and glandular-ciliolate. in lobes c. 3.8 mm long, ovate-lanceolate, margin glandular streaked, membranous and glandular-ciliolate.


INVALID PARENTHESES:  (2-3
Filipendula ulmaria
Filipendula ulmaria


Error parsing range 100+ in Inflorescences 100+-flowered;


Filipendula ulmaria
Salix alba


Error parsing range 30deg - 50deg in A tree 10-25(-33) m, branches ascending at 30deg - 50deg, forming a narrow crown, appearling silver-grey in leaf, often pollarded.


Salix alba
INVALID PARENTHESES:  1993)
INVALID PARENTHESES:  1984)
INVALID PARENTHESES:  1994)
INVALID PARENTHESES:  1)
Salix alba
Salix alba
Sparganium angustifolium
Sparganium angustifolium
Sparganium angustifolium
Sparganium erectum
Sparganium natans
Sparganium natans
Sparganium natans
INVALID PARENTHESES:  3)
Cicuta virosa
Cicuta virosa
Cicuta virosa
Polypogon monspeliensis
Polypogon monspeliensis
Polypogon monspeliensis
Acorus calamus


Error parsing range c. 2 mm long, oblong-obovate, slightly curved, margin membranous, surface with embedded raphides. in Tepals 6, c. 2 mm long, oblong-obovate, slightly curved, margin membranous, surface with embedded raphides.


Acorus calamus
INVALID PARENTHESES:  ) 22.1-66.5(-73.3) cm
Acorus calamus
Acorus calamus
Juncus inflexus
Juncus inflexus
Juncus inflexus
Juncus inflexus
INVALID PARENTHESES:  40)
Najas minor
Najas minor
Najas minor
INVALID PARENTHESES:  1971)
Cypripedium guttatum


Error parsing range 2 (-3,very in Leaves 2 (-3,very rarely), on middle half of stem, alternate to subopposite, wide-spreading;


Cypripedium guttatum
Cypripedium calceolus
Cypripedium yatabeanum


Error parsing range 2 (-3 in Leaves 2 (-3, very rarely), on middle half of stem, alternate to subopposite, wide-spreading;


INVALID PARENTHESES:  2 (-3
Gloriosa superba
Platanthera tipuloides
Carex canescens
Carex canescens
Carex canescens
Carex vulpina
Carex vulpina
Phleum alpinum
Phleum alpinum
Phleum alpinum


Error parsing range two-thirds in lemma two-thirds the length of the glumes, 3-5-nerved, minutely hairy on the nerves;


Carex lachenalii
Carex lachenalii
Carex lachenalii
Brachiaria subquadripara
Setaria parviflora
Carex remota
Poa angustifolia
Poa angustifolia
Rubus ellipticus
Psidium guajava
Psidium guajava
Silene vulgaris
Silene vulgaris
Silene vulgaris
Silene vulgaris
Cydonia oblonga
Cydonia oblonga
Cydonia oblonga
Thespesia populnea
Thespesia populnea
Thespesia populnea
Eucalyptus grandis
Eucalyptus grandis
Salix viminalis
Elaeagnus angustifolia
Elaeagnus angustifolia
Hevea brasiliensis
Citrus medica
Fimbristylis littoralis
Ranunculus aquatilis
Ranunculus aquatilis
Fimbristylis dichotoma
Fimbristylis dichotoma
Fimbristylis dichotoma
Hydrilla verticillata
Hydrilla verticillata
Hydrilla verticillata
Hydrilla verticillata


Error parsing range c. 5 mm long, reddish-brown striped, bidentate. in Female spathe c. 5 mm long, reddish-brown striped, bidentate.


Cladium mariscus


Error parsing range (-2) 3 in Stigmas (-2) 3;


Cladium mariscus
Juncus effusus
Juncus effusus
Juncus effusus
Utricularia gibba
Cyperus alternifolius
Cyperus alternifolius


Error parsing range four-angled, c. 0.5 mm wide in rachis four-angled, c. 0.5 mm wide, internodes c. 0.5 mm, not markedly winged;


Ranunculus cymbalaria
Ranunculus cymbalaria
Agrostis clavata
Carex aperta
Carex aperta
Carex gynocrates
INVALID PARENTHESES:  (14
INVALID PARENTHESES:  (12
INVALID PARENTHESES:  (74
INVALID PARENTHESES:  1950)
INVALID PARENTHESES:  1988)
INVALID PARENTHESES:  1992)
Carex gynocrates
Carex laxa
Carex laxa
Carex loliacea
Carex loliacea
Carex michauxiana
Carex stipata
Carex stipata
Carex tenuiflora
Carex tenuiflora
Carex utriculata
Carex vaginata
Carex vaginata
Carex vaginata
Cyperus eragrostis
Cyperus eragrostis
Cyperus eragrostis
Epilobium latifolium
Eriophorum chamissonis
INVALID PARENTHESES:  1954)
Fimbristylis autumnalis
Fimbristylis autumnalis
Galium triflorum
Galium triflorum
Hedysarum alpinum
Hedysarum alpinum
Juncus marginatus
Juncus triglumis
Juncus triglumis
Juncus triglumis
Juncus triglumis
Koenigia islandica
Koenigia islandica
Koenigia islandica
Koenigia islandica


Error parsing range 2 (-3) in Styles 2 (-3), very short, minutely capitate.


Lomatogonium rotatum
Lysimachia maritima
Myriophyllum ussuriense
INVALID PARENTHESES:  1986)
Myriophyllum ussuriense
Pedicularis labradorica
Persicaria sagittata
Argentina anserina
Rosa acicularis
INVALID PARENTHESES:  (2 mm
INVALID PARENTHESES:  (3.5-4.5 mm
Rosa acicularis
Rotala ramosior
Rotala ramosior
Rubus chamaemorus
Rubus chamaemorus
INVALID PARENTHESES:  2006)
Rubus chamaemorus
Sagittaria graminea
Sparganium eurycarpum
Vaccinium oxycoccos
Vaccinium oxycoccos
Vaccinium oxycoccos
Brachiaria mutica
Commelina diffusa
Commelina diffusa
Ludwigia octovalvis
Ludwigia octovalvis
Phyla nodiflora
Phyla nodiflora


Error parsing range the persistent calyx, separating at maturity into two, 1-seeded in Fruit ovate, c. 1.6 mm long, subcompressed, enclosed by the persistent calyx, separating at maturity into two, 1-seeded pyrenes.


Potamogeton nodosus
Frangula alnus
Populus nigra
Ipomoea littoralis
Callitriche hermaphroditica
Callitriche hermaphroditica
Callitriche palustris
Callitriche palustris
Sorghum arundinaceum
Brasenia schreberi
Coleanthus subtilis
Epilobium angustifolium
Epilobium angustifolium
Lonicera caerulea
Lysimachia thyrsiflora
Lysimachia thyrsiflora
Lysimachia thyrsiflora
Myriophyllum alterniflorum
Myriophyllum alterniflorum
Myriophyllum alterniflorum


Error parsing range 4- or 5-whorled in Submerged leaves 4- or 5-whorled, pectinate, broadly lanceolate in outline, 1-4 x 0.5-1.2 cm;


Myriophyllum sibiricum
Myriophyllum sibiricum
Poa pratensis
Poa pratensis
Poa pratensis
Primula egaliksensis
INVALID PARENTHESES:  2006)
Prunella vulgaris
INVALID PARENTHESES:  6)
Prunella vulgaris
Prunella vulgaris


Error parsing range c. 5 mm broad, ciliate-fringed, purplish or green. in Bracts broadly ovate-cordate, c. 5 mm broad, ciliate-fringed, purplish or green.


Ranunculus reptans
Ranunculus reptans
Sanguisorba officinalis
Sanguisorba officinalis
Sanguisorba officinalis
Saxifraga hirculus
Saxifraga hirculus
Saxifraga hirculus
Scheuchzeria palustris
Scheuchzeria palustris
Scheuchzeria palustris
Sparganium glomeratum
Sparganium glomeratum
Sparganium hyperboreum
Sparganium hyperboreum
Urtica dioica
Urtica dioica
Urtica dioica
Urtica dioica
Vaccinium vitis-idaea
Vaccinium vitis-idaea
Vaccinium vitis-idaea
Cardiospermum halicacabum
Cardiospermum halicacabum
Aldrovanda vesiculosa
Ottelia alismoides
Ottelia alismoides
Ottelia alismoides


Error parsing range 5.10-winged in Spathe elliptic-ovate, 2.5-4.5 (-6) cm long, 2-lobed, lobes acute, glabrous, lengthwise 5.10-winged or occasionally only ribbed, wings somewhat flat or wavy;
Error parsing range 6 (-9) in Stamens 6 (-9), filaments c. 4 mm long, glandular-hairy;


Limnophila indica
Dopatrium junceum
Alisma plantago-aquatica
Alisma plantago-aquatica
Alisma plantago-aquatica
Alisma plantago-aquatica
Blyxa aubertii
Blyxa aubertii
Blyxa aubertii
Juncus bufonius
INVALID PARENTHESES:  (2-3
Juncus bufonius
Juncus bufonius
Juncus bufonius
Lemna trisulca
Lemna trisulca
INVALID PARENTHESES:  (1
INVALID PARENTHESES:  ) 3
Lemna trisulca
Lemna trisulca
Typha latifolia
Typha latifolia
Typha latifolia
Typha latifolia
Ulex europaeus
Zannichellia palustris


Error parsing range 50 cm ' 0.2--0.6 mm in Stems to 50 cm ' 0.2--0.6 mm.
Error parsing range 50 cm ' 0.2--0.6 mm in Stems to 50 cm ' 0.2--0.6 mm.


Zannichellia palustris
Zannichellia palustris
Zannichellia palustris
Zingiber zerumbet
Zingiber zerumbet
Stictocardia tiliifolia
Fuirena umbellata
Cyperus distans
Cyperus distans
Solanum viarum
Salix pentandra
Hippophae rhamnoides
Alternanthera sessilis
Alternanthera sessilis
Alternanthera sessilis
INVALID PARENTHESES:  5 (2
Lemna aequinoctialis
Lemna aequinoctialis
Lemna aequinoctialis
Lemna minor


Error parsing range 3 (-5) in Thallus 1-8 x 0.8-6 mm, opaque, obovate or suborbicular, entire, subapiculate at point of attachment to parent thallus, nearly flat on both sides, veins 3 (-5), root single to 15 cm.


Lemna minor
Lemna minor
Lemna minor
Pistia stratiotes
INVALID PARENTHESES:  4(3
INVALID PARENTHESES:  ) 1
INVALID PARENTHESES:  2)
Pistia stratiotes
Pistia stratiotes
Ceratophyllum demersum
Ceratophyllum demersum
INVALID PARENTHESES:  ) 3.5-6 x 2-4 x 1-2.5 mm
INVALID PARENTHESES:  ) 3.5-6 x 2-4 x 1-2.5 mm
Ceratophyllum demersum
INVALID PARENTHESES:  ) 3.5-6 x 2-4 mm
INVALID PARENTHESES:  ) 0.5-14 mm
Ceratophyllum demersum
Commelina benghalensis
Commelina benghalensis
Commelina benghalensis
Ipomoea aquatica
Ipomoea aquatica
Cyperus compressus
Cyperus compressus
Cyperus compressus
Cyperus rotundus


Error parsing range (as low as -18degC). in Cyperus esculentus is able to tolerate lower air temperatures (as low as -18degC).


INVALID PARENTHESES:  1987)
Cyperus rotundus
Cyperus rotundus
Cyperus squarrosus
Cyperus squarrosus
Cyperus squarrosus
INVALID PARENTHESES:  1974)
Cyperus polystachyos
Aeschynomene indica
Aeschynomene indica
Myriophyllum spicatum
Myriophyllum spicatum
INVALID PARENTHESES:  (3
INVALID PARENTHESES:  5)
INVALID PARENTHESES:  (1919
INVALID PARENTHESES:  1947)
Myriophyllum spicatum
INVALID PARENTHESES:  5)
Myriophyllum spicatum
Najas marina
Najas marina
Najas marina
Najas marina
Vallisneria spiralis
Spirodela polyrhiza
Spirodela polyrhiza
Potamogeton crispus
Potamogeton crispus
Potamogeton crispus
Potamogeton pusillus
Potamogeton pusillus
Potamogeton pusillus
Potamogeton pusillus
Stuckenia pectinata
Stuckenia pectinata
Trapa natans
Trapa natans
Trapa natans
Typha domingensis
Typha domingensis
Typha domingensis
Crotalaria lanceolata
Eclipta prostrata
Epilobium ciliatum
INVALID PARENTHESES:  (1.5-3 mm
Epilobium ciliatum
Epilobium ciliatum
Kyllinga brevifolia
Kyllinga brevifolia
Kyllinga brevi

Error parsing range two-thirds in lower glume lanceolate, 1.2-2 mm long, two-thirds to four-fifths as long as the spikelet (variable even in the same panicle), acute;


Sesuvium portulacastrum
Sesuvium portulacastrum
Canavalia rosea
Cardiospermum microcarpum
Triumfetta semitriloba
Triumfetta semitriloba
Eugenia uniflora
INVALID PARENTHESES:  (1
INVALID PARENTHESES:  ) 2-6-
Eugenia uniflora


In [42]:
# train, test = train_test_split(train_data,test_size=0.2)


# split_data = {
#     'train': train,
#     'test': test
# }

# for t, data in split_data.items():
    
#     print(f'Output {len(data)} {t} records')

    
#     print(file_path)

file_path = DATA_DIR / 'assets' / f'train_numeric.json'
with file_path.open('w') as f:
    f.write(json.dumps(train_data))    

In [8]:
len(train_data)

6843

In [79]:
sent, entities = sent_entities[0]

# ents = [Span(sent.as_doc(), start, end, label=string_id)]   

# [Span(sent.as_doc(), start, end, label=label) for start, end, label in entities['entities']]

# for t in sent:
#     print(t)
    
d = sent.as_doc()

# type(d.text[10])

print(entities['entities'])
print(sent)

# print(entities)

    
# Should we be using word for entities?????     

# displacy.render(sent, style="ent")

entities

for token in sent:
    print(dir(token))
    # print(token)
    # print(token.i)
    # print(token.idx)
    # print('-')
    
    # break



[(12, 20, 'MEASUREMENT')]
Pseudostems 20-40 cm.
['_', '__bytes__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', 'ancestors', 'check_flag', 'children', 'cluster', 'conjuncts', 'dep', 'dep_', 'doc', 'ent_id', 'ent_id_', 'ent_iob', 'ent_iob_', 'ent_kb_id', 'ent_kb_id_', 'ent_type', 'ent_type_', 'get_extension', 'has_dep', 'has_extension', 'has_head', 'has_morph', 'has_vector', 'head', 'i', 'idx', 'iob_strings', 'is_alpha', 'is_ancestor', 'is_ascii', 'is_bracket', 'is_currency', 'is_digit', 'is_left_punct', 'is_lower', 'is_oov', 'is_punct', 'is_quote', 'is_right_punct', 'is_sent_end', 'is_sent_start', 'is_space', 'is_stop', 'is_title', 'is_upper', 'lang', 'lang_', 'left_edge', 'lefts', 'lemma'

In [42]:
entities

[('Pseudostems 20-40 cm.', {'entities': [(12, 20, 'MEASUREMENT')]}),
 ('Leaves sessile except for 2 apical ones;',
  {'entities': [(26, 27, 'CARDINAL')]}),
 ('ligule brownish, 1-5 cm, sparsely appressed villous;',
  {'entities': [(17, 23, 'MEASUREMENT')]}),
 ('leaf blade obovate or oblong, 15-27 x 8-10 cm, adaxially glabrous, abaxially appressed yellow villous along midvein, base cuneate, apex acute.',
  {'entities': [(30, 45, 'DIMENSION')]}),
 ('Spikes 8-14 x 4-6 cm;', {'entities': [(7, 20, 'DIMENSION')]}),
 ('bracts rusty, ovate-lanceolate, 3-4.6 x 1.2-2 cm, membranous, appressed yellow villous, 3-flowered;',
  {'entities': [(32, 48, 'DIMENSION'), (88, 98, 'NUMERIC_CADJ')]}),
 ('bracteoles tubular, 2.7-3 cm, appressed yellow villous.',
  {'entities': [(20, 28, 'MEASUREMENT')]}),
 ('Calyx brownish, 3.3-3.5 cm, slightly split on 1 side, appressed yellow villous.',
  {'entities': [(16, 26, 'MEASUREMENT'), (46, 47, 'CARDINAL')]}),
 ('Corolla tube ca. 4 cm, glabrous;', {'entities': [(13, 

3-flowered
2-lobed
2-cleft


In [14]:


        

parser = EntityParser()
parser(doc)

print(parser.entities)
        
    

    # print(sent._.dimensions)

Rhizomes creeping.
[]
Pseudostems 20-40 cm.
[]
Leaves sessile except for 2 apical ones;
[]
ligule brownish, 1-5 cm, sparsely appressed villous;
[]
leaf blade obovate or oblong, 15-27 x 8-10 cm, adaxially glabrous, abaxially appressed yellow villous along midvein, base cuneate, apex acute.
[]
Spikes 8-14 x 4-6 cm;
[]
rachis appressed yellow villous;
[]
bracts rusty, ovate-lanceolate, 3-4.6 x 1.2-2 cm, membranous, appressed yellow villous, 3-flowered;
3-flowered
[]
bracteoles tubular, 2.7-3 cm, appressed yellow villous.
[]
Flowers white, slightly fragrant.
[]
Calyx brownish, 3.3-3.5 cm, slightly split on 1 side, appressed yellow villous.
[]
Corolla tube ca. 4 cm, glabrous;
[]
lobes linear, 2.5-3 cm x 3-4 mm.
[]
Lateral staminodes oblanceolate, ca. 2.3 cm x 7 mm.
[]
Labellum broadly ovate, ca. 2.2 x 1.9 cm, apically slightly 2-lobed or 2-cleft for ca. 0.5 its length.
2-lobed
2-cleft
[ca. 0.5 its length]
Filament white, equaling labellum;
[]
anther ca. 9 mm.
[]
Ovary ca. 5 mm, appressed ye

In [54]:
import re

In [18]:
df = pd.read_csv('/Users/ben/Projects/NaturalHistoryMuseum/Traits/traits/traits/corpus/preprocessed-descriptions.csv')

In [18]:
round(0.3,2)

0.3

In [17]:
re_num_cadj = re.compile(r'[0-9]+\-[a-z]+')

text = 'Capsules globose, 2-3 mm in diameter. , (3 or)4-valved, slightly exserted from floral tube.'

for m in re_num_cadj.finditer(text):
    print(m)
    

<re.Match object; span=(46, 54), match='4-valved'>


In [18]:
text[46:54]

'4-valved'

In [17]:
s = "{'entities': [[44, 50, 'MEASUREMENT']]}"

In [18]:
m = re.findall(r"[A-Z]{2,}", s)
label = m[0] if len(m) == 1 else 'MULTI'
label

'MEASUREMENT'

In [45]:
p = DATA_DIR / 'assets' / 'train_numeric.json'

df = pd.read_json(p)
df.columns = ['sent', 'entities']
re_upper_case = re.compile(r"[A-Z]{2,}")

def _get_label(entities):
    labels = {e[2] for e in entities['entities']}
    return next(iter(labels)) if len(labels) == 1 else 'MULTI'

df['label'] = df['entities'].apply(_get_label)

In [46]:
df['label'].value_counts()

MEASUREMENT     3686
CARDINAL        1164
MULTI            978
DIMENSION        831
NUMERIC_CADJ     184
Name: label, dtype: int64

In [52]:
train, test = train_test_split(df, test_size=0.3, random_state=0, stratify=df[['label']])

In [55]:
test, val = train_test_split(test, test_size=0.5, random_state=0, stratify=test[['label']])

In [58]:
df['label'].value_counts()

MEASUREMENT     3686
CARDINAL        1164
MULTI            978
DIMENSION        831
NUMERIC_CADJ     184
Name: label, dtype: int64

In [61]:
for sent, annot in df[['sent', 'entities']].values:
    print(annot)

{'entities': [[55, 68, 'MEASUREMENT']]}
{'entities': [[25, 30, 'MEASUREMENT']]}
{'entities': [[64, 76, 'MEASUREMENT']]}
{'entities': [[9, 12, 'CARDINAL'], [54, 55, 'CARDINAL']]}
{'entities': [[8, 9, 'CARDINAL']]}
{'entities': [[8, 18, 'MEASUREMENT']]}
{'entities': [[8, 9, 'CARDINAL']]}
{'entities': [[31, 42, 'MEASUREMENT']]}
{'entities': [[31, 41, 'MEASUREMENT']]}
{'entities': [[4, 19, 'MEASUREMENT']]}
{'entities': [[24, 34, 'MEASUREMENT'], [103, 111, 'MEASUREMENT'], [140, 147, 'MEASUREMENT']]}
{'entities': [[48, 52, 'CARDINAL'], [76, 96, 'DIMENSION']]}
{'entities': [[31, 44, 'DIMENSION']]}
{'entities': [[24, 27, 'CARDINAL'], [41, 44, 'CARDINAL']]}
{'entities': [[19, 22, 'CARDINAL']]}
{'entities': [[40, 46, 'CARDINAL'], [48, 58, 'MEASUREMENT'], [144, 160, 'DIMENSION']]}
{'entities': [[27, 32, 'CARDINAL'], [156, 157, 'CARDINAL']]}
{'entities': [[8, 9, 'CARDINAL']]}
{'entities': [[36, 46, 'MEASUREMENT']]}
{'entities': [[68, 74, 'MEASUREMENT']]}
{'entities': [[14, 20, 'MEASUREMENT']]}
{'e