In [1]:
import spacy

import numpy as np
from adept.components.registry import ComponentsRegistry
from adept.components.sentencizer import Sentencizer
from adept.components.numeric import (NumericDimension, NumericExpand, NumericFraction, NumericMeasurement, NumericRange)
from adept.components.anatomical import AnatomicalEntity
from adept.components.traits import DiscreteTraitsEntity
from adept.components.traits import CustomTraitsEntity
from adept.preprocess import Preprocess
from adept.postprocess import Postproccess
from adept.traits import Traits
from adept.fields import Fields, Field
from adept.pipeline import Pipeline

from adept.config import RAW_DATA_DIR

from abc import ABC, abstractmethod
from collections import OrderedDict
from pathlib import Path
from spacy.tokens import Span
import re
import yaml

from adept.config import unit_registry, logger
from adept.utils.helpers import flatten_dict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# spaCy model that every component below is registered against.
nlp = spacy.load("en_core_web_trf")

# Hand the adept components to the registry in one call. The order of the list is
# exactly the original one.
# NOTE(review): presumably the list order decides the pipeline order - confirm in
# ComponentsRegistry.add_components before reordering anything.
registry = ComponentsRegistry(nlp)
registry.add_components(
    [
        # adept.components.sentencizer
        Sentencizer,
        # adept.components.anatomical / adept.components.traits
        AnatomicalEntity, DiscreteTraitsEntity, CustomTraitsEntity,
        # adept.components.numeric
        NumericExpand, NumericDimension, NumericMeasurement, NumericRange, NumericFraction,
    ]
)

INIT custom_sentencizer
INIT custom_traits_entity
INIT numeric_expand
INIT numeric_dimensions
INIT numeric_measurements
INIT numeric_range
INIT numeric_fraction


In [3]:
preprocess = Preprocess()

In [18]:
# text = 'Pseudostems to 3 m. Ligule ovate, entire, ca. 1 cm, margin minutely white hairy. Leaf blade narrowly lanceolate, 30--40 × 4--5 cm, glabrous, margin denticulate near acuminate apex. Panicles 10--20 cm, densely flowered; branches 2 or 3, 0.9--7 cm; bracts minute, ca. 7 mm; bracteoles absent. Pedicel 2--3 mm, pubescent. Calyx tubular, ca. 8 mm, obtusely ribbed, glabrous, apex 3-lobed. Corolla tube ca. 1 cm; central lobe oblong, ca. 7 × 3.5 mm, apex hooked, minutely mucronate; lateral lobes obovate-oblong, ca. 5 mm. Lateral staminodes 2, at base of labellum. Labellum nearly fan-shaped, ca. 1 cm; claw short; limb subreniform, 4-lobed. Filament linear, short, glabrous; anther oblong, short; connective crestless. Capsule ca. 6 mm in diam., glabrous.'

# text = 'Trees, to 10 m tall, androdioecious. Bark gray-brown or dark brown, rough; branchlets slender, glabrous, those of present year purple-green or green, older ones gray-green or yellow-green. Winter buds purplish green, ovoid. Leaves persistent; petiole purple-green, 2-3.5 cm, glabrous; leaf blade abaxially gray-white, adaxially dark green and lucid, oblong-elliptic or elliptic, 10-14 × 3-6 cm, papery or subleathery, glabrous, pinnatinerved, lateral veins 8-12 pairs, base obtuse, apex caudate-acuminate, acumen 2-2.4 cm. Inflorescences cymose-paniculate, ca. 1.5 cm, pubescent. Sepals 5, yellowish green, ca. 2 × 7 mm. Petals 5, light yellow, ca. 1.8 × 0.4 mm. Stamens 8. Disk extrastaminal. Ovary purplish, villous. Fruit yellowish brown; nutlets flat, 1-2 cm × 6-7 mm; wing falcate, including nutlet 2-4 cm, wings spreading acutely. Fruit peduncle 1-2 cm, very slender. Fl. Aug-Oct, fr. Dec-Feb.'


text = 'Rhizomes creeping. Pseudostems 20--40 cm. Leaves sessile except for 2 apical ones; ligule brownish, 1--5 cm, sparsely appressed villous; leaf blade obovate or oblong, 15--27 × 8--10 cm, adaxially glabrous, abaxially appressed yellow villous along midvein, base cuneate, apex acute. Spikes 8--14 × 4--6 cm; rachis appressed yellow villous; bracts rusty, ovate-lanceolate, 3--4.6 × 1.2--2 cm, membranous, appressed yellow villous, 3-flowered; bracteoles tubular, 2.7--3 cm, appressed yellow villous. Flowers white, slightly fragrant. Calyx brownish, 3.3--3.5 cm, slightly split on 1 side, appressed yellow villous. Corolla tube ca. 4 cm, glabrous; lobes linear, 2.5--3 cm × 3--4 mm. Lateral staminodes oblanceolate, ca. 2.3 cm × 7 mm. Labellum broadly ovate, ca. 2.2 × 1.9 cm, apically slightly 2-lobed or 2-cleft for ca. 1/2 its length. Filament white, equaling labellum; anther ca. 9 mm. Ovary ca. 5 mm, appressed yellow villous. Stigma ciliate. Fl. Feb.'

In [19]:
# Keep the raw description in `text` and store the cleaned copy under its own
# name, so re-running this cell never preprocesses already-preprocessed text.
text_clean = preprocess(text)
doc = nlp(text_clean)
traits = Traits()

In [20]:
# Show every sentence, followed on the next line by its measurement spans.
for sent in doc.sents:
    print(sent, sent._.measurements, sep='\n')

Rhizomes creeping.
[]
Pseudostems 20-40 cm.
[20-40 cm]
Leaves sessile except for 2 apical ones;
[]
ligule brownish, 1-5 cm, sparsely appressed villous;
[1-5 cm]
leaf blade obovate or oblong, 15-27 x 8-10 cm, adaxially glabrous, abaxially appressed yellow villous along midvein, base cuneate, apex acute.
[8-10 cm]
Spikes 8-14 x 4-6 cm;
[4-6 cm]
rachis appressed yellow villous;
[]
bracts rusty, ovate-lanceolate, 3-4.6 x 1.2-2 cm, membranous, appressed yellow villous, 3-flowered;
[1.2-2 cm]
bracteoles tubular, 2.7-3 cm, appressed yellow villous.
[2.7-3 cm]
Flowers white, slightly fragrant.
[]
Calyx brownish, 3.3-3.5 cm, slightly split on 1 side, appressed yellow villous.
[3.3-3.5 cm]
Corolla tube ca. 4 cm, glabrous;
[ca. 4 cm]
lobes linear, 2.5-3 cm x 3-4 mm.
[2.5-3 cm, 3-4 mm]
Lateral staminodes oblanceolate, ca. 2.3 cm x 7 mm.
[ca. 2.3 cm, 7 mm]
Labellum broadly ovate, ca. 2.2 x 1.9 cm, apically slightly 2-lobed or 2-cleft for ca. 1/2 its length.
[1.9 cm]
Filament white, equaling labellu

In [21]:
df = traits.get_discrete_traits('angiosperm')

In [22]:
# Field collection for this experiment. Nothing is added to it in this cell:
# the upsert call in the draft below is still disabled.
fields = Fields()

for sent in doc.sents:
    # A falsy anatomical part (missing or empty) is normalised to None.
    part = sent._.anatomical_part or None
    # Keep only the entities labelled COLOUR.
    colour_ents = list(filter(lambda ent: ent.label_ == 'COLOUR', sent.ents))

    # One line each: the sentence, its anatomical part, its colour entities.
    print(sent, part, colour_ents, sep='\n')

    # --- Draft (disabled): derive one field per sentence from its numeric spans ---
    # print(part)
    # if part != 'calyx': continue
    # print(sent)
    #
    # if sent._.dimensions:
    #     field_name = f'{part} measurement'
    #     field_type = 'dimension'
    #     span = sent._.dimensions[0]
    # elif sent._.measurements:
    #     field_name = f'{part} measurement'
    #     field_type = 'measurement'
    #     span = sent._.measurements
    # elif sent._.volume_measurements:
    #     field_name = f'{part} volume'
    #     field_type = 'volume'
    #     span = sent._.volume_measurements[0]
    # else:
    #     continue
    #
    # print(field_name)
    # fields.upsert(field_name, field_type, span)
        
        


class MeasurementField(Field):
    """
    Field storing a measurement along one or two axes.

    Each stored axis maps to ``{'min': ..., 'max': ...}``; a bound is the product
    of a float and the measurement's unit, or ``None`` when it could not be
    determined.

    NOTE(review): ``self.value`` (used as a dict) and ``self.name`` are assumed
    to be provided by ``Field`` - confirm against the base class.
    """

    # Length/height measurements are provided first, followed by width.
    dimension_axes = ['y', 'x']
    unique = True
    # Whole or decimal numbers (a bare ".5" is still accepted). A character class
    # such as [\d\.]+ would also match a lone "." - e.g. the full stop in
    # "ca. 4 cm" - and float(".") raises ValueError.
    num_re = re.compile(r'\d+(?:\.\d+)?|\.\d+')

    def set_value(self, measurements):
        """
        Store the given measurement spans, unless the field already has a value.

        :param measurements: sequence of one or two measurement spans (y first, then x).
        """
        if self.value:
            logger.error(f'Field {self.name} already has a value')
            return
        self._set_value(measurements)

    def get_value(self, axis=None, minmax=None, unit=None):
        """
        Return the stored measurement with every bound formatted as a string.

        :param axis: optional axis key ('y' or 'x') to restrict the result to.
        :param minmax: optional bound key ('min' or 'max') to restrict the result to.
        :param unit: optional unit name; when given, bounds are converted to it.
        :return: no filter -> ``{axis: {bound: str}}``; ``axis`` only ->
            ``{bound: str}``; ``minmax`` only -> ``{axis: str}``; both -> a single
            string. ``None`` when the requested axis or bound is not stored.
        """
        if unit:
            unit = unit_registry(unit)

        # Build a dict of the formatted values, e.g. data['x']['min'].
        data = {
            value_axis: {mm: self._format(value, unit) for mm, value in value_dict.items()}
            for value_axis, value_dict in self.value.items()
        }

        if axis:
            # Measurements that have just a length will not contain the x axis.
            axis_data = data.get(axis)
            if axis_data is None:
                return None
            return axis_data.get(minmax) if minmax else axis_data

        if minmax:
            # No axis requested: return that bound for every axis that has it
            # (previously this case always fell through to None).
            per_axis = {
                value_axis: bounds[minmax]
                for value_axis, bounds in data.items()
                if bounds.get(minmax) is not None
            }
            return per_axis or None

        return data

    def _format(self, value, unit):
        """Convert (when a unit is requested) and stringify one bound; None stays None."""
        if value is None:
            # An undetermined bound must not turn into the string 'None', and it
            # has no .to() method to convert with.
            return None
        if unit:
            value = self.convert_value(value, unit)
        return self._to_string(value)

    @staticmethod
    def _to_string(value):
        # Convert value to string - uses the default formatting set in adept.config unit_registry.default_format
        return f'{value}'

    @staticmethod
    def convert_value(value, unit):
        """Return ``value`` converted to ``unit`` via the value's own ``to`` method."""
        return value.to(unit)

    def _set_value(self, measurements):
        """
        Assign the measurements to axes in ``dimension_axes`` order.

        Two measurements are treated as y, x; a single one as y. Any other number
        of measurements (none, or more than two) is ignored.
        """
        if len(measurements) not in (1, 2):
            return
        # zip stops at the shorter sequence, so one measurement only fills 'y'.
        for axis, measurement in zip(self.dimension_axes, measurements):
            self._set_axis_value(axis, measurement, measurement._.measurement_unit)

    def _set_axis_value(self, axis, measurement: Span, unit):
        """Store the min/max dict for ``axis``; nothing is stored when there is no unit."""
        if value := self._get_minmax_value(measurement, unit):
            self.value[axis] = value

    def _get_minmax_value(self, measurement: Span, unit):
        """
        Build ``{'min': ..., 'max': ...}`` for one measurement span.

        The minimum is the smallest of the 'lower'/'from' entries and the maximum
        the largest of the 'to'/'upper' entries of the span's value dict.

        :return: the min/max dict, or ``None`` when the measurement has no unit.
        """
        # Some measurements are detected, but have no unit.
        # E.g. Petals white, suborbicular, 6-7 x 5-6.
        # No unit = do not use the measurement
        if not unit:
            return None
        value_dict = self._get_ent_value(measurement)

        def bound(keys, pick):
            # Compare as floats (string values would otherwise compare
            # lexicographically) and keep a legitimate 0 bound.
            candidates = [
                float(v) for k in keys if (v := value_dict.get(k)) not in (None, '')
            ]
            return self._to_unit(pick(candidates, default=None), unit)

        return {
            'min': bound(('lower', 'from'), min),
            'max': bound(('to', 'upper'), max),
        }

    @staticmethod
    def _to_unit(value, unit):
        """Return ``float(value) * unit``, or ``None`` when there is no value."""
        # Explicit check rather than truthiness, so that 0 is kept as a value.
        if value not in (None, ''):
            return float(value) * unit
        return None

    def _get_ent_value(self, ent: Span):
        """
        Return the value dict for a span.

        Uses the span's ``numeric_range`` extension when it is set; otherwise the
        numbers found in the span text become ``{'from': smallest, 'to': largest}``.
        """
        if ent._.numeric_range:
            return ent._.numeric_range

        # Extract numerical parts from the ent string
        nums = [float(m) for m in self.num_re.findall(ent.text)]
        return {'from': min(nums, default=None), 'to': max(nums, default=None)}
        

# print(fields._fields['plant measurement'])


# Path of the field template inside the raw data directory. Within this cell it
# is only referenced by the commented-out to_template() call below.
template_path = RAW_DATA_DIR / 'fields.tpl.yml'
# Last statement of the cell, so the notebook displays the result. The recorded
# output is an empty OrderedDict().
fields.to_dict()
# fields.to_template(template_path)

    

Rhizomes creeping.
rhizome
[]
Pseudostems 20-40 cm.
None
[]
Leaves sessile except for 2 apical ones;
leaf
[]
ligule brownish, 1-5 cm, sparsely appressed villous;
ligule
[brownish]
leaf blade obovate or oblong, 15-27 x 8-10 cm, adaxially glabrous, abaxially appressed yellow villous along midvein, base cuneate, apex acute.
leaf
[yellow]
Spikes 8-14 x 4-6 cm;
spike
[]
rachis appressed yellow villous;
rachi
[yellow]
bracts rusty, ovate-lanceolate, 3-4.6 x 1.2-2 cm, membranous, appressed yellow villous, 3-flowered;
bract
[yellow]
bracteoles tubular, 2.7-3 cm, appressed yellow villous.
None
[yellow]
Flowers white, slightly fragrant.
inflorescence
[white]
Calyx brownish, 3.3-3.5 cm, slightly split on 1 side, appressed yellow villous.
calyx
[brownish, yellow]
Corolla tube ca. 4 cm, glabrous;
corolla
[]
lobes linear, 2.5-3 cm x 3-4 mm.
lobe
[]
Lateral staminodes oblanceolate, ca. 2.3 cm x 7 mm.
None
[]
Labellum broadly ovate, ca. 2.2 x 1.9 cm, apically slightly 2-lobed or 2-cleft for ca. 1/2 it

OrderedDict()

In [2]:
pipeline = Pipeline()

INIT custom_sentencizer
INIT custom_traits_entity
INIT numeric_expand
INIT numeric_dimensions
INIT numeric_measurements
INIT numeric_range
INIT numeric_fraction


In [6]:
# text = "Trees, to 10 m tall, androdioecious. Bark gray-brown or dark brown, rough; branchlets slender, glabrous, those of present year purple-green or green, older ones gray-green or yellow-green. Winter buds purplish green, ovoid. Leaves persistent; petiole purple-green, 2-3.5 cm, glabrous; leaf blade abaxially gray-white, adaxially dark green and lucid, oblong-elliptic or elliptic, 10-14 × 3-6 cm, papery or subleathery, glabrous, pinnatinerved, lateral veins 8-12 pairs, base obtuse, apex caudate-acuminate, acumen 2-2.4 cm. Inflorescences cymose-paniculate, ca. 1.5 cm, pubescent. Sepals 5, yellowish green, ca. 2 × 7 mm. Petals 5, light yellow, ca. 1.8 × 0.4 mm. Stamens 8. Disk extrastaminal. Ovary purplish, villous. Fruit yellowish brown; nutlets flat, 1-2 cm × 6-7 mm; wing falcate, including nutlet 2-4 cm, wings spreading acutely. Fruit peduncle 1-2 cm, very slender. Fl. Aug-Oct, fr. Dec-Feb."
text = 'Pseudostems to 3 m. Ligule ovate, entire, ca. 1 cm, margin minutely white hairy. Leaf blade narrowly lanceolate, 30--40 × 4--5 cm, glabrous, margin denticulate near acuminate apex. Panicles 10--20 cm, densely flowered; branches 2 or 3, 0.9--7 cm; bracts minute, ca. 7 mm; bracteoles absent. Pedicel 2--3 mm, pubescent. Calyx tubular, ca. 8 mm, obtusely ribbed, glabrous, apex 3-lobed. Corolla tube ca. 1 cm; central lobe oblong, ca. 7 × 3.5 mm, apex hooked, minutely mucronate; lateral lobes obovate-oblong, ca. 5 mm. Lateral staminodes 2, at base of labellum. Labellum nearly fan-shaped, ca. 1 cm; claw short; limb subreniform, 4-lobed. Filament linear, short, glabrous; anther oblong, short; connective crestless. Capsule ca. 6 mm in diam., glabrous.'
fields = pipeline(text, 'angiosperm') 

TOKEN:  8
ENT:  8 mm
8 mm
HEYEYEY
[4-5 cm, 10-20 cm, 0.9-7 cm, 2-3 mm, ca. 1 cm, 3.5 mm, ca. 5 mm, ca. 6 mm]


Field lobe measurement already has a value


In [5]:
text = 'Pseudostems to 3 m. Ligule ovate, entire, ca. 1 cm, margin minutely white hairy. Leaf blade narrowly lanceolate, 30--40 × 4--5 cm, glabrous, margin denticulate near acuminate apex. Panicles 10--20 cm, densely flowered; branches 2 or 3, 0.9--7 cm; bracts minute, ca. 7 mm; bracteoles absent. Pedicel 2--3 mm, pubescent. Calyx tubular, ca. 8 mm, obtusely ribbed, glabrous, apex 3-lobed. Corolla tube ca. 1 cm; central lobe oblong, ca. 7 × 3.5 mm, apex hooked, minutely mucronate; lateral lobes obovate-oblong, ca. 5 mm. Lateral staminodes 2, at base of labellum. Labellum nearly fan-shaped, ca. 1 cm; claw short; limb subreniform, 4-lobed. Filament linear, short, glabrous; anther oblong, short; connective crestless. Capsule ca. 6 mm in diam., glabrous.'

preprocess = Preprocess()

# Displays the preprocessed description. In the recorded output '--' became '-',
# '×' became 'x' and 'diam.,' became 'diameter. ,'.
# NOTE(review): that last expansion leaves a stray ' ,' after the inserted full
# stop, which may split the sentence before 'glabrous' - confirm this is intended.
preprocess(text)

'Pseudostems to 3 m. Ligule ovate, entire, ca. 1 cm, margin minutely white hairy. Leaf blade narrowly lanceolate, 30-40 x 4-5 cm, glabrous, margin denticulate near acuminate apex. Panicles 10-20 cm, densely flowered; branches 2 or 3, 0.9-7 cm; bracts minute, ca. 7 mm; bracteoles absent. Pedicel 2-3 mm, pubescent. Calyx tubular, ca. 8 mm, obtusely ribbed, glabrous, apex 3-lobed. Corolla tube ca. 1 cm; central lobe oblong, ca. 7 x 3.5 mm, apex hooked, minutely mucronate; lateral lobes obovate-oblong, ca. 5 mm. Lateral staminodes 2, at base of labellum. Labellum nearly fan-shaped, ca. 1 cm; claw short; limb subreniform, 4-lobed. Filament linear, short, glabrous; anther oblong, short; connective crestless. Capsule ca. 6 mm in diameter. , glabrous.'

In [10]:
print(fields.to_dict())

OrderedDict([('indumentum', 'hairy'), ('plant colour', 'white'), ('leaf shape', 'lanceolate'), ('leaf apex', 'lanceolate, acuminate'), ('leaf margin', 'denticulate'), ('leaf measurement.y.min', '30.0 cm'), ('leaf measurement.y.max', '40.0 cm'), ('leaf measurement.x.min', '4.0 cm'), ('leaf measurement.x.max', '5.0 cm'), ('inflorescence arrangement', 'panicle'), ('panicle measurement.y.min', '10.0 cm'), ('panicle measurement.y.max', '20.0 cm'), ('habit', 'tree/shrub'), ('clonality', 'solitary plant'), ('perennial organ', 'trunk'), ('branch number', '2'), ('dispersion axillary', 'absent'), ('pedicel measurement.y.min', '2.0 mm'), ('pedicel measurement.y.max', '3.0 mm'), ('lobed number', '3-'), ('corolla measurement.y.min', '1.0 cm'), ('corolla measurement.y.max', '1.0 cm'), ('lobe measurement.y.min', '7.0 mm'), ('lobe measurement.y.max', '7.0 mm'), ('lobe measurement.x.min', '3.5 mm'), ('lobe measurement.x.max', '3.5 mm'), ('petal fusion', 'free'), ('flower symmetry', 'zygomorphic'), ('fr

In [10]:
import re
from pathlib import Path
import yaml

class FieldOutputTemplate():

    """
    Load output field definitions from a template file, and map the output to the field names.
    We do this so the output can be prepared for AC.

    The template is read as a mapping ``source name -> target``:

    * the source name may carry a unit in square brackets (e.g. ``[cm]``), which
      is passed on to the field as ``unit``;
    * a target is a string whose part before the first ``.`` is the field name
      and which may contain ``x``/``y`` and ``min``/``max`` as whole words
      (e.g. ``leaf measurement.y.min``), or a list of such strings that are
      tried in order;
    * an empty target means the source name itself is the field name.
    """

    # Raw strings throughout: '\[' in a plain literal is an invalid escape sequence.
    regexes = {
        'unit': re.compile(r'\[([a-z³]+)\]'),
        'minmax': re.compile(r'\b(min|max)\b'),
        'axis': re.compile(r'\b(x|y)\b')
    }

    def __init__(self, template_path: Path):
        """
        :param template_path: path of the YAML template file.
        """
        # Explicit encoding: the unit pattern allows non-ASCII characters such as
        # '³', so the platform default encoding must not be relied on.
        with template_path.open('r', encoding='utf-8') as f:
            # NOTE(review): yaml.full_load can build more than plain data types;
            # switch to yaml.safe_load if templates can come from untrusted sources.
            # An empty file loads as None, so fall back to an empty mapping.
            self._tpl = yaml.full_load(f) or {}

    def get_data(self, fields):
        """
        Resolve every template entry against ``fields``.

        :param fields: mapping of field name -> field object exposing ``get_value(**kwargs)``.
        :return: dict of source name -> resolved value (``None`` when nothing matched).
        """
        return {src: self._get_value(src, targets, fields) for src, targets in self._tpl.items()}

    def _get_value(self, src, targets, fields):
        """Return the first truthy value among the targets, or ``None``."""
        # Template can have a list of targets - so if just a string convert to a list
        if not isinstance(targets, list):
            targets = [targets]

        for target in targets:
            if value := self._get_field_value(src, target, fields):
                return value
        return None

    def _get_field_value(self, src, target, fields):
        """
        Look up one target and ask its field for the value.

        The unit is parsed from ``src``; axis and min/max are parsed from
        ``target``. Returns ``None`` when ``fields`` has no (truthy) entry for
        the field name.
        """
        field_dict = {}
        self._re('unit', src, field_dict)
        if target:
            # Field name is everything before the first '.', the rest are qualifiers.
            field_name = target.split('.')[0]
            self._re('minmax', target, field_dict)
            self._re('axis', target, field_dict)
        else:
            field_name = src

        if field := fields.get(field_name):
            return field.get_value(**field_dict)
        return None

    def _re(self, name, field_name, field_dict):
        """Store the first capture group of regex ``name`` in ``field_dict[name]`` when it matches."""
        if match := self.regexes[name].search(field_name):
            field_dict[name] = match.group(1)
            
# template_path = RAW_DATA_DIR / 'fields.tpl.yml'            
# tpl = FieldOutputTemplate(template_path)
# tpl.get_data(fields._fields)      

print(fields.to_dict())

OrderedDict([('habit', 'tree, tree/shrub'), ('clonality', 'solitary plant'), ('perennial organ', 'trunk'), ('habitat', 'tree'), ('reproduction system', 'androdioecious'), ('plant measurement.y.min', '10.0 m'), ('plant measurement.y.max', '10.0 m'), ('indumentum', 'glabrous'), ('bud colour', 'green, purplish'), ('petiole measurement.y.min', '2.0 cm'), ('petiole measurement.y.max', '3.5 cm'), ('petiole colour', 'green, purple'), ('leaf shape', 'elliptic'), ('leaf architecture', 'pinnatipartite'), ('leaf apex', 'acuminate, obtuse, caudate'), ('leaf measurement.y.min', '10.0 cm'), ('leaf measurement.y.max', '14.0 cm'), ('leaf measurement.x.min', '3.0 cm'), ('leaf measurement.x.max', '6.0 cm'), ('leaf colour', 'green, gray-white'), ('inflorescence arrangement', 'cyme, panicle'), ('inflorescence measurement.y.min', '1.5 cm'), ('inflorescence measurement.y.max', '1.5 cm'), ('sepal measurement.y.min', '2.0 mm'), ('sepal measurement.y.max', '2.0 mm'), ('sepal measurement.x.min', '7.0 mm'), ('se

In [15]:
s = '2 m by 20.20m'

In [16]:
# Raw string so the regex escapes reach `re` untouched ('\d' in a plain literal is
# an invalid escape sequence). Each match is a whole or decimal number.
re.findall(r"(\d+(?:\.\d+)?)", s)

['2', '20.20']

In [11]:
# Demonstration: int() rejects a numeral with a trailing decimal point. The error
# is caught and printed so the notebook still runs from top to bottom.
try:
    int('2.')
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: invalid literal for int() with base 10: '2.'

In [22]:
names = ['Alpinia bambusifolia', 'Alpinia galang']

In [26]:
import uuid
import pickle
# uuid.uuid5(uuid.NAMESPACE_URL, ).hex

# Derive an id from the list of names. The pickle protocol is pinned because the
# default differs between Python versions, which would change the pickled bytes
# and therefore the id. Protocol 4 is the default on Python 3.8-3.13.
uuid.uuid5(uuid.NAMESPACE_URL, str(pickle.dumps(names, protocol=4))).hex

'7c597ed5e9cf5ff19d1fac3e6f25fbc7'

In [28]:
np.array([1]).mean()

1.0

In [6]:
nlp = spacy.load("en_core_web_trf")

In [8]:
text = 'Pseudostems to 3 m. Ligule ovate, entire, ca. 1 cm, margin minutely white hairy. Leaf blade narrowly lanceolate, 30--40 × 4--5 cm, glabrous, margin denticulate near acuminate apex. Panicles 10--20 cm, densely flowered; branches 2 or 3, 0.9--7 cm; bracts minute, ca. 7 mm; bracteoles absent. Pedicel 2--3 mm, pubescent. Calyx tubular, ca. 8 mm, obtusely ribbed, glabrous, apex 3-lobed. Corolla tube ca. 1 cm; central lobe oblong, ca. 7 × 3.5 mm, apex hooked, minutely mucronate; lateral lobes obovate-oblong, ca. 5 mm. Lateral staminodes 2, at base of labellum. Labellum nearly fan-shaped, ca. 1 cm; claw short; limb subreniform, 4-lobed. Filament linear, short, glabrous; anther oblong, short; connective crestless. Capsule ca. 6 mm in diam., glabrous.'
# NOTE(review): overwrites `text` in place, so re-running only this line would
# preprocess already-preprocessed text. The following cell reads `text`, so a
# rename has to be made in both cells together.
text = preprocess(text)

In [8]:
# NOTE(review): the recorded output is a NameError for `nlp`, i.e. this cell was
# last run in a kernel where the model-loading cell had not been executed.
# Re-run the cells in order to refresh the output.
doc = nlp(text)

NameError: name 'nlp' is not defined

In [9]:
# For every sentence: the sentence itself, then each entity with its label below it.
for sent in doc.sents:
    print(sent)
    for ent in sent.ents:
        print(ent, ent.label_, sep='\n')

NameError: name 'doc' is not defined

In [54]:
s = 'Capsule ca 6 mm in diameter.'

# Add the missing full stop to a whitespace-delimited "ca". Raw string: '\s' in a
# plain literal is an invalid escape sequence.
re.sub(r'\sca\s', ' ca. ', s)

'Capsule ca. 6 mm in diameter.'

In [24]:
x = RAW_DATA_DIR / 'fields.tpl.yml'

In [25]:
# Path.stem strips only the final suffix, so 'fields.tpl.yml' yields 'fields.tpl'
# (see the recorded output), not 'fields'.
x.stem

'fields.tpl'