In [10]:
import typer
import pandas as pd
import spacy
from pathlib import Path
from tqdm import tqdm
import numpy as np
import warnings
import yaml
import itertools
from collections import OrderedDict

from adept.components.registry import ComponentsRegistry
from adept.components.sentencizer import Sentencizer
from adept.components.numeric import (NumericDimension, NumericExpand, NumericFraction, NumericMeasurement, NumericRange)
from adept.components.anatomical import AnatomicalEntity
from adept.components.traits import DiscreteTraitsEntity
from adept.components.traits import CustomTraitsEntity
from adept.traits import Traits
from adept.tasks.patterns.anatomy import AnatomyPatternsTask, AnatomicalPartsTask
from adept.utils.helpers import token_get_ent
from adept.preprocess import Preprocess
from adept.config import RAW_DATA_DIR

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
nlp = spacy.load("en_core_web_trf")
     
registry = ComponentsRegistry(nlp)
registry.add_components([
    Sentencizer,
    AnatomicalEntity,
    DiscreteTraitsEntity,
    CustomTraitsEntity,
    NumericExpand,
    NumericDimension,
    NumericMeasurement,
    NumericRange,
    NumericFraction,
])

INIT custom_sentencizer
INIT custom_traits_entity
INIT numeric_expand
INIT numeric_dimensions
INIT numeric_measurements
INIT numeric_range
INIT numeric_fraction


In [5]:
# text = 'A glabrous perennial 15-30(-40) cm. Rhizomes far-creeping producing tufts of 1-3 shoots at ± regular intervals. Roots pale yellow-brown. Scales grey brown, often tinged wine red, soon becoming fibrous.           Stems trigonous, smooth below, rough above, ± decumbent at base. Lvs 12-40 x 2-4 mm, concave, often curved, thick, bluntly keeled or channelled, gradually tapering to a trigonous point up to 5 cm long, mid green, ± shiny. Ligule 1-4 mm, broadly ovate or almost truncate. Lower sheaths lfless, persistent, thick, white, tinged with red. Infl 1/5-1/4 lngth of stem. Lower bract lf-like, about as long as infl, not sheathing, upper glumaceous.  Male spike 1(-2), 10-15 x 3 mm, fusiform. Male glumes 3-4 mm, ovate, acute or obtuse, dark purple-black with hyaline margin. Female spikes 1-2(-3), ± contiguous, (5-)10-15(-20) x 4-5 mm, ovoid or subglobose, erect, lower peduncled, upper ±  sessile. Female glumes 2-3 mm, ovate, acute, dark purplish-brown with pale midrib and hyaline margin, shorter than fr.  Utricle 3-3.5 mm, smooth, ±          inflated, dark purple green in upper half, shiny. Beak 0.5 mm, ±          notched. Stigmas 2, rarely 3. Nut 2 mm, subglobose.' 
# text = "V-form Perennials, 40-100 cm tall 1 x 2 cm (usually rhizomatous, sometimes stoloniferous). Stems 1(-4), erect, yellow, 5 stamen, simple or branched, densely lanate-tomentose to glabrate. 5 ovaries. Leaves petiolate (proximally) or sessile (distally, weakly clasping and gradually reduced); blades oblong or lanceolate, 3.5-35+ cm x 5-35 mm, 1-2-pinnately lobed (ultimate lobes +- lanceolate, often arrayed in multiple planes), faces glabrate to sparsely tomentose or densely lanate. Heads 10-100+, in simple or compound, corymbiform arrays. Phyllaries 20-30 in +- 3 series, (light green, midribs dark green to yellowish, margins green to light or dark brown) ovate to lanceolate, abaxial faces tomentose. Receptacles convex; paleae lanceolate, 1.5-4 mm. Ray florets (3-)5-8, pistillate, fertile; corollas white or light pink to deep purple, laminae 1.5-3 x 1.5-3 mm. Disc florets 10-20; corollas white to grayish white, 2-4.5 mm. Cypselae 1-2 mm (margins broadly winged). 2n = 18, 27, 36, 45, 54, 63, 72 (including counts from Europe). Morphologic characters that have been used to segregate these populations into species and/or varieties include: (1) degree and persistence of tomentum; (2) phyllaries with greenish, light brown, or dark brown margins; (3) shapes of capitulescences (rounded or flat-topped); and (4) degrees of leaf dissection and shapes of lobes."

preprocess = Preprocess()

In [6]:
text = """Perennial 40-100 cm tall. Culms slender to moderately robust, erect or decumbent and rooting at lower nodes, up to 80 cm tall, nodes glabrous, eglandular or with glandular ring. Leaf sheaths shorter than internodes, glabrous except for ciliate outer margin; leaf blades narrowly lanceolate, 3–10 × 0.4–0.8 cm, glabrous, scabrid, base rounded, apex acute; ligule 1–2 mm. Panicle open, ovate in outline, 4–11 cm, glandular, many-spiculate; branches and pedicels filiform, flexuose; pedicels variable in length, shorter or longer than spikelets. Spikelets elliptic-globose, 1.5–2(–2.2) mm, greenish or purplish brown; florets slightly to clearly dissimilar; lower floret male, upper floret female; glumes subequal, as long as or shorter than florets, broadly elliptic, 5–7(–9)-veined, usually glabrous, rarely hispidulous or scaberulous above middle, apex broadly rounded; lower lemma oblong, cartilaginous to subcrustaceous, shallowly convex, back sometimes sulcate, smooth, glabrous; anthers 0.8–1.3 mm; upper lemma crustaceous, shorter and more convex, slightly rough, back glabrous or puberulous, upper margins ciliate. Fl. and fr. summer to autumn. 2n = 60.

Wet places, forming colonies, and as a weed of rice fields. Anhui, Fujian, Guangdong, Guangxi, Guizhou, Hebei, Henan, Hubei, Hunan, Jiangsu, Jiangxi, Liaoning, Shaanxi, Shandong, Taiwan, Yunnan, Zhejiang [Bangladesh, Bhutan, India, Indonesia, Japan, Korea, Malaysia, Nepal, New Guinea, Philippines, Sri Lanka, Thailand, Vietnam; Australia, Pacific Islands].

This is a widespread and very variable species. 2 ovaries. 56 stamenoids. Seed volume is about 2 cm³. It includes several ill-defined entities that have been given specific rank in the past. The typical form, from Japan, has spikelets with the florets only slightly dissimilar, nearly equal in length and texture and the upper floret rounded on the back without a central groove. This form is the most common entity in China. Specimens from India usually have more clearly unequal florets, the lower one longer and thinner, with a deep, longitudinal groove on the back. This form may have a glabrous or pubescent upper floret, and the pubescent variant is the basis of the name Isachne mili-acea. In SE Asia this division breaks down, with many intermediate forms. Specimens in which the florets are nearly equal but the lower one is grooved also occur in China. There is much variation in habit and spikelet size unrelated to other characters."""
text = preprocess(text)

text

'Perennial 40-100 cm tall. Culms slender to moderately robust, erect or decumbent and rooting at lower nodes, up to 80 cm tall, nodes glabrous, eglandular or with glandular ring. Leaf sheaths shorter than internodes, glabrous except for ciliate outer margin; leaf blades narrowly lanceolate, 3-10 x 0.4-0.8 cm, glabrous, scabrid, base rounded, apex acute; ligule 1-2 mm. Panicle open, ovate in outline, 4-11 cm, glandular, many-spiculate; branches and pedicels filiform, flexuose; pedicels variable in length, shorter or longer than spikelets. Spikelets elliptic-globose, 1.5-2(-2.2) mm, greenish or purplish brown; florets slightly to clearly dissimilar; lower floret male, upper floret female; glumes subequal, as long as or shorter than florets, broadly elliptic, 5-7(-9)-veined, usually glabrous, rarely hispidulous or scaberulous above middle, apex broadly rounded; lower lemma oblong, cartilaginous to subcrustaceous, shallowly convex, back sometimes sulcate, smooth, glabrous; anthers 0.8-1.3 

In [11]:
doc = nlp(text)

In [12]:
traits = Traits()

In [15]:
from spacy.tokens import Span, Doc
from abc import ABC, abstractmethod

df = traits.get_discrete_traits('angiosperm')

from adept.config import logger

class Field(ABC):
    
    unique = False
    
    def __init__(self, name):
        self.name = name
        self.value = {} if self.unique else set()
        
    @abstractmethod
    def set_value(self, value):
        pass
    
    def get_value(self):
        return ', '.join(self.value)
    
    def __repr__(self):
        return f'{self.__class__.__name__}({self.value})'

class MeasurementField(Field):
    
    # Length/height measurements are provided first, followed by width. 
    dimension_axes = ['y', 'x']    
    unique = True

    def set_value(self, measurements):        
        if self.value: raise Exception(f'Field {self.name} already has a value')
        self._set_value(measurements)
        
    def get_value(self, axis=None, minmax=None, unit=None):
        if unit:
            unit = unit_registry(unit)
        
        data = {}
        # Build a dict of the values x.min         
        for value_axis, value_dict in self.value.items():
            data.setdefault(value_axis, {})
            for mm, value in value_dict.items():
                if unit:
                    value = self.convert_value(value, unit)
                data[value_axis][mm] = self._to_string(value)

        # If axis / minmax set, filter the data
        # FIXME: If minmax set, and not axis, this will fail         
        if axis: data = data[axis]
        if minmax: data = data[minmax]
        return data
    
    @staticmethod 
    def _to_string(value):
        # Convert value to string - uses the default formatting set in adept.config unit_registry.default_format  
        return f'{value}'

    @staticmethod 
    def convert_value(value, unit):
        return value.to(unit)
        
    def _set_value(self, measurements):  
        # If we have two measurements, treat them as y, x         
        if len(measurements) == 2:
            for axis, measurement in zip(self.dimension_axes, measurements):
                self._set_axis_value(axis, measurement, measurement._.measurement_unit)
        elif len(measurements) == 1:
            measurement = measurements[0]
            self._set_axis_value(self.dimension_axes[0], measurement, measurement._.measurement_unit)
        
    def _set_axis_value(self, axis, measurement: Span, unit):
        if value := self._get_minmax_value(measurement, unit):
            self.value[axis] = value

    def _get_minmax_value(self, measurement: Span, unit):
        # Some measurements are detected, but have no unit. 
        # E.g. Petals white, suborbicular, 6-7 x 5-6.
        # No unit = do not use the measurement        
        if not unit: return
        value_dict = self._get_ent_value(measurement)
        unpack = lambda ks: ([v for k in ks if (v := value_dict.get(k))])
        return {
            'min': self._to_unit(min(unpack(['lower', 'from']), default=None), unit),
            'max': self._to_unit(max(unpack(['to', 'upper']), default=None), unit)
        }
        
    @staticmethod
    def _to_unit(value, unit):
        if value:
            return float(value) * unit
    
    @staticmethod
    def _get_ent_value(ent: Span):
        if ent._.numeric_range:
            value = ent._.numeric_range
        else:
            # Also validate shape is d, dd, so cast to int won't fail
            num = [int(token.text) for token in ent if token.pos_ == 'NUM' and set(token.shape_) == set('d')]
            value = {'from': min(num, default=None), 'to': max(num, default=None)} 
   
        return value  
      
class DimensionField(MeasurementField):
    
    def _set_value(self, dimension):        
        for i, axis in enumerate(self.dimension_axes):                    
             # 0 => 1; 1 => 0
            adj_i = (i-1)**2
            ent = dimension.ents[i]
            # Sometimes the unit is only attached to one of the dimensions e.g. 1.5-2 x 1.7-2.2 cm             
            unit = ent._.measurement_unit or dimension.ents[adj_i]._.measurement_unit            
            self._set_axis_value(axis, ent, unit)  

class VolumeField(MeasurementField):   
    def _set_value(self, volume):          
        self.value = self._get_minmax_value(volume, volume._.measurement_unit)

    def get_value(self, minmax=None, unit=None):
        if unit:
            unit = unit_registry(unit)        
        data = {}        
        for mm, value in self.value.items():
            if unit:
                value = self.convert_value(value, unit)  
            data[mm] = self._to_string(value)
        if minmax: data = data[minmax]
        return data    
    
            
class DiscreteField(Field):
    def set_value(self, value):
        self.value.add(value)

class NumericField(Field):
    def set_value(self, num_ent: Span):
        self.value = num_ent._.get("numeric_range") or num_ent.text
    def get_value(self):
        return self.value
        
class Fields(object):
    
    _classes = {
        'discrete': DiscreteField,
        'measurement': MeasurementField,
        'dimension': DimensionField,
        'numeric': NumericField,
        'volume': VolumeField,
    }    
    
    def __init__(self):
        self._fields = OrderedDict()
        
    def upsert(self, field_name, field_type, value):
        # If we do not already have the field defined create it         
        if not field_name in self._fields: 
            self._fields[field_name] = self._factory(field_type, field_name)

        self._fields[field_name].set_value(value)
            
    def _factory(self, field_type, field_name):
        return self._classes[field_type](field_name)  
    
    def to_dict(self):
        data = OrderedDict()
        for field in self._fields.values():    
            value = field.get_value()
            value_dict = {field.name: value}
            if isinstance(value, dict):
                # We want to turn measurement dicts etc into single dimension dicts e.g.
                # {'field_name': {'y': {'min': 40.0, 'max': 100.0}}} => {'field_name.y.min': 40.0, 'field_name.y.max': 100.0}
                value_dict = flatten_dict(value_dict)
            data.update(value_dict)   
        return data    

fields = Fields()   
    
for sent in doc.sents:
    
    part = sent._.anatomical_part if sent._.anatomical_part else None
    trait_entities = [ent for ent in sent.ents if ent.label_ == 'TRAIT']    
    colour_entities = [ent for ent in sent.ents if ent.label_ == 'COLOUR']
    
    for colour in colour_entities:
        # Just use a discrete field         
        fields.upsert(f'{part}_colour', 'discrete', colour.lemma_)

    # Process entities     
    for ent in trait_entities:                
        mask = ((df.term == ent.lemma_) | (df.term == ent.text))
        # If the trait ent is a part, no need to filter on part         
        if part != ent.lemma_:
            # Plant part is an artificial construct, so we treate it as sent having no part             
            if part and part != 'plant':
                mask &= (df.part == part)
            # If sent has no part, trait must have no part and must be unique             
            else:
                mask &= ((df.part.isna()) & (df.unique == True))
            
        rows = df[mask]     
                        
        for row in rows.itertuples():   
            fields.upsert(row.trait, 'discrete', row.character)
            
    for ent in sent.ents:
        if trait_value := ent._.get("trait_value"):
            fields.upsert(ent.label_.lower(), 'discrete', trait_value)

    # We require a part for dimensions & measurements     
    if part:
        if sent._.dimensions:     
            field_name = f'{part}_measurement'
            fields.upsert(field_name, 'dimension', sent._.dimensions[0])
        elif sent._.measurements:            
            field_name = f'{part}_measurement'
            fields.upsert(field_name, 'measurement', sent._.measurements)
        elif sent._.volume_measurements:
            field_name = f'{part}_volume'
            fields.upsert(field_name, 'volume', sent._.volume_measurements[0])

    # Process numeric values (those not measurements or dimensions)     
    numeric_ents = [ent for ent in sent.ents if ent.label_ in ['CARDINAL', 'QUANTITY'] and not (ent[0]._.is_measurement or ent[0]._.is_dimension)]

    # We use the dependency parse to find nummod noun, that's also an entity     
    for num_ent in numeric_ents:
        root = num_ent.root
        if root.dep_ == 'nummod' and root.head.pos_ == 'NOUN':
            if ent:= token_get_ent(root.head, ['PART', 'TRAIT']): 
                field_name = ent._.get("anatomical_part") or ent.lemma_
                fields.upsert(f'{field_name}_number', 'numeric', num_ent)
                


# fields._fields



data = OrderedDict()
for field in fields._fields.values():    
    value = field.get_value()
    value_dict = {field.name: value}
    if isinstance(value, dict):
        # We want to turn measurement dicts etc into single dimension dicts e.g.
        # {'field_name': {'y': {'min': 40.0, 'max': 100.0}}} => {'field_name.y.min': 40.0, 'field_name.y.max': 100.0}
        value_dict = flatten_dict(value_dict)
    data.update(value_dict)
    


data

OrderedDict([('life cycle', 'perennial'),
             ('plant_measurement.y.min', '40.0 cm'),
             ('plant_measurement.y.max', '100.0 cm'),
             ('habit', 'tree/shrub, erect leafy/tussock'),
             ('clonality', 'solitary plant, rhizome'),
             ('perennial organ', 'trunk, rhizome'),
             ('indumentum', 'glabrous, pubescent, glandular'),
             ('leaf shape', 'orbicular, lanceolate'),
             ('leaf base', 'rounded'),
             ('leaf apex', 'acute, rounded'),
             ('leaf_measurement.y.min', '3.0 cm'),
             ('leaf_measurement.y.max', '10.0 cm'),
             ('leaf_measurement.x.min', '0.4 cm'),
             ('leaf_measurement.x.max', '0.8 cm'),
             ('ligule_measurement.y.min', '1.0 mm'),
             ('ligule_measurement.y.max', '2.0 mm'),
             ('inflorescence arrangement', 'spiklet, panicle'),
             ('panicle_measurement.y.min', '4.0 cm'),
             ('panicle_measurement.y.max', '11.0 cm'),

In [48]:
import re
from adept.config import unit_registry




class FieldOutputTemplate():
    
    """
    Load output field defintions from a template file, and map the output to the field names
    """
    
    regexes = {
        'unit': re.compile('\[([a-z³]+)\]'),
        'minmax': re.compile(r'\b(min|max)\b'),
        'axis': re.compile(r'\b(x|y)\b')
    }

    def __init__(self, template_path: Path):
        self._mapper = yaml.full_load(template_path.open('r'))
        
    def get_data(self, fields):
        return {src: self._get_value(src, targets, fields) for src, targets in self._mapper.items()}
        
    def _get_value(self, src, targets, fields):        
        # Template can have a list of targets - so if just a string convert to a list         
        if not isinstance(targets, list):
            targets = [targets]
            
        for target in targets:
            if value := self._get_field_value(src, target, fields):
                return value
            
    def _get_field_value(self, src, target, fields):        
        field_dict = {}
        self._re('unit', src, field_dict)
        if target:
            field_name = target.split('.')[0]
            self._re('minmax', target, field_dict)
            self._re('axis', target, field_dict)
        else:
            field_name = src

        if field := fields.get(field_name):
            return field.get_value(**field_dict)
            
    def _re(self, name, field_name, field_dict):        
        if match := self.regexes[name].search(field_name):
            field_dict[name] = match.group(1)

mapper = FieldsMapper(RAW_DATA_DIR / 'fields.tpl.yml')
data = mapper.get_data(fields._fields)        
        


# def get_unit(field_name):    
#     # If field name has a unit specified (e.g. leaf min. width [cm])      
#     if match := re.search('\[([a-z³]+)\]', field_name):
#         unit = match.group(1)
#         return unit_registry(unit)    
    
# # Specify axis // min/max    

# axes = {'x', 'y'}
# min_max = {'min', 'max'}
# for src, target in fields_tpl.items():
    
    
#     unit = get_unit(src)
#     print(unit)
    
#     if target:
#         target_parts = target.split('.')
#         field_name = target_parts.pop(0)
        # axis = axes.intersection(set(target_parts))
        # print(axis)
        # target_parts[1:]
        # axis = ax
        
        
#         field_params['']
#         print(target_parts[1:])
#         # field_def |= target_parts[1:]
#     else:
#         field_name = src
#         field_def = []
            
#     # # FIXME: Get? Without create??     
#     field = fields._fields.get(field_name)
#     if field:
#         print(field_name)
#         print(field)
#         print(field_def)
    
    # print(field_name)
            
        
        
        
        
        # print(src)
        # print(target.split('.'))
    
# print(fields._fields.keys())

In [52]:

    
# FIXME: seed_volume.min

{'life cycle': 'perennial',
 'plant_measurement.y.min': 40.0,
 'plant_measurement.y.max': 100.0,
 'habit': 'erect leafy/tussock, tree/shrub',
 'clonality': 'solitary plant, rhizome',
 'perennial organ': 'trunk, rhizome',
 'indumentum': 'pubescent, glandular, glabrous',
 'leaf shape': 'lanceolate, orbicular',
 'leaf base': 'rounded',
 'leaf apex': 'rounded, acute',
 'leaf_measurement.y.min': 3.0,
 'leaf_measurement.y.max': 10.0,
 'leaf_measurement.x.min': 0.4,
 'leaf_measurement.x.max': 0.8,
 'ligule_measurement.y.min': 1.0,
 'ligule_measurement.y.max': 2.0,
 'inflorescence arrangement': 'panicle, spiklet',
 'panicle_measurement.y.min': 4.0,
 'panicle_measurement.y.max': 11.0,
 'spikelet_colour': 'greenish, brown, purplish',
 'petal fusion': 'no perianth',
 'spikelet_measurement.y.min': 1.5,
 'spikelet_measurement.y.max': 2.2,
 'flower architecture': 'naked',
 'flower symmetry': 'no perianth',
 'anther_measurement.y.min': 0.8,
 'anther_measurement.y.max': 1.3,
 'ploidy level (2n)': '2n 

In [13]:
from spacy import displacy


for sent in doc.sents:
    
    # Process numeric values (those not measurements or dimensions)     
    numeric_ents = [ent for ent in sent.ents if ent.label_ in ['CARDINAL', 'QUANTITY'] and not (ent[0]._.is_measurement or ent[0]._.is_dimension)]
    
    # We use the dependency parse to find nummod noun, that's also an entity     
    for num in numeric_ents:
        root = num.root
        if root.dep_ == 'nummod' and root.head.pos_ == 'NOUN':
            if ent:= token_get_ent(root.head, ['PART', 'TRAIT']): 
                print(num)
                print(num._.get("numeric_range"))
                # field_name = f'{ent.text}_num'
                # fields.upsert(field_name, 'numeric', num)

        # print()
        
    
#     print(sent)
#     print(cardinals)
    
#     for ent in sent.ents:
#         print(ent.label_)
        
    
#     # displacy.render(sent, style="dep")

#     for cardinal in cardinals:
        
#         # if cardinal.text == ''
        
#         root = cardinal.root
#         print(root)
        
#         if root.dep_ == 'nummod':
#             nummod_subj = root.head
#             part = token_get_ent(nummod_subj, 'PART')
#             print(nummod_subj)
#             print(part)
    
    # print(dir(cardinal))
    
    

# n = cardinals[0][0]
# dir(cardinals[0][0])

# list(n.ancestors)
# n.dep_
# n.head

# dir(n)



#     # print(sent)
# for ent in sent.ents:
#     print(ent)
#     print(ent.label_)

1(-4)
[]
5
[]
5
[]
20-30
{'lower': None, 'from': '20', 'to': '30', 'upper': None}
10-20;
{'lower': None, 'from': '10', 'to': '20', 'upper': None}


In [89]:
for ent in doc.ents:
    part = ent.sent._.anatomical_part if ent.sent._.anatomical_part else None
    
    # print(ent.label_)
    
    if ent.label_ != 'QUANTITY':
        continue
        
    print('----')
    print('PART:' , part)
    print(ent.label_)
    print(ent)
    print(ent.sent._.dimensions)
    

    
#     # Initiate field
#     # Factory      
    
#     # print(ent)
#     print(rows)
#     print('-----')
    
    # if part:
    #     break
    
#     # If we haven't found a term, it's for another plant group     
#     if rows.empty: continue
    
#     # Multiple terms, so validate part    
#     if len(rows) > 1:
#         continue
#         print('-----')
#         print(part)
        
#         print(rows)
        
#         mask = rows.part.isna() if not part else rows.part == part
#         rows = rows[mask]
        
        
        
#         print(rows.part.unique())
#         print(rows)
        
    # print(rows)
        
    
    # print(df[df.term == ent.lemma_])
        
    
    # print()
    
    # print(ent.label_)
    # print(part)
    # print(ent.lemma_)
    # break



----
PART: None
QUANTITY
40-100 cm
[1 x 2 cm]
----
PART: None
QUANTITY
1
[1 x 2 cm]
----
PART: None
QUANTITY
2 cm
[1 x 2 cm]
----
PART: leaf
QUANTITY
3.5-35+ cm
[3.5-35+ cm x 5-35 mm]
----
PART: leaf
QUANTITY
5-35 mm
[3.5-35+ cm x 5-35 mm]
----
PART: leaf
QUANTITY
1-2-pinnately
[3.5-35+ cm x 5-35 mm]
----
PART: palea
QUANTITY
1.5-4 mm
[]
----
PART: corolla
QUANTITY
1.5-3
[1.5-3 x 1.5-3 mm]
----
PART: corolla
QUANTITY
1.5-3 mm
[1.5-3 x 1.5-3 mm]
----
PART: corolla
QUANTITY
2-4.5 mm
[]
----
PART: cypsela
QUANTITY
1-2 mm
[]


In [75]:
from dataclasses import dataclass, field

# TODO: OrderedDict?

@dataclass
class Trait:
    l: list[int] = field(default_factory=list)

d = {}

for x, y in [('a', 1), ('b', 2), ('a', 5)]:
    if not x in d: d[x] = Trait()    
    d[x].l.append(y)
        
        
    # print(x)

d



{'a': Trait(l=[1, 5]), 'b': Trait(l=[2])}

In [54]:
t = 'v-form'

In [55]:
t.split('-')

['v', 'form']

In [57]:
import re
re.split('(\W)', t)

['v', '-', 'form']

In [129]:



# fields['x']

discrete


TypeError: DiscreteField.__init__() missing 2 required positional arguments: 'name' and 'value'

In [170]:
x = [1]
x[1:]

[]

In [196]:
s = 'petal_measurement.x.min'

re.search(r'\b(x|y)\b', s).group(1)


'x'

In [92]:
s = '3cm3'

re.sub('3$', '³', s)

'3cm³'

In [44]:
x = {'field_name': {'y': {'min': 40.0, 'max': 100.0}}}

{'field_name.y.min': 40.0, 'field_name.y.max': 100.0}

In [59]:
import pint

ureg = pint.UnitRegistry()
ureg.default_format = '~P' # Add this if you want to use abbreviated unit names.
accel = 1.3 * ureg['meter/second**2']
print(f'{accel}')

1.3 m/s²


In [19]:
from adept.tasks.descriptions.ecoflora import EcofloraDescriptionTask

In [31]:
x = [
    EcofloraDescriptionTask('hey'),
    EcofloraDescriptionTask('123'),
]

In [29]:
hash(x)

TypeError: unhashable type: 'list'

In [35]:
import pygbif
#pygbif.caching(True)
from pygbif import species

In [37]:
species.name_backbone(name='Hippuris vulgaris', kingdom='plants')

send: b'GET /v1/species/match?name=Hippuris+vulgaris&kingdom=plants&strict=false&verbose=false&limit=100 HTTP/1.1\r\nHost: api.gbif.org\r\nuser-agent: python-requests/2.28.1,pygbif/0.6.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: Vary: Origin, Access-Control-Request-Method, Access-Control-Request-Headers
header: X-Content-Type-Options: nosniff
header: X-XSS-Protection: 1; mode=block
header: Pragma: no-cache
header: Expires: 0
header: X-Frame-Options: DENY
header: Content-Type: application/json
header: Date: Sun, 15 Jan 2023 20:59:36 GMT
header: Cache-Control: public, max-age=3601
header: X-Varnish: 815564009
header: Age: 0
header: Via: 1.1 varnish (Varnish/6.0)
header: Accept-Ranges: bytes
header: Content-Length: 470
header: Connection: keep-alive


{'usageKey': 5372503,
 'scientificName': 'Hippuris vulgaris L.',
 'canonicalName': 'Hippuris vulgaris',
 'rank': 'SPECIES',
 'status': 'ACCEPTED',
 'confidence': 100,
 'matchType': 'EXACT',
 'kingdom': 'Plantae',
 'phylum': 'Tracheophyta',
 'order': 'Lamiales',
 'family': 'Plantaginaceae',
 'genus': 'Hippuris',
 'species': 'Hippuris vulgaris',
 'kingdomKey': 6,
 'phylumKey': 7707728,
 'classKey': 220,
 'orderKey': 408,
 'familyKey': 2420,
 'genusKey': 3039572,
 'speciesKey': 5372503,
 'synonym': False,
 'class': 'Magnoliopsida'}

In [41]:
p = pd.read_csv(RAW_DATA_DIR / 'species-example.csv')

In [73]:

p[['Species name', 'Group']].values.tolist()


taxa = p['Species name'].unique()

list(zip(taxa, ['hey'] * len(taxa)))

# for name, group in p[['Species name', 'Group']].drop_duplicates().values.tolist():
#     print(name)

# p.columns.values.tolist()

# taxa

[('Pyrus vsevolodovii', 'hey'), ('Pyrus chosrovica', 'hey')]