In [1]:
import typer
import pandas as pd
import spacy
from pathlib import Path
from tqdm import tqdm
import numpy as np
import warnings
import yaml
import re
from spacy import displacy
from adept.traits.accdb import ACCDBTraits
from spacy.tokens import Span
from adept.config import unit_registry

from adept.components.registry import ComponentsRegistry
from adept.components.sentencizer import Sentencizer
from adept.components.numeric import (NumericDimension, NumericExpand, NumericFraction, NumericMeasurement, NumericRange)
from adept.components.anatomical import AnatomicalEntity
from adept.components.traits import (CustomTraitsEntity, DiscreteTraitsEntity, NumericTraitsEntity)
from adept.scripts.helpers import get_descriptions
from adept.postprocess.postprocess import Postprocess
from adept.utils.doc_log import DocLog
from adept.config import logger, RAW_DATA_DIR, CORPUS_DIR, ASSETS_DIR, spacy_config, unit_registry


In [2]:
%load_ext autoreload
%autoreload 2

In [8]:
nlp = spacy.load("en_core_web_trf")
registry = ComponentsRegistry(nlp)
registry.add_components([
    Sentencizer,
    AnatomicalEntity,
    CustomTraitsEntity,
    DiscreteTraitsEntity,
    NumericTraitsEntity,
    NumericExpand,
    NumericDimension,
    NumericMeasurement,
    NumericRange,
    NumericFraction,
])

INIT custom_sentencizer
INIT custom_traits_entity
INIT numeric_trait_entity
INIT numeric_expand
INIT numeric_dimensions
INIT numeric_measurements
INIT numeric_range
INIT numeric_fraction


In [31]:
description = "Rhizomes yellowish inside, tuberous. Pseudostems 0.6-2 m. Leaves sessile or shortly petiolate; ligule entire, 1.5-2 cm; leaf blade lanceolate or oblong-lanceolate, 15-40 x 3-8 cm, glabrescent or abaxially somewhat pilose, base narrowed, apex acuminate. Inflorescences arising from rhizomes, conical or ovoid-oblong, 6-15 x 3.5-5 cm, apex obtuse; peduncle 10-30 cm, scalelike sheaths 5-7; bracts closely imbricate, green when young, red when old, slightly hairy, slimy adaxially, margin membranous; bracteoles ca. 1.5 cm. Calyx 1.2-2 cm, membranous, split on 1 side, apex 3-toothed. Corolla tube 2-3 cm, slender; lobes pale yellow, lanceolate, central one 1.5-2.5 cm. Labellum pale yellow, ca. 1.5 x 2.5 cm; central lobe suborbicular or subobovate, 1.5-2 x ca. 1.5 cm, apex emarginate; lateral lobes obovate, ca. 1 cm, free nearly to base. Stamen ca. 1 cm; connective appendage beaklike, ca. 8 mm. Ovary ca. 4 mm, glabrous. Capsule ellipsoid, 0.8-1.2 cm. Seeds black. Flower. Jul-Sep, fruit. Oct. 2 n = 22*."

doc = nlp(description)



In [32]:
print(doc.ents)

(Rhizomes, yellowish, tuberous, 0.6-2 m., Leaves, sessile, ligule, entire, 1.5-2 cm, leaf, blade, lanceolate, oblong, lanceolate, 15-40, 3-8 cm, glabrescent, pilose, acuminate, Inflorescences, rhizomes, ovoid, oblong, 6-15, 3.5-5 cm, obtuse, peduncle, 10-30 cm, scalelike, sheaths, 5-7, bracts, imbricate, green, red, hairy, membranous, ca. 1.5 cm, Calyx, 1.2-2 cm, membranous, 1, 3-toothed, Corolla tube, 2-3 cm, lobes, pale yellow, lanceolate, one 1.5-2.5 cm, Labellum, pale yellow, ca. 1.5, 2.5 cm, lobe, suborbicular, subobovate, 1.5-2, ca. 1.5 cm, emarginate, lobes, obovate, ca. 1 cm, free, Stamen, ca. 1 cm, ca. 8 mm, Ovary, ca. 4 mm, glabrous, Capsule, ellipsoid, 0.8-1.2 cm, Seeds, black, Flower, Jul-Sep, fruit, Oct. 2, 22)


In [33]:
# displacy.render(doc, style="ent")

trait_ents = [ent for ent in doc.ents if ent.label_ == 'TRAIT']
# colour_ents = [ent for ent in doc.ents if ent.label_ == 'COLOUR']



# for ent in doc.ents:
#     print(ent.label_)

# print(trait_ents)

In [12]:
accdb = ACCDBTraits()

df = accdb.get_terms('angiosperm')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [35]:
anatomical_parts_path = Path(ASSETS_DIR / spacy_config['file']['anatomical_parts'])
anatomical_parts = yaml.full_load(anatomical_parts_path.open())
part_synonyms = [syns + [part] for part, syns in anatomical_parts.items() if syns]

  

class Field():  
    
    doc = None
    
    def __init__(self, doc, name, part, require_part):
        self.name = name
        self.part = part
        self.require_part = require_part
        self.part_synonyms = self._get_part_synonyms(part) or []
        self._value = None

    @property
    def value(self):
        return self._value
    
    def as_string(self):
        if isinstance(self.value, (list, set)):
            return ','.join(self.value)
                    
        return self.value
    
    def _get_part_synonyms(self, part):
        for syns in part_synonyms:
            if part in syns:
                return syns
            
    def get_sents_filtered_by_part(self, doc):
        if not self.require_part:
            print('NOPE')
            yield from doc.sents
        
        for sent in doc.sents:
            if self.sent_matches_part(sent):
                yield sent
                           
    def sent_matches_part(self, sent):
        if not self.part:
            return True
        else:
            sent_part = self._get_sent_part(sent)
            if sent_part == self.part or sent_part in self.part_synonyms:
                return True
            
        return False
    
    def _get_sent_part(self, sent):
        if sent.start == 0:
            return 'plant'
        elif sent._.anatomical_part:
            return sent._.anatomical_part.lemma_            


class MeasurementField(Field):
    
    # Length/height measurements are provided first, followed by width. 
    dimension_axes = ['y', 'x']  
    require_sent_part = True
    
    def __init__(self, doc, name, part=None, require_part=True):
        super().__init__(doc, name, part, require_part) 
        self.unit = None
        self.parse_value_from_doc(doc)
        
    @property
    def value(self):
        return self.get_converted_value()
    
    def as_string(self):   
        return self.value
    
    def parse_value_from_doc(self, doc):
        for sent in self.get_sents_filtered_by_part(doc):
            print(sent)
            if sent._.dimensions:
                self.parse_dimension_value(sent._.dimensions[0])
                break
            elif sent._.measurements:
                print(sent._.measurements)
                self.parse_measurement_value(sent._.measurements)                
                break
                        
    def parse_dimension_value(self, dimension):

        for i in range(0,2):
             # 0 => 1; 1 => 0
            adj_i = (i-1)**2

            ent = dimension.ents[i]
            axis = self.dimension_axes[i]
            # Sometimes the unit is only attached to one of the dimensions e.g. 1.5-2 x 1.7-2.2 cm             
            unit = ent._.measurement_unit or dimension.ents[adj_i]._.measurement_unit
            self.set_value(axis, ent, unit)

    
    def parse_measurement_value(self, measurements):
        # If we have two measurements, treat them as y, x         
        if len(measurements) == 2:
            for axis, measurement in zip(self.dimension_axes, sent._.measurements):
                self.set_value(axis, measurement, measurement._.measurement_unit)
                
        # FIXME: If we only have one measurement, default to the one expected by the field          
        elif len(measurements) == 1:
            measurement = measurements[0]
            # Default to the field axis             
            self.set_value(self.field_axis, measurement, measurement._.measurement_unit)

    def set_value(self, axis, ent, unit):

        if axis == self.field_axis:
            value_dict = self._get_ent_value(ent)
            unpack = lambda ks: ([v for k in ks if (v := value_dict.get(k))])
            if self.is_minimum:
                self._value = min(unpack(['lower', 'from']), default=None)
            elif self.is_maximum:
                self._value = max(unpack(['to', 'upper']), default=None)
            else:
                raise Exception('Not min or max')

            self.unit = unit
            
    def get_converted_value(self):
        if self._value:
            measurement = float(self._value) * self.unit
            converted_value = measurement.to(self.target_unit) 
            return round(converted_value, 2)        
                        
    @property
    def field_axis(self):
        # length, height and depth are y; width is x axis         
        return 'x' if 'width' in self.name else 'y'
    
    @property
    def target_unit(self):
        if match := re.search('\[([a-z³]+)\]', self.name):
            unit = match.group(1)
            return unit_registry(unit)
        
    # @property
    # def target_unit_symbol(self):
    #     return unit_registry.get_symbol(str(self.target_unit.units))
    
    @property
    def is_minimum(self):
        # Does the field name contain min.         
        return re.search('\smin.', self.name)      
    
    @property
    def is_maximum(self):
        # Does the field name contain max.         
        return re.search('\smax.', self.name)      
    
    @staticmethod
    def _get_ent_value(ent: Span):
        if ent._.numeric_range:
            value = ent._.numeric_range
        else:

            # Also validate shape is d, dd, so cast to int won't fail
            num = [int(token.text) for token in ent if token.pos_ == 'NUM' and set(token.shape_) == set('d')]
            value = {'from': min(num, default=None), 'to': max(num, default=None)} 
   
        return value     
                                                            
# class DimensionField(Field):


class DiscreteField(Field):
        
    accdb_traits = None
    
    @classmethod
    def set_accdb_traits(cls, ents, accdb_terms):
        accdb_traits = set()
        trait_ents = [ent for ent in ents if ent.label_ == 'TRAIT']
        for ent in trait_ents:
            term = ent.text.lower().replace('-', '_')  
            part = ent.sent._.anatomical_part.lemma_ if ent.sent._.anatomical_part else None
            for row in accdb_terms[(accdb_terms.term == term) | (accdb_terms.term == ent.lemma_)].itertuples(): 
                accdb_traits.add((row.trait, row.character, part))
                
        cls.accdb_traits = pd.DataFrame(accdb_traits, columns=['trait', 'character', 'part'])    
                  
    def __init__(self, doc, name, part=None, require_part=True, accdb_name=None):
        super().__init__(doc, name, part, require_part) 
        self.accdb_name = accdb_name if accdb_name else self.name        
        self._value = self.get_accdb_chars()
        
    def get_accdb_chars(self):
        mask = self.accdb_traits['trait'] == self.accdb_name
                
        if self.part:
            mask = (mask) & (self.accdb_traits['part'].isin(self.part_synonyms))
                        
        accdb_chars = self.accdb_traits[mask] 
            
        return set(accdb_chars.character.values)
            
class NumericField(Field):
    
    def __init__(self, doc, name, part=None, require_part=True):
        super().__init__(doc, name, part, require_part) 
        self._value = self.parse_value_from_doc(doc)

    def parse_value_from_doc(self, doc):
        for sent in self.get_sents_filtered_by_part(doc):
            for ent in sent.ents:
                if ent.label_.lower() == self.name:
                    return ent._.trait_value
                
class ColourField(Field):  
    def __init__(self, doc, name, part=None, require_part=True):
        super().__init__(doc, name, part, require_part) 
        self._value = self.parse_value_from_doc(doc)  
        
    def parse_value_from_doc(self, doc):
        for sent in self.get_sents_filtered_by_part(doc):
            return [ent.lemma_ for ent in sent.ents if ent.label_ == 'COLOUR']

DiscreteField.set_accdb_traits(doc.ents, accdb.get_terms('angiosperm'))

fields = yaml.safe_load((CORPUS_DIR / 'fields.yaml').open())['angiosperm']

class VolumeField(MeasurementField): 
    def parse_value_from_doc(self, doc):
        for sent in self.get_sents_filtered_by_part(doc):
            if sent._.volume_measurements:
                self.parse_measurement_value(sent._.volume_measurements)                
                break    

class FieldFactory(object):
    
    classes = {
        'colour': ColourField,
        'discrete': DiscreteField,
        'measurement': MeasurementField,
        'numeric': NumericField,
        'volume': VolumeField,
    }
    
    @classmethod
    def factory(cls, doc, field_dict):
        field_type = field_dict.pop('type', 'DISCRETE').lower()
        return cls.classes[field_type](doc, **field_dict)
    
    # factory = staticmethod(factory) 

for field_dict in fields:    
    if field_dict['name'] != 'indumentum': continue
    print(field_dict)
    field = FieldFactory.factory(doc, field_dict)
    print(field.value)
    # print(re.search('\[([a-z³]+)\]', 'sss [mm]'))
          
        

          # print(field.value)
    
    # print(field.value)
    
    
#     # TODO - field factory
    
#     # print(field)
#     field_type = field.pop('type', 'DISCRETE')

#     if field_type == 'COLOUR':
#         f = ColourField(doc, **field)
        
#         print(f.value)    
    
#     if field_type == 'NUMERIC':
#         print(field)
#         f = NumericField(doc, **field)
        
#         print(f.value)
        # print(field)
        # ents = [ent.label_ for ent in doc.ents if ent._.trait_value] 
        # print(ents)        

#     if field_type == 'DIMENSION':
        
        
#         field_cls = MeasurementField(doc, **field)
#         print(field_cls.value)
        
#     elif field_type == 'DISCRETE':
#         field_cls = DiscreteField(doc, **field)
#         print(field_cls.value)
        
        # break

    
    continue
    
#     # DISCRETE     
#     accdb_name = field.get('accdb_name', field['name'])
    
#     # print(field)
    
#     try:
#         part = field['part']
#     except KeyError:
#         accdb_chars = accdb_traits[accdb_traits['trait'] == accdb_name]
#     else:
#         parts_list = get_part_with_synonyms(part)
#         accdb_chars = accdb_traits[(accdb_traits['trait'] == accdb_name) & (accdb_traits['part'].isin(parts_list))]
#     finally:
#         if not accdb_chars.empty:
#             print(field['name'])
#             print(set(accdb_chars.character.values))
        
    # print(field)
    # if field['part']:
    #     print(field['part'])
        
    # chars = accdb_traits[accdb_traits['trait'] == accdb_name]
    # print(chars)
    
    

{'name': 'indumentum', 'type': 'DISCRETE'}
{'hairy', 'glabrous', 'pilose', 'pubescent', 'glabrescent'}


In [38]:
print(dir(field.target_unit))

['T', 'UnitsContainer', '_Quantity__handling', '_Quantity__ito_if_needed', '_Quantity__used', '_REGISTRY', '__abs__', '__add__', '__annotations__', '__array__', '__array_function__', '__array_priority__', '__array_ufunc__', '__bool__', '__bytes__', '__class__', '__class_getitem__', '__complex__', '__copy__', '__dask_graph__', '__dask_keys__', '__dask_optimize__', '__dask_postcompute__', '__dask_postpersist__', '__dask_scheduler__', '__dask_tokenize__', '__deepcopy__', '__delattr__', '__dict__', '__dir__', '__div__', '__divmod__', '__doc__', '__eq__', '__float__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__iadd__', '__idiv__', '__ifloordiv__', '__imod__', '__imul__', '__init__', '__init_subclass__', '__int__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__le__', '__len__', '__lt__', '__matmul__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__orig_bases__', '__paramete

In [48]:
field.target_unit.to_tuple()

# print(format(field.target_unit, '~')) 



1 mm ** 3


In [355]:
import itertools

set(itertools.chain.from_iterable(df[df.trait.str.contains('colour')][['term', 'character']].values))

{'azure',
 'beige',
 'black',
 'black-girdled',
 'blackish',
 'blue',
 'blue-gray-green',
 'blue-lilac',
 'blue-mauve',
 'blue-purple',
 'blue-violet',
 'bluer',
 'bluish',
 'bluish-tipped',
 'brown',
 'brown-and-white',
 'browner',
 'brownish',
 'burgundy',
 'cherry',
 'cinereous',
 'cinereous-brunescent',
 'cinereous-luteous',
 'copper',
 'copper-brown',
 'coppery',
 'coral',
 'dark-brown',
 'dark-pink',
 'darker-pink',
 'emerald',
 'ferrugineous',
 'ferruginous',
 'flavescent',
 'fleshy-mauve',
 'fuchsia',
 'fuscous',
 'gold',
 'gray',
 'gray-blue-black',
 'gray-brown-strigose',
 'gray-green-strigose',
 'gray-red-brown',
 'gray-white',
 'graying',
 'grayish',
 'green',
 'green-and-white',
 'green-and-white-mottled',
 'green-and-white-striped',
 'green-tipped',
 'greene',
 'greener',
 'greenish',
 'greenish-tipped',
 'grey',
 'hazel',
 'ivory',
 'lavender',
 'lilac',
 'lime',
 'maroon',
 'mauve',
 'mauvish',
 'ochraceous',
 'ochre',
 'orange',
 'orangebrown',
 'orangish',
 'pale-blui

In [103]:
anatomical_parts_path = Path(ASSETS_DIR / spacy_config['file']['anatomical_parts'])
anatomical_parts = yaml.full_load(anatomical_parts_path.open())

# synonyms_df = pd.DataFrame([syns + [part] for part, syns in anatomical_parts.items() if syns], columns=['synonym'])

synonyms = [syns + [part] for part, syns in anatomical_parts.items() if syns]


x = [s for s in synonyms if 'leaf' in s]


# print(anatomical_parts)

# synonyms = {syn: part for part, synonyms in anatomical_parts.items() for syn in synonyms or [] }


SyntaxError: 'yield' outside function (1589971710.py, line 9)

In [86]:
print(synonyms)

{'androecia': 'androecium', 'berries': 'berry', 'calyces': 'calyx', 'cambia': 'cambium', 'caules': 'caulis', 'clinandria': 'clinandrium', 'cortices': 'cortex', 'costae': 'costa', 'cypselae': 'cypsela', 'dichasia': 'dichasium', 'epidermises': 'epidermis', 'drupe': 'fruit', 'drupelet': 'fruit', 'pome': 'fruit', 'gynoecia': 'gynoecium', 'hibernacula': 'hibernaculum', 'hypanthia': 'hypanthium', 'capitula': 'inflorescence', 'capitulum': 'inflorescence', 'head': 'inflorescence', 'floret': 'inflorescence', 'ray floret': 'inflorescence', 'flower': 'inflorescence', 'ray-floret': 'inflorescence', 'ray flower': 'inflorescence', 'inferior ovaries': 'inferior ovary', 'involucra': 'involucre', 'laminae': 'lamina', 'leaves': 'leaf', 'leave': 'leaf', 'blade': 'leaf', 'blades': 'leaf', 'lemmata': 'lemma', 'monochasia': 'monochasium', 'nectaries': 'nectary', 'nucelli': 'nucellus', 'ocreae': 'ocrea', 'ovaries': 'ovary', 'paleae': 'palea', 'pappi': 'pappus', 'corollas': 'petal', 'placentae': 'placenta', '

In [149]:
sents = list(doc.sents)
sents[2].start

16

In [187]:
x = ['b']

In [188]:
x*2 if len(x) == 1 else x

['b', 'b']

In [196]:
# 0 => 1; 1 => 0

x = 1

(x-1)**2


0

In [226]:
import re

re.search('\[([a-z]+)\]', 'petal min. width [cm]').group(1)

'cm'

In [54]:
df = pd.read_csv('test.csv')

In [55]:
df.head()

Unnamed: 0.1,Unnamed: 0,taxon,life form,habitat,habit,clonality,perennial organ,plant min. height [m],plant max. height [m],indumentum,...,seed min. length [mm],seed max. length [mm],dispersal mode,dispersion axillary,ploidy level (2n),seed min. volume [mm³],seed max. volume [mm³],root min. depth [cm],root max. depth [cm],source
0,0,Liquidambar acalycina,perennial,,"tree,tree/shrub","solitary plant,seeds",trunk,25.0 m,25.0 m,"glabrous,tomentose,pubescent",...,,,,"filaments,seed winged,absent",,,,,,flora_of_china
1,1,Liquidambar formosana,,,"tree,tree/shrub","solitary plant,seeds",trunk,,,"glabrous,pubescent,glandular",...,,,,filaments,,,,,,flora_of_china
2,2,Liquidambar formosana,perennial,,"tree,tree/shrub",solitary plant,trunk,,,,...,,,,,,,,,,wikipedia
3,3,Quercus acrodonta,perennial,,"tree,tree/shrub,shrub",solitary plant,trunk,,,"tomentose,pubescent,stellate",...,,,,,,,,,,flora_of_china
4,4,Quercus acutissima,perennial,,tree,solitary plant,trunk,,,"glabrous,tomentose,sericeous,glabrescent",...,,,,,,,,,,flora_of_china


In [117]:
import itertools
x = df.groupby('taxon')

# [y.explode]

def merge(series):
    
    series_no_nan = [x for x in series.values if pd.notnull(x)]
    
    if not series_no_nan:
        return None

    if series.dtype in ('int64', 'float64'):
        return min(series.values)
    else:
        return ','.join(set(itertools.chain.from_iterable(v.split(',') for v in series_no_nan)))

for name, group in df.groupby('taxon', dropna=False):
    group.reset_index(drop=True, inplace=True)
    row = {col: merge(g) for col, g in group.items()}
    row['taxon'] = name
    
    


{'Unnamed: 0': 0, 'taxon': 'Liquidambar acalycina', 'life form': 'perennial', 'habitat': None, 'habit': 'tree/shrub,tree', 'clonality': 'solitary plant,seeds', 'perennial organ': 'trunk', 'plant min. height [m]': '25.0 m', 'plant max. height [m]': '25.0 m', 'indumentum': 'tomentose,glabrous,pubescent', 'spinescence': None, 'succulence': None, 'leaf arrangement': None, 'leaf architecture': 'palmate', 'leaf position': None, 'leaf shape': 'ovate', 'leaf apex': 'acute', 'leaf base': None, 'leaf margin': None, 'leaf min. width [cm]': '8.0 cm', 'leaf max. width [cm]': '15.0 cm', 'leaf min. length [cm]': '8.0 cm', 'leaf max. length [cm]': '13.0 cm', 'inflorescence arrangement': 'raceme,single flower/solitary flower', 'flower sex': 'unisexual', 'flower architecture': None, 'flower merosity': None, 'flower symmetry': None, 'flower shape': None, 'flower colour': 'brown,black', 'petal fusion': None, 'petal colour': None, 'petal min. width [cm]': None, 'petal max. width [cm]': None, 'calyx colour'

In [29]:
df2 = pd.read_excel(CORPUS_DIR / 'traits/angiosperm.xlsx')

In [30]:
# df.head()

def combine_species_with_varieties(df):
    for i, row in df[df.taxon.str.contains('var.')].iterrows():
        # For varieties, we want to reset plant height as it's rarely included     
        for reset_col in ['plant min. height [m]', 'plant max. height [m]']:
            df.loc[i, reset_col] = np.nan
            
        species_name = row.taxon.split('var.')[0].strip()  

        try:
            species = df[df.taxon == species_name].iloc[0]
        except IndexError:
            print('No species for ', row.taxon)
        else:
            for col, value in species[species.notna()].items():
                # Does the original have a value
                if pd.isnull(df.loc[i, col]) or df.loc[i, col] == '':
                    print (value)
                    df.loc[i, col] = value       
      
    return df


combine_species_with_varieties(df2)



Unnamed: 0.1,Unnamed: 0,taxon,life form,habitat,habit,clonality,perennial organ,plant min. height [m],plant max. height [m],indumentum,...,seed min. length [mm],seed max. length [mm],dispersal mode,dispersion axillary,ploidy level (2n),seed min. volume [mm³],seed max. volume [mm³],root min. depth [cm],root max. depth [cm],source
0,0,Alpinia bambusifolia,,,,solitary plant,,,,"hairy,glabrous,pubescent,ciliate",...,,,,absent,,,,,,flora_of_china
1,1,Alpinia blepharocalyx,perennial,,,seeds,,0.01 m,0.01 m,"tomentose,pubescent,glabrescent,glabrous,hirsu...",...,12.0 mm,16.0 mm,,,,,,,,flora_of_china
2,2,Alpinia blepharocalyx var. blepharocalyx,perennial,,,seeds,,0.01 m,0.01 m,villouse,...,12.0 mm,16.0 mm,,,,,,,,flora_of_china
3,3,Alpinia blepharocalyx var. glabrior,perennial,,,seeds,,0.01 m,0.01 m,glabrous,...,12.0 mm,16.0 mm,,,,,,,,flora_of_china
4,4,Alpinia brevis,,,,seeds,,,,"glabrous,pubescent,villouse",...,4.0 mm,5.0 mm,,absent,,,,,,flora_of_china
5,5,Alpinia calcarata,,,tree/shrub,solitary plant,trunk,,,"velutinous,glabrous,pubescent,sericeous",...,,,ectzoochory,"awn,bristles",,,,,,flora_of_china
6,6,Alpinia conchigera,,,tree/shrub,"solitary plant,seeds",trunk,0.01 m,0.01 m,"tomentose,pubescent,glabrous",...,,,,,,,,,,flora_of_china
7,7,Alpinia coriacea,,,,,,0.5 m,0.6 m,"ciliate,glabrous,pubescent,pilose",...,,,,absent,,,,,,flora_of_china
8,8,Alpinia coriandriodora,perennial,,erect leafy/tussock,"solitary plant,rhizome",rhizome,,,"hairy,velutinous,pubescent,glabrescent,glabrou...",...,,,,,,,,,,flora_of_china
9,9,Alpinia dolichocephala,,,,,,,,"glabrous,sericeous,hirsute,villouse",...,,,,absent,,,,,,flora_of_china


In [27]:
df2.head()

Unnamed: 0.1,Unnamed: 0,taxon,life form,habitat,habit,clonality,perennial organ,plant min. height [m],plant max. height [m],indumentum,...,seed min. length [mm],seed max. length [mm],dispersal mode,dispersion axillary,ploidy level (2n),seed min. volume [mm³],seed max. volume [mm³],root min. depth [cm],root max. depth [cm],source
0,0,Alpinia bambusifolia,,,,solitary plant,,,,"ciliate,pubescent,glabrous,hairy",...,,,,absent,,,,,,flora_of_china
1,1,Alpinia blepharocalyx,perennial,,,seeds,,0.01 m,0.01 m,"glabrous,hirsute,ciliate,tomentose,villouse,pu...",...,12.0 mm,16.0 mm,,,,,,,,flora_of_china
2,2,Alpinia blepharocalyx var. blepharocalyx,perennial,,,seeds,,0.01 m,0.01 m,villouse,...,12.0 mm,16.0 mm,,,,,,,,flora_of_china
3,3,Alpinia blepharocalyx var. glabrior,perennial,,,seeds,,0.01 m,0.01 m,glabrous,...,12.0 mm,16.0 mm,,,,,,,,flora_of_china
4,4,Alpinia brevis,,,,seeds,,,,"glabrous,pubescent,villouse",...,4.0 mm,5.0 mm,,absent,,,,,,flora_of_china


In [128]:
x = df[df['Species name'].str.contains('var.')]
x['Species name'] = df['Species name'].str.split('var.').str.get(0)

df.append(x).drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Species name'] = df['Species name'].str.split('var.').str.get(0)


Unnamed: 0,Species name,Family,Major group
0,Alpinia bambusifolia,,Angiosperm
1,Alpinia blepharocalyx,,Angiosperm
2,Alpinia blepharocalyx var. blepharocalyx,,Angiosperm
3,Alpinia blepharocalyx var. glabrior,,Angiosperm
4,Alpinia brevis,,Angiosperm
...,...,...,...
133,Globba schomburgkii,,Angiosperm
146,Hedychium forrestii,,Angiosperm
162,Hedychium spicatum,,Angiosperm
167,Hedychium villosum,,Angiosperm


In [125]:
x

Unnamed: 0,Species name,Family,Major group
2,Alpinia blepharocalyx,,Angiosperm
3,Alpinia blepharocalyx,,Angiosperm
15,Alpinia galanga,,Angiosperm
16,Alpinia galanga,,Angiosperm
30,Alpinia maclurei,,Angiosperm
55,Alpinia stachyodes,,Angiosperm
56,Alpinia stachyodes,,Angiosperm
58,Alpinia strobiliformis,,Angiosperm
59,Alpinia strobiliformis,,Angiosperm
102,Amomum villosum,,Angiosperm


In [2]:
df = pd.read_csv(ASSETS_DIR / 'susies-species.csv')

In [36]:
# for i, row in df[df['Species name'].str.contains('var\.|subsp\.')].iterrows():
#     n = row['Species name']
#     # Regex to match species name before var. or subsp.   
#     print(n)
#     species_name = re.match('^[a-zA-Z\s]+(?=(\svar\.|\ssubsp\.))', n).group(0)
#     print(species_name)
    
#     # m = re.match()
#     # print(n)
    

# df[df['Species name'].str.contains('var.|subsp.')].head(100)
# df.head()

df = pd.read_csv(ASSETS_DIR / 'susies-species.csv')

def get_variety_higher_taxa(name):
    # Regex to match species name before var. or subsp.  
    return re.match('^[a-zA-Z\s]+(?=(\svar\.|\ssubsp\.))', name).group(0)

var_species = df[df['Species name'].str.contains('var\.|subsp\.')].loc[:]

var_species['Species name'] = var_species['Species name'].apply(get_variety_higher_taxa)

# var_species.loc[:, 'Species name'] = 4

var_species.head()


Unnamed: 0,Species name,Family,Major group
2,Amygdalus davidiana,,Angiosperm
3,Amygdalus davidiana,,Angiosperm
19,Armeniaca mandshurica,,Angiosperm
20,Armeniaca mandshurica,,Angiosperm
22,Armeniaca mume,,Angiosperm


In [27]:
re.match('^[a-zA-Z\s]+(?=(\svar\.|\ssubsp\.))', 'Amygdalus davidiana var. davidiana').group(0)

'Amygdalus davidiana'

In [51]:
df1 = pd.read_excel(CORPUS_DIR / 'traits/angiosperm.xlsx', sheet_name="Combined")
df2 = pd.read_excel(CORPUS_DIR / 'traits/angiosperm.xlsx', sheet_name="flora_of_china")

df2['habit'] = 'pies, cheese'

In [72]:
df3 = df1.append(df2)

In [113]:
import itertools
import statistics

def agg_discrete(series):
    if len(series) > 1:
        combined_series = set(itertools.chain.from_iterable([s.split(',') for s in series]))
        return ', '.join([s for s in combined_series if pd.notnull(s)])
    return series
            
from adept.config import unit_registry    
    
def agg_measurement(series):
    if len(series) > 1:
        measurements = [unit_registry.Quantity(s) for s in series]    
        m = statistics.mean([meas.m for meas in measurements])
        units = {meas.u for meas in measurements}
        assert(len(units) == 1)
        return unit_registry.Quantity(m, units.pop())
    return series
    


df2.groupby('taxon').agg({'habit': agg_discrete,  'plant min. height [m]': agg_measurement})

# x = unit_registry.Quantity('2.0 m')
# print(x.m)


Unnamed: 0_level_0,habit,plant min. height [m]
taxon,Unnamed: 1_level_1,Unnamed: 2_level_1
Amygdalus communis,"pies, cheese",2.0 m
Amygdalus davidiana,"pies, cheese",10.0 m


In [134]:
fields = yaml.safe_load((CORPUS_DIR / 'fields.yaml').open())['angiosperm']

class Aggregator():

    # Public class methods     
    @classmethod
    def agg_discrete(cls, series):
        return cls._agg('_agg_string', series)  
    
    @classmethod
    def agg_colour(cls, series):
        return cls._agg('_agg_string', series)
    
    @classmethod
    def agg_numeric(cls, series):
        return cls._agg('_agg_numeric', series) 
    
    @classmethod
    def agg_volume(cls, series):
        return cls._agg('_agg_measurement', series) 
    
    @classmethod
    def agg_measurement(cls, series):
        return cls._agg('_agg_measurement', series)      
    
    @classmethod
    def _agg(cls, method, series):
        # Remove any NAN series         
        series = [s for s in series if pd.notnull(s)]
        
        if not series:
            return None
        elif len(series) <= 1:
            return series
        
        return getattr(cls, method)(series)
                    
    @classmethod
    def _agg_string(cls, series):    
        combined_series = set(itertools.chain.from_iterable([s.split(',') for s in series]))
        return ', '.join([s for s in combined_series if pd.notnull(s)])        
        
    @classmethod
    def _agg_measurement(cls, series):
        measurements = [unit_registry.Quantity(s) for s in series]    
        m = statistics.mean([meas.m for meas in measurements])
        units = {meas.u for meas in measurements}
        assert(len(units) == 1)
        return unit_registry.Quantity(m, units.pop())
    
    @classmethod
    def _agg_numeric(cls, series):    
        return statistics.mean(series)
    

agg = {}
for field in fields:    
    agg[field['name']] = getattr(Aggregator, f"agg_{field['type'].lower()}")
    
df3.groupby('taxon').agg(agg)




Unnamed: 0_level_0,life form,habitat,habit,clonality,perennial organ,plant min. height [m],plant max. height [m],indumentum,spinescence,succulence,...,seed max. width [mm],seed min. length [mm],seed max. length [mm],dispersal mode,dispersion axillary,ploidy level (2n),seed min. volume [mm³],seed max. volume [mm³],root min. depth [cm],root max. depth [cm]
taxon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Amygdalus communis,,,"tree, shrub, tree/shrub, pies, cheese",solitary plant,trunk,2.0 meter,8.0 meter,"glabrous, pubescent, glabrescent, tomentose, p...",unarmed,,...,,,,,,,,,,
Amygdalus davidiana,,,"pies, tree, cheese, tree/shrub",solitary plant,trunk,10.0 meter,10.0 meter,"pubescent, glabrous",,,...,,,,,,,,,,
