In [27]:
import typer
import pandas as pd
import spacy
from pathlib import Path
from tqdm import tqdm
import numpy as np
import warnings
import yaml
import itertools
from collections import OrderedDict
import luigi

from adept.components.registry import ComponentsRegistry
from adept.components.sentencizer import Sentencizer
from adept.components.numeric import (NumericDimension, NumericExpand, NumericFraction, NumericMeasurement, NumericRange)
from adept.components.anatomical import AnatomicalEntity
from adept.components.traits import DiscreteTraitsEntity
from adept.components.traits import CustomTraitsEntity
from adept.traits import Traits
from adept.tasks.patterns.anatomy import AnatomyPatternsTask, AnatomicalPartsTask
from adept.utils.helpers import token_get_ent
from adept.preprocess import Preprocess
from adept.config import RAW_DATA_DIR, INTERMEDIATE_DATA_DIR

import luigi
from traitlets import default
import json
from pathlib import Path

from adept.config import RAW_DATA_DIR, PROCESSED_DATA_DIR, INTERMEDIATE_DATA_DIR
from adept.traits import Traits
from adept.utils.patterns import Patterns

from adept.tasks.descriptions.ecoflora import EcofloraDescriptionTask
from adept.tasks.descriptions.efloras import EflorasChinaDescriptionTask, EflorasMossChinaDescriptionTask, EflorasNorthAmericaDescriptionTask, EflorasPakistanDescriptionTask
from adept.pipeline import Pipeline
from adept.config import taxonomic_groups, logger
from adept.tasks.base import BaseTask

In [28]:
%load_ext autoreload
%autoreload 2

In [100]:
inputs = [INTERMEDIATE_DATA_DIR / 'descriptions' / 'bolboschoenus-maritimus.json', INTERMEDIATE_DATA_DIR / 'descriptions' / 'acacia-auriculiformis.json']

In [141]:
out = PROCESSED_DATA_DIR / 'traits.xlsx'

def merge_rows(rows):
    # Remove any empty rows     
    rows = rows[rows.notnull()]
        
    if rows.empty:
        return
    
    if is_numeric_dtype(rows):
        return rows.mean().round(2)
    
    num_unit = [g for row in rows if (g := num_unit_regex.match(row))]
    
    if num_unit:
        num = np.array([float(n.group(1)) for n in num_unit]).mean().round(2)
        unit = num_unit[0].group(2)   
        return f'{num} {unit}'
    
    # Ploidy 2n contains , so we don't want to split or concatenate on ','     
    text_list = [s.split(',') for s in rows if not s.startswith('2n')]
    if text_list:
        return ', '.join(set([t.strip() for t in itertools.chain(*text_list)]))
    else:
        return '| '.join(rows)        

dfs = []    
combined_dfs = []
for input_path in inputs:
    taxon = input_path.stem.replace('-', ' ').capitalize()
    df = pd.read_json(input_path)
    df.insert(0, 'taxon', taxon)
    combined = df.groupby('taxon').agg(merge_rows).reset_index()
    combined['source'] = 'combined'
    
    dfs.append(df)
    combined_dfs.append(combined)
    
    
combined_dfs = pd.concat(combined_dfs)
dfs = pd.concat(dfs)

nan_cols = set(dfs.columns).difference(set(['taxon', 'source']))

with pd.ExcelWriter(out) as writer:
    combined_dfs.to_excel(writer, sheet_name="combined", index=False)
    for source, group in dfs.groupby('source'):
        sheet_name = source.replace('/', '-')
        # If we don't have any values in a row, drop it         
        group = group.dropna(subset=nan_cols, how="all")
        group.to_excel(writer, sheet_name=sheet_name, index=False)  
        
    df.groupby('taxon').agg(merge_rows).reset_index()
        




df = pd.read_json(desc)

In [42]:
df['taxon'] = 'Eleocharis palustris'

In [50]:
df['stamen number']

0    3.0
1    NaN
2    3.0
Name: stamen number, dtype: float64

In [99]:
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from itertools import chain

num_unit_regex = re.compile('([\d\.]+)\s([a-z]+)')

def merge_rows(rows):
    # Remove any empty rows     
    rows = rows[rows.notnull()]
    
    if rows.empty:
        return
    
    if is_numeric_dtype(rows):
        return rows.mean().round(2)
    
    num_unit = [g for row in rows if (g := num_unit_regex.match(row))]
    
    if num_unit:
        num = np.array([float(n.group(1)) for n in num_unit]).mean().round(2)
        unit = num_unit[0].group(2)   
        return f'{num} {unit}'
    else:
        text = set([t.strip() for t in itertools.chain(*[s.split(',') for s in rows])])
        return ', '.join(text)
        
        
        
        # print(np.char.split(rows))
        
        # print(rows)
        # print(unit)
    
    
    print(num_unit)
    # return 'hey'

df.groupby('taxon').agg(merge_rows)


# p = re.compile('([\d\.]+)\s([a-z]+)')
# p.match('30020202.1919191910 cm').groups()



Unnamed: 0_level_0,life form,habitat,habit,clonality,perennial organ,plant min. height [m],plant max. height [m],indumentum,spinescence,succulence,...,seed max. width [mm],seed min. length [mm],seed max. length [mm],dispersal mode,ploidy level (2n),seed min. volume [mm³],seed max. volume [mm³],root min. depth [cm],root max. depth [cm],source
taxon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Eleocharis palustris,perennial,"hygrophilous, aquatic","erect leafy, erect leafy/tussock",rhizome,"stem, rhizome",0.4 m,1.25 m,"pubescent, scabrous, glabrous",,,...,2.8 mm,2.3 mm,4.1 mm,ectzoochory,"104, 2n = 64, 112",,,,,"ecoflora, efloras/flora_of_north_america, eflo..."


In [13]:
pipeline = Pipeline()

INIT custom_sentencizer
INIT custom_traits_entity
INIT numeric_expand
INIT numeric_dimensions
INIT numeric_measurements
INIT numeric_range
INIT numeric_fraction


In [20]:
text = " A stout glabrous perennial, 30-100 cm. Rhizome producing short runners, becoming wiry and black, tuberous at tip. Stems sharply triquetrous, rough towards top, lfy. Lvs to c 10 mm wide, keeled, margins rough.  Infl c 5 cm, dense, corymbose, terminal. Bracts 2-3, lf-like to setaceous, the larger much longer than infl, patent to reflexed. Spikelets 10-20(-40) mm, rather few, ovoid to ellipsoid, terete, red-brown, sessile or in groups or 2-5(-10) at the ends of branches.  Glumes 5-7 x 2.5-3.2 mm, ovate, acute, apex bifid, awned from the sinus to 1.7 mm, glabrous to sparsely pubescent, spirally arranged. Bristles shorter than nut, brown. Stamens 3, filaments flattened; anthers 3-5 mm. Stigmas (2-)3.  Nut 2.2-4 mm, broadly obovate from a cuneate base, plano-convex, brown, shiny, smooth. "

In [36]:

taxon_group = 'angiosperm'
fields = pipeline(text, taxon_group)

In [38]:
import re

template_path = RAW_DATA_DIR / 'fields.tpl.yml'

class FieldOutputTemplate():
    
    """
    Load output field defintions from a template file, and map the output to the field names
    We do this, so th eoutput can be prepared for AC
    """
    
    regexes = {
        'unit': re.compile('\[([a-z³]+)\]'),
        'minmax': re.compile(r'\b(min|max)\b'),
        'axis': re.compile(r'\b(x|y)\b')
    }

    def __init__(self, template_path: Path):
        with template_path.open('r') as f:
            self._tpl = yaml.full_load(f)
        
    def get_data(self, fields):
        return {src: self._get_value(src, targets, fields) for src, targets in self._tpl.items()}
        
    def _get_value(self, src, targets, fields):        
        # Template can have a list of targets - so if just a string convert to a list         
        if not isinstance(targets, list):
            targets = [targets]
            
        for target in targets:
            if value := self._get_field_value(src, target, fields):
                return value
            
    def _get_field_value(self, src, target, fields):        
        field_dict = {}
        self._re('unit', src, field_dict)
        if target:
            field_name = target.split('.')[0]
            self._re('minmax', target, field_dict)
            self._re('axis', target, field_dict)
        else:
            field_name = src

        if field := fields.get(field_name):
            value = field.get_value(**field_dict)
            
    def _re(self, name, field_name, field_dict):        
        if match := self.regexes[name].search(field_name):
            field_dict[name] = match.group(1)
            
tpl = FieldOutputTemplate(template_path)
print(tpl.get_data(fields._fields))            

{'life form': None, 'habitat': None, 'habit': None, 'clonality': None, 'perennial organ': None, 'plant min. height [m]': None, 'plant max. height [m]': None, 'indumentum': None, 'spinescence': None, 'succulence': None, 'leaf arrangement': None, 'leaf architecture': None, 'leaf position': None, 'leaf shape': None, 'leaf apex': None, 'leaf base': None, 'leaf margin': None, 'leaf min. width [cm]': None, 'leaf max. width [cm]': None, 'leaf min. length [cm]': None, 'leaf max. length [cm]': None, 'inflorescence arrangement': None, 'flower sex': None, 'flower architecture': None, 'flower merosity': None, 'flower symmetry': None, 'flower shape': None, 'flower colour': None, 'petal fusion': None, 'petal colour': None, 'petal min. width [cm]': None, 'petal max. width [cm]': None, 'calyx colour': None, 'calyx min. length [cm]': None, 'calyx max. length [cm]': None, 'corolla colour': None, 'corolla tube min. length [cm]': None, 'corolla tube max. length [cm]': None, 'labellum colour': None, 'label