In [5]:
import pandas as pd
import re

from adept.config import RAW_DATA_DIR, logger
from adept.traits.accdb import ACCDBTraits
from dataclasses import dataclass, field

In [6]:
xslx_path = RAW_DATA_DIR / 'functional-trait-list.xlsx' 

In [7]:
# df = pd.(self.xslx, sheet_name=sheet_name)

xlsx = pd.ExcelFile(xslx_path)

In [78]:
def match_dimension(s):
    return re.search(r'(.*)[\(\[]([a-zμ]{1,2})[\)\]]', s)

def normalise_dimensions(columns):
    
    columns = [c.replace('maximum', 'max.').replace('minimum', 'min.').replace(' cm', ' [cm]') for c in columns]

    for column in columns:
        # Search for (xx) or [x)     
        match = match_dimension(column)
        if match and not re.search('(min|max)', column):
            trait = match.group(1).split()
            unit = match.group(2)  
            yield from [f'{trait[0]} {mm}. {trait[1]} [{unit}]' for mm in ['min', 'max']]
        else:
            yield column
    

xslx_columns = {}    
    
for sheet_name in xlsx.sheet_names:
    group = sheet_name.replace('traits', '').strip().lower()
    print(group)
    df = pd.read_excel(xslx_path, sheet_name=sheet_name)
    columns = list(map(str.lower, df.columns))
    columns = list(map(str.strip, columns))
    columns = list(normalise_dimensions(columns))
                
    replacement_columns = {
        'max. plant height (m)': ['plant min. height [m]', 'plant max. height [m]'],
        'plant min. size [cm]': ['plant min. height [cm]'],
        'plant max. size [cm]': ['plant max. height [cm]'],
        'capsule min. size [mm]': ['capsule min. height [mm]'],
        'capsule max. size [mm]': ['capsule max. width [mm]'],
        'seeds per fruit_max.': ['seeds per fruit max.'],
        'seeds per fruit_min.': ['seeds per fruit min.'],
        'max seed volume': ['seed min. volume', 'seed max. volume']
    }
    
    for col, replacement in replacement_columns.items():
        if col in columns:
            idx = columns.index(col)
            columns[idx:idx+1] = replacement

    xslx_columns[group] = columns 


xslx_columns['angiosperm']    

angiosperm
bryophyte
pteridophyte


['life form',
 'habitat',
 'habit',
 'clonality',
 'perennial organ',
 'plant min. height [m]',
 'plant max. height [m]',
 'indumentum',
 'spinescence',
 'succulence',
 'leaf arrangement',
 'leaf architecture',
 'leaf position',
 'leaf shape',
 'leaf apex',
 'leaf base',
 'leaf margin',
 'leaf min. width [cm]',
 'leaf max. width [cm]',
 'leaf min. length [cm]',
 'leaf max. length [cm]',
 'inflorescence arrangement',
 'flower sex',
 'flower architecture',
 'flower merosity',
 'flower symmetry',
 'flower shape',
 'flower colour',
 'petal fusion',
 'petal colour',
 'reproduction architecture',
 'reproduction system',
 'stamen number',
 'stamen arrangement',
 'carpel/ovary number',
 'gynoecium arrangement',
 'heterostyly',
 'pollination',
 'fruit type',
 'fruit structure',
 'fruit dehiscence',
 'fruit shape',
 'fruit colour',
 'dispersule min. width [cm]',
 'dispersule max. width [cm]',
 'dispersule min. length [cm]',
 'dispersule max. length [cm]',
 'seed colour',
 'seeds per fruit max.',

In [63]:
idx = xslx_columns['angiosperm'].index('petal colour')

xslx_columns['angiosperm'][idx+1:idx+1] = ['petal min. width [cm]', 'petal max. width [cm]']

xslx_columns['angiosperm'] += [
    'root min. depth [cm]', 
    'root max. depth [cm]',
]

print(xslx_columns)

{'angiosperm': ['life form', 'habitat', 'habit', 'clonality', 'perennial organ', 'plant min. height [m]', 'plant max. height [m]', 'indumentum', 'spinescence', 'succulence', 'leaf arrangement', 'leaf architecture', 'leaf position', 'leaf shape', 'leaf apex', 'leaf base', 'leaf margin', 'leaf min. width [cm]', 'leaf max. width [cm]', 'leaf min. length [cm]', 'leaf max. length [cm]', 'inflorescence arrangement', 'flower sex', 'flower architecture', 'flower merosity', 'flower symmetry', 'flower shape', 'flower colour', 'petal fusion', 'petal colour', 'petal min. width [cm]', 'petal max. width [cm]', 'reproduction architecture', 'reproduction system', 'stamen number', 'stamen arrangement', 'carpel/ovary number', 'gynoecium arrangement', 'heterostyly', 'pollination', 'fruit type', 'fruit structure', 'fruit dehiscence', 'fruit shape', 'fruit colour', 'dispersule min. width [cm]', 'dispersule max. width [cm]', 'dispersule min. length [cm]', 'dispersule max. length [cm]', 'seed colour', 'seeds

In [64]:
# Validate the columns against the ACCDBTraits

accdb = ACCDBTraits()


In [74]:
def is_numeric(field_name):
    for s in ['number', 'min', 'max', 'no.', '2n']:
        if s in field_name:
            return True

def get_field_type(field_name):
    if match_dimension(field_name):
        return 'DIMENSION'
    elif is_numeric(field_name):
        return 'NUMERIC'
    elif 'colour' in field_name.lower():
        return 'COLOUR'
    else:
        return 'DISCRETE'
    

min_max = ['min', 'max']

parts = [
    'leaf',
    'seed', 
    'flower', 
    'inflorescence',
    'petal',
    'stamen',
    'carpel/ovary',
    'gynoecium',
    'fruit',
    'dispersule',
    'seed',
    'lamina',
    'stem',
    'spore',
    'capsule',
    'seta'
]

fields = []

def match_part(field_name):
    
    part_mappings = {
        'dispersal mode': 'seed',
        'dispersion axillary': 'seed'
    }
    
    if part_mappings.get('field_name'):
        return part_mappings.get('field_name')
    
    parts_str = "|".join(parts)
    if match := re.search(f'(?P<part>{parts_str})', field_name):
        return match.group('part')
    

# Fields where the ACCDB name is different to the one used in the excel files     
accdb_aliases = {
    'life form': 'life cycle',
    'flower structure': 'flower architecture'
} 

    
@dataclass
class Field:           
    
    name: str
    field_type: str
    part: str = field(init=False)
    accdb_name: str = field(init=False, default=None, repr=False)

    def __post_init__(self):
        self.part = self._get_part_from_field_name(self.name)
        self.accdb_name = self._get_accdb_alias(self.name)
        
    def _get_part_from_field_name(self, field_name):

        part_mappings = {
            'dispersal mode': 'seed',
            'dispersion axillary': 'seed'
        }

        if part_mappings.get('field_name'):
            return part_mappings.get('field_name')

        parts_str = "|".join(parts)
        if match := re.search(f'(?P<part>{parts_str})', field_name):
            return match.group('part')      
        
    def _get_accdb_alias(self, field_name):
        try:
            return accdb_aliases[field_name]
        except KeyError:
            pass
        
    def as_dict(self):
        d = {
            'type': self.field_type,
            'name': self.name.strip()
        }
        if self.part:
            d['part'] = self.part
        if self.accdb_name:
            d['accdb_name'] = self.accdb_name             
        return d
        

def get_field_index_for_part(part):        
    for i, field in enumerate(reversed(fields)):
        if part and field.part == part:
            return (len(fields) - i) - 1
    return None
                              
    

group_fields = {}    
    
for group, columns in xslx_columns.items():
    
    # if group != 'pteridophyte': continue
    
    fields = []
    
    xlsx_field_names = set(columns)
    accdb_terms = accdb.get_terms(group)
    accdb_field_names = set(accdb_terms.trait.unique())
        
    xlsx_fields_not_in_accdb = xlsx_field_names - accdb_field_names - set(accdb_aliases.values())
    
    accdb_fields_not_in_xslxs = accdb_field_names - xlsx_field_names - set(accdb_aliases.keys())
    
    xlsx_missing_fields = {f for f in xlsx_fields_not_in_accdb if not is_numeric(f)}
    
    print('ACCDB fields not in XSLX:', accdb_fields_not_in_xslxs)
    print('XSLX missing fields (not in ACCDB or numeric):', xlsx_missing_fields)
    
    for field_name in columns:
        field_type = get_field_type(field_name)
        
        
        
#         if field_type == 'NUMERIC':
            
#             print('NUMERIC')
#             print(field_name)
#             # Min / max             
#             # if match := re.search('(?P<name1>[a-z\s]+)_?(min|max)|(min|max)\s?(?P<name2>[a-z\s]+)', field_name):
#             if match := re.search('(?P<name1>_?min\s|_?max\s)', field_name):

#                 name = match.group('name1') or match.group('name2')
                
#                 print(name)
                
# #                 for mm in min_max:                                        
                    
# #                     fields.append(Field(f'{name} {mm}.', 'NUMERIC'))
# #                     continue
                    
        fields.append(Field(field_name, field_type))
                
#     for accdb_field_name in accdb_fields_not_in_xslxs:
#         accdb_field = Field(accdb_field_name, 'DISCRETE')
#         accdb_field.disabled = True
        
#         if accdb_field.part:            
#             if field_index := get_field_index_for_part(accdb_field.part):        
#                 fields.insert(field_index, accdb_field)
#                 continue
        
#         fields.append(accdb_field)
            
    group_fields[group] = fields
    
    # break


ACCDB fields not in XSLX: {'leaf surface', 'life cycle', 'venation', 'gymnosperm reproductive structure', 'petal base', 'inflorescence architecture', 'reproductive structure', 'fertile frond form', 'scales', 'sepal base', 'leaf dissection', 'fruit apex', 'fruit base'}
XSLX missing fields (not in ACCDB or numeric): {'heterostyly'}
ACCDB fields not in XSLX: {'leaf architecture', 'dispersion axillary', 'specialised strubrures', 'lamina thickness', 'leaf arrangement', 'succulence', 'stamen arrangement', 'vegetative propagules ', 'fruit structure', 'leaf dissection', 'fruit apex', 'indusium', 'leaf position', 'leaf margin', 'spinescence', 'fruit type', 'leaf orinetation', 'flower sex', 'fruit colour', 'trnasverse cross-section', 'leaf base', 'flower architecture', 'fertile frond form', 'specialised structure', 'seed colour', 'pubescence', 'trait1\tcharacter1\r\ndispersal mode\tectzoochory', 'flower merosity', 'reproduction architecture', 'perennial organ', 'inflorescence arrangement', 'leaf

In [75]:
import yaml



In [77]:

group_fields_dict = {}
for group, fields in group_fields.items():
    group_fields_dict[group] = [f.as_dict() for f in fields]

f = open('fields.yaml', 'w+')
yaml.dump(group_fields_dict, f, allow_unicode=True)
    