In [152]:
import pandas as pd
import inflect
import numpy as np
import itertools
from decimal import Decimal
import csv

In [6]:
from adept.ontology.trait import TraitOntology

In [124]:
# Shared inflect engine used by to_singular().
inflector = inflect.engine()

def to_singular(word):
    """Return the singular form of ``word``, or ``word`` itself if there is none.

    Words ending in 'ous' are returned unchanged without consulting inflect
    (presumably because adjectives such as 'herbaceous' would otherwise be
    treated as plurals -- TODO confirm).
    """
    if not word.endswith('ous'):
        singular = inflector.singular_noun(word)
        # singular_noun() gives a falsy result when it finds no singular form.
        if singular:
            return singular
    return word

In [125]:
trait_ontology = TraitOntology()

# Maps a lower-cased trait name to the set of terms (plus their singular
# forms) listed for it in the angiosperm rows of the ontology.
traits_dict = {}
angiosperm = trait_ontology.df[trait_ontology.df['Plants Group'] == 'Angiosperms']

# The ontology stores trait names in the columns trait1 .. trait3.
for i in range(1, 4):
    trait_column = f"trait{i}"
    terms = angiosperm.groupby(by=[trait_column])['term'].agg(list)
    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
    for trait, values in terms.items():
        values = set(values)
        values |= {to_singular(w) for w in values}
        traits_dict.setdefault(trait.lower(), set()).update(values)
        
        


In [126]:
# Manually add terms to the sets built from the ontology.
traits_dict['perennial organ'].update({'rhizome', 'rhizomes'})
traits_dict['habit'].add('stem erect')

In [127]:
# import itertools
# import re



# # Let's convert the traits dicts to words so we can match
# def get_term_words(term):
#     common_words = {'to', 'like'}
#     words = set(itertools.chain(*[re.split('-|\s', t) for t in term])) - common_words
#     words |= {to_singular(w) for w in words}
#     return words

# trait_words_dict = {k:get_term_words(v) for k,v in traits_dict.items()}


In [166]:
# Load the raw character table; the statements further down in this cell
# overwrite and add columns on this frame.
df = pd.read_csv('characters.csv')

def dedupe(value):
    """Normalise a ';'-separated cell: lower-case, singularise and de-duplicate.

    Each part is stripped before comparison, so 'herb; herb' collapses to a
    single entry. Empty fragments are dropped, and the first-seen order of the
    parts is kept so the result is the same on every run.

    Parameters:
        value: the cell content. Anything without a ``lower()`` method
            (e.g. NaN) is returned unchanged.

    Returns:
        The unique parts joined with '; ', or ``value`` itself for
        non-string input.
    """
    try:
        parts = value.lower().split(';')
    except AttributeError:
        # Non-string cells are passed through untouched.
        return value

    # A dict is used as an insertion-ordered set.
    unique = {}
    for part in parts:
        part = part.strip()
        if part:
            unique[to_singular(part)] = None
    return '; '.join(unique)
   
def rounded(x):
    """Round ``x`` to two decimal places with NumPy."""
    return np.round(x, decimals=2)

def split_and_strip(value):
    """Lower-case ``value``, apply fixed text substitutions and split on ';'.

    ``value`` is converted with ``str()`` first, so any input type is accepted.
    The substitutions are applied in the order listed below: simple fractions
    become decimals, some month abbreviations become two-digit numbers
    (presumably undoing spreadsheet date auto-formatting -- TODO confirm),
    'one' becomes '1' and '>' is removed.

    Returns:
        A list with the stripped text of every ';'-separated part.
    """
    substitutions = (
        ('1/2', '0.5'),
        ('1/3', '0.3333'),
        ('1/4', '0.25'),
        ('feb', '02'),
        ('oct', '10'),
        ('may', '05'),
        ('dec', '12'),
        ('mar', '03'),
        ('one', '1'),
        ('>', ''),
    )
    text = str(value).lower()
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)
    # '.' is deliberately NOT removed: doing so would strip every decimal point.
    return [piece.strip() for piece in text.split(';')]

def seperate_ranges(value):
    """Return the numbers found in a '-'-separated string as a list of floats.

    Pieces that ``float()`` cannot parse are skipped.
    """
    numbers = []
    for token in value.split('-'):
        try:
            number = float(token)
        except ValueError:
            # Not a number: ignore this piece.
            pass
        else:
            numbers.append(number)
    return numbers
        
        
    

def to_mean(value):
    """Reduce a ';'-separated cell of numbers and ranges to one rounded mean.

    Each part is first reduced to the median of the numbers in its
    '-'-separated range (so '1-2' counts as 1.5). The mean of those medians
    is returned, rounded to two decimals.
    """
    medians = []
    for part in split_and_strip(value):
        # A range such as '1-2' is represented by its median.
        medians.append(np.median(seperate_ranges(part)))
    return rounded(np.mean(medians))

def _flatten_numeric(value):
    """Collect every number found in ``value`` into a one-dimensional float array.

    ``value`` is split on ';'; each non-empty part is split again on '-' so
    that the end points of ranges such as '1-2' are included individually.

    Returns:
        A NumPy float array, empty when no number could be parsed. An array
        is always returned (never None), so callers can inspect ``.size``.
    """
    numbers = []
    for part in split_and_strip(value):
        if part:
            # seperate_ranges() already skips pieces that are not numeric.
            numbers.extend(seperate_ranges(part))
    return np.array(numbers, dtype=float)
    
def to_min(value):
    """Return the smallest number found in ``value``, rounded to two decimals.

    Returns None when ``value`` contains no numbers.
    """
    norm_values = _flatten_numeric(value)

    # Check for "has elements", not truthiness: ndarray.any() is False when
    # every number is 0, which would wrongly report a genuine 0 as missing.
    if norm_values is not None and norm_values.size:
        return rounded(norm_values.min())
    return None
    
def to_max(value):
    """Return the largest number found in ``value``, rounded to two decimals.

    Returns None when ``value`` contains no numbers.
    """
    norm_values = _flatten_numeric(value)

    # Check for "has elements", not truthiness: ndarray.any() is False when
    # every number is 0, which would wrongly report a genuine 0 as missing.
    if norm_values is not None and norm_values.size:
        return rounded(norm_values.max())
    return None

def limit_values_to_list(values, limit_to):
    """Keep only the ';'-separated entries of ``values`` that match ``limit_to``.

    An entry matches when at least one of its '-'-separated pieces is in
    ``limit_to``. Entries are lower-cased, stripped and de-duplicated, and the
    survivors keep their first-seen order.

    Parameters:
        values: a ';'-separated string.
        limit_to: collection of accepted words.

    Returns:
        The surviving entries joined with '; ' (an empty string if none match).

    Raises:
        AttributeError: if ``values`` is not a string.
    """
    # A dict is used as an insertion-ordered set so the output is deterministic.
    candidates = dict.fromkeys(v.strip() for v in values.lower().split(';'))

    # str.split is used instead of re.split: the separator is a literal '-'
    # and the re module is not imported in this notebook.
    limited = [v for v in candidates if set(v.split('-')).intersection(limit_to)]

    return '; '.join(limited)
    
def to_range(value):
    """Rewrite ``value`` as '<min> to <max>' when its minimum and maximum differ.

    The bounds come from to_min() and to_max(). If they compare equal,
    ``value`` is returned unchanged.
    """
    lowest, highest = to_min(value), to_max(value)
    if lowest != highest:
        return f'{lowest} to {highest}'
    return value
    

def get_raunkiaer_plant_life_form(value):
    """Extract the entries of a ';'-separated string that contain 'phyte'.

    Returns:
        The matching entries, stripped and joined with '; ' (the separator
        used for every other multi-value cell in this notebook). An empty
        string is returned when nothing matches.
    """
    values = [v.strip() for v in value.split(';') if 'phyte' in v]
    return '; '.join(values)


# Derive the Raunkiaer column from the raw 'life form' text, before that
# column is normalised by dedupe() below.
df['Raunkiaer plant life-form'] = df['life form'].dropna().apply(get_raunkiaer_plant_life_form)


# Columns the trait-filtering loop leaves untouched.
columns_to_skip = ['habitat']

# Normalise the free-text life form column.
df['life form'] = df['life form'].dropna().apply(dedupe)

# Filled per column by the loop that follows: terms removed by the filtering.
dropped_terms = {}

for column in df.columns:
    # Only columns that have an entry in traits_dict (and are not explicitly
    # skipped) are filtered.
    if column in columns_to_skip or column not in traits_dict:
        continue

    # Hyphenated ontology terms are broken into their individual words.
    traits = set(itertools.chain.from_iterable(t.split('-') for t in traits_dict[column]))

    df[column] = df[column].dropna().apply(dedupe)

    # Distinct entries present before filtering.
    original_values = {
        part.strip()
        for cell in df[column].dropna().unique()
        for part in cell.lower().split(';')
    }

    df[column] = df[column].dropna().apply(limit_values_to_list, args=(traits,))

    # Distinct entries that survived the filtering.
    new_values = {
        part.strip()
        for cell in df[column].dropna().unique()
        for part in cell.lower().split(';')
    }

    # Record what the filtering removed from this column.
    dropped_terms[column] = original_values - new_values
        
# Snapshot the measurement columns (those with a '[unit]' in the name) now,
# so the min/max height columns inserted below are not included.
measurement_cols = [name for name in df.columns if '[' in name]
measurement_cols.append('max seed volume')

# Place the min and max height columns right after 'plant height [m]',
# in the order: height, min, max.
idx = df.columns.get_loc("plant height [m]")
df.insert(idx + 1, 'plant min height [m]', df['plant height [m]'].apply(to_min))
df.insert(idx + 2, 'plant max height [m]', df['plant height [m]'].apply(to_max))

# Collapse every measurement cell to a single mean value.
for col in measurement_cols:
    df[col] = df[col].apply(to_mean)
    

    
def columns_to_max(row, columns):
    """Return the largest number found across ``columns`` of ``row``.

    Parameters:
        row: a DataFrame row (as passed by ``DataFrame.apply(axis=1)``).
        columns: labels of the cells to scan; null cells are ignored.

    Returns:
        The maximum of all numbers parsed from the cells, or None when the
        cells contain no numbers at all.
    """
    arrays = [_flatten_numeric(v) for v in row[columns].dropna()]
    numbers = list(itertools.chain.from_iterable(a for a in arrays if a is not None))
    # Guard on the flattened numbers: cells holding only non-numeric text give
    # empty arrays, and reducing an empty array raises ValueError.
    if numbers:
        return np.array(numbers).max()
    return None

def columns_to_min(row, columns):
    """Return the smallest number found across ``columns`` of ``row``.

    Parameters:
        row: a DataFrame row (as passed by ``DataFrame.apply(axis=1)``).
        columns: labels of the cells to scan; null cells are ignored.

    Returns:
        The minimum of all numbers parsed from the cells, or None when the
        cells contain no numbers at all.
    """
    arrays = [_flatten_numeric(v) for v in row[columns].dropna()]
    numbers = list(itertools.chain.from_iterable(a for a in arrays if a is not None))
    # Guard on the flattened numbers: cells holding only non-numeric text give
    # empty arrays, and reducing an empty array raises ValueError.
    if numbers:
        return np.array(numbers).min()
    return None
    
    
# Compute both aggregates from the ORIGINAL seed columns before writing either
# back. Assigning the max first would make the min calculation read the
# already-aggregated max column instead of the source values.
seed_columns = ['seeds max per fruit', 'seeds min per fruit']
seeds_max = df.apply(columns_to_max, args=(seed_columns,), axis=1)
seeds_min = df.apply(columns_to_min, args=(seed_columns,), axis=1)
df['seeds max per fruit'] = seeds_max
df['seeds min per fruit'] = seeds_min

# Express multi-valued carpel/ovary counts as '<min> to <max>'.
df['carpel/ovary number'] = df['carpel/ovary number'].dropna().apply(to_range)


df.to_csv('characters-corrected.csv')

In [164]:
# One column per trait column, listing the terms the filtering removed.
# Each set is sorted first so the exported CSV is identical from run to run
# (set iteration order for strings is not stable between interpreter sessions).
dropped_terms_df = pd.DataFrame.from_dict(
    {column: sorted(terms) for column, terms in dropped_terms.items()},
    orient='index',
).transpose()

dropped_terms_df.to_csv('dropped_terms.csv')

# print(dropped_terms)

# with open('dropped_terms.csv', 'w') as csv_file:
#     writer = csv.writer(csv_file)
#     # writer.writerows(dropped_terms)

In [116]:
# Columns that the filtering loop in this cell leaves untouched.
columns_to_skip = ['habitat', 'dispersal mode']

def limit_values_to_list(values, limit_to):
    """Keep only the ';'-separated entries of ``values`` that match ``limit_to``.

    An entry matches when at least one of its '-'-separated pieces is in
    ``limit_to``. Entries are lower-cased, stripped and de-duplicated; the
    survivors are joined with '; '.

    Raises:
        AttributeError: if ``values`` has no ``lower()`` method; the offending
            value is printed before the error is re-raised.
    """
    try:
        lowered = values.lower()
    except AttributeError:
        print(values)
        raise

    unique_values = {piece.strip() for piece in lowered.split(';')}
    kept = []
    for candidate in unique_values:
        if set(candidate.split('-')).intersection(limit_to):
            kept.append(candidate)
    return '; '.join(kept)

    
for column in df.columns:
    if column in columns_to_skip:
        continue

    # Report and skip columns that have no entry in traits_dict.
    if column not in traits_dict:
        print(column)
        continue

    # Hyphenated ontology terms are broken into their individual words.
    traits = set(itertools.chain.from_iterable(t.split('-') for t in traits_dict[column]))

    df[column] = df[column].dropna().apply(limit_values_to_list, args=(traits,))


taxon
plant height [m]
plant min height [m]
plant max height [m]
leaf defence
leaf min width [cm]
leaf max width [cm]
leaf min length [cm]
leaf max length [cm]
stamen count
stamen number
carpel/ovary number
heterostyly
dispersule min width [cm]
dispersule max width [cm]
dispersule min length [cm]
dispersule max length [cm]
seeds max per fruit
seeds min per fruit
seed min width [mm]
seed max width [mm]
seed min length [mm]
seed max length [mm]
max seed volume
ploidy
ploidy level
root system
root depth type
root depth [cm]
sources
Raunkiaer plant life-form


In [118]:
# Inspect the terms collected for the 'habit' trait.
traits_dict['habit']

{'acaulescent',
 'acauline',
 'acaulous',
 'aerial-root',
 'aerial-rooted',
 'arboreous',
 'arborescent',
 'arial',
 'bark',
 'barks',
 'branch',
 'branches',
 'branching',
 'branchings',
 'branchy',
 'bush',
 'bushes',
 'bushlike',
 'bushy',
 'caespitose',
 'canopy',
 'climbers',
 'creeping',
 'culm',
 'culm-internode',
 'culms',
 'cushion',
 'cushion-forming',
 'cushions',
 'dwarf shrub',
 'erect-to-straggling',
 'grasslike',
 'haustoria',
 'haustorium',
 'herb',
 'herbaceous',
 'herbs',
 'liana',
 'lianas',
 'lianescent',
 'lianoid',
 'lianous',
 'palmlike',
 'palms',
 'rambling',
 'repent',
 'rhizome',
 'root-sprouting',
 'rootstock',
 'rootstocks',
 'rosette-forming',
 'rosette-like',
 'scandent',
 'scape',
 'scapelike',
 'scapes',
 'scapoid',
 'scapose',
 'scramblers',
 'scrambling',
 'semiprostrate',
 'semiscandent',
 'semispreading',
 'semiwoody',
 'shrub',
 'shrubby',
 'shrublet',
 'shrublike',
 'shrubs',
 'sprawling',
 'spreading-ascendant',
 'spreading-reflexing',
 'stem-lea

In [123]:
# Scratch check of str.endswith('ous'), the suffix test to_singular() relies on.
word = 'lous'
word.endswith('ous')

True