In [2]:
import pandas as pd
import numpy as np
import itertools
from decimal import Decimal
import csv
import pandas_access as mdb
import pandas as pd
from collections import Counter

from abc import ABCMeta, abstractmethod, ABC
from typing import List, Set

from adept.config import RAW_DATA_DIR

In [55]:
# Path to the Access database whose tables are read with mdb.read_table below.
accdb = RAW_DATA_DIR / 'Functional Traits_Working_File.accdb'

In [56]:
# Plant groups to load; each name is used to build a '<group>_traits' table name.
plant_groups = ['angiosperm', 'bryophyte', 'pteridophyte']

# Column that records a row's plant group; added to a table when it is missing.
plants_group_col = 'Plants Group'

# for plant_group in plant_groups:
def get_plant_group_df(plant_group):
    """Load one plant group's trait table from the Access database.

    Reads the ``<plant_group>_traits`` table from ``accdb``, lower-cases every
    string cell and, when the table has no ``plants_group_col`` column, adds
    one filled with ``plant_group``.

    Args:
        plant_group: Plant group name used to build the table name.

    Returns:
        The table as a DataFrame.
    """
    def _lower_if_str(value):
        # Non-string cells (numbers, NaN) pass through untouched.
        return value.lower() if isinstance(value, str) else value

    group_df = mdb.read_table(accdb, f'{plant_group}_traits').applymap(_lower_if_str)

    has_group_col = plants_group_col in group_df.columns
    if not has_group_col:
        group_df[plants_group_col] = plant_group

    return group_df
        
# Load every plant group's trait table and stack them into a single frame.
dfs = [get_plant_group_df(plant_group) for plant_group in plant_groups]
df = pd.concat(dfs, ignore_index=True)

# Gymnosperm rows are created by copying the angiosperm rows and relabelling them.
# NOTE(review): the filter value is 'angiosperms' (plural) while the loaded group
# name is 'angiosperm' - presumably the table already carries its own
# 'Plants Group' values; confirm against the data.
gymnosperm = df[df['Plants Group'] == 'angiosperms'].copy()
gymnosperm['Plants Group'] = 'gymnosperm'

# DataFrame.append returned a new frame (its result used to be discarded here, so
# the gymnosperm rows were never added) and was removed in pandas 2.0; use
# pd.concat and keep the result.
df = pd.concat([df, gymnosperm], ignore_index=True)
df = df.rename(columns={"Trait 4": "trait4"})




In [57]:
# Trait list spreadsheet; only the 'Angiosperm traits' sheet is read.
trait_list = pd.read_excel(RAW_DATA_DIR / 'functional-trait-list.xlsx', sheet_name='Angiosperm traits')

In [58]:

# Rename 'Trait 4' to 'trait4' so it follows the trait1..trait3 naming.
# This is a no-op when the column has already been renamed.
df = df.rename(columns={"Trait 4": "trait4"})

# Display the resulting column index.
df.columns

Index(['term', 'category', 'character1', 'character3', 'character2', 'trait1',
       'trait2', 'trait3', 'hasSyn', 'sourceDataset', 'termID', 'Plants Group',
       'Comments', 'Traits AND/OR Terms', 'character4', 'trait4'],
      dtype='object')

In [59]:
# Glossary synonyms table; get_synonyms queries its 'term' and 'synonym' columns.
synonyms_df = mdb.read_table(accdb, 'plant_glossary_synonyms')
# print(synonyms.head())

def get_synonyms(term):
    """Return the distinct synonyms recorded for ``term`` in ``synonyms_df``.

    Args:
        term: Glossary term to look up (exact match on the ``term`` column).

    Returns:
        List of unique ``synonym`` values; empty when no row matches.
    """
    is_match = synonyms_df.term == term
    matching_synonyms = synonyms_df.loc[is_match, 'synonym']
    return list(matching_synonyms.unique())
        
# Quick check of the synonym lookup on a single term.
syn = get_synonyms('berrylike')
print(syn)

['berry_like']


In [60]:
# Restrict the working frame to the angiosperm rows. The explicit .copy() makes
# it an independent frame, so the column assignments performed later in the
# notebook do not write into a slice of the concatenated frame
# (SettingWithCopyWarning).
df = df[df['Plants Group'] == 'angiosperms'].copy()

# Parallel column names: trait<i> is paired with character<i> for i = 1..4.
trait_cols = [f'trait{i}' for i in range(1, 5)]
char_cols = [f'character{i}' for i in range(1, 5)]

# missing = [c for c in trait_list.columns if angiosperms.]
# print(missing)


# for col in trait_list.columns

In [61]:
# Show how the trait columns pair up with the character columns.
list(zip(trait_cols, char_cols))

[('trait1', 'character1'),
 ('trait2', 'character2'),
 ('trait3', 'character3'),
 ('trait4', 'character4')]

In [140]:
# for idx, row in df.iterrows():
#     print(row)

# df.trait1.unique()

from dataclasses import dataclass, field

def normalise_text(text):
    """Lower-case a string cell and correct known misspellings.

    Values that are not exactly of type ``str`` (e.g. NaN, numbers, None) are
    returned unchanged. A correction is applied only when the whole
    lower-cased value equals one of the known misspellings.

    Args:
        text: Cell value to normalise.

    Returns:
        The normalised string, or ``text`` itself for non-string input.
    """
    if type(text) != str:
        return text

    # Known bad spellings / garbled cells mapped to their corrected form.
    corrections = {
        'polination': 'pollination',
        'pilosity_surface': 'pilosity - surface',
        'leaf apex\r\nleaf apex\trecurved': 'leaf apex',
        'fruit type\r\nfruit type\tfollicle': 'fruit type',
        'pernnial organ': 'perennial organ',
        'leas apex': 'leaf apex',
        'leas base': 'leaf base',
    }

    lowered = text.lower()
    return corrections.get(lowered, lowered)
        
# Normalise every cell of the character columns, then of the trait columns.
for column_group in (char_cols, trait_cols):
    df[column_group] = df[column_group].applymap(normalise_text)


# Plant-part words that are looked for in trait names further down.
plant_parts = {'leaf', 'fruit', 'flower', 'petal', 'seed'}

class Collection(dict, metaclass=ABCMeta):
    """Dict that creates missing entries on first access.

    Subclasses set ``cls`` to a callable; looking up an absent key stores
    ``cls(key)`` under that key and returns it.
    """

    @property
    @abstractmethod
    def cls(self):
        """Callable used to build the value for a missing key."""

    def __missing__(self, key):
        """Create, store and return the value for ``key``."""
        created = self.cls(key)
        self[key] = created
        return created
    

@dataclass
class Term:
    """A named term with its set of synonyms and a uniqueness flag."""
    name: str
    # Each instance gets its own fresh set.
    synonyms: Set[str] = field(default_factory=set)
    # Starts as True; cleared elsewhere when the term is shared between traits.
    is_unique: bool = True

class TermCollection(Collection):
    """Collection whose missing keys are created as ``Term(key)``."""
    cls = Term    
    
@dataclass
class Character:
    """A named character together with the terms recorded for it."""
    name: str
    terms: TermCollection = field(default_factory=TermCollection)
    # NOTE(review): annotated ``str`` but defaults to None, and the name
    # suggests a boolean flag - confirm the intended type.
    is_unique: str = None

    def add_term(self, term: str, synonyms: List[str] = None) -> None:
        """Register ``term`` (creating it on first use) and merge in ``synonyms``.

        Args:
            term: Name of the term to add or update.
            synonyms: Optional synonyms to add to the term's synonym set.
        """
        entry = self.terms[term]
        if not synonyms:
            return
        entry.synonyms.update(synonyms)

class CharacterCollection(Collection):
    """Collection whose missing keys are created as ``Character(key)``."""
    cls = Character          
        
@dataclass
class Trait:
    """A named trait and the characters grouped under it."""
    name: str
    characters: CharacterCollection = field(default_factory=CharacterCollection)

    def add_character(self, char_name: str, term: str, synonyms: List[str]) -> None:
        """Record ``term`` (with ``synonyms``) under the character ``char_name``.

        Args:
            char_name: Name of the character; created on first use.
            term: Term to add to that character.
            synonyms: Synonyms to attach to ``term``.
        """
        target = self.characters[char_name]
        target.add_term(term, synonyms)

        # A character name containing '/' also contributes each of its
        # '/'-separated parts as separate terms (added without synonyms).
        if '/' not in char_name:
            return
        for part in char_name.split('/'):
            target.add_term(part)
    
class TraitCollection(Collection):
    """Collection whose missing keys are created as ``Trait(key)``."""
    cls = Trait
  

# Registry of traits, filled from the trait/character column pairs of every row.
traits = TraitCollection()


for row in df.itertuples(index=False):
    # Collect this row's (trait, character) pairs, keeping only those where
    # both cells are present.
    pairs = [
        (getattr(row, trait_col), getattr(row, char_col))
        for trait_col, char_col in zip(trait_cols, char_cols)
    ]
    pairs = [(t, c) for t, c in pairs if not (pd.isna(t) or pd.isna(c))]
    if not pairs:
        continue

    # The synonym lookup depends only on the row's term, so it is done once
    # per row instead of once per trait/character pair.
    synonyms = get_synonyms(row.term)

    # TODO: Add structural part

    for trait_name, char_name in pairs:
        traits[trait_name].add_character(char_name, row.term, synonyms)
        
# Tag a trait with the plant part that appears as a word in its name.
# NOTE(review): the loop stops at the first trait that matches, so only that
# one trait receives a ``plant_part`` attribute - presumably exploratory; confirm.
for trait in traits.values():
    matched_parts = plant_parts.intersection(set(trait.name.split()))
    if not matched_parts:
        continue
    trait.plant_part = matched_parts.pop()
    print(trait.plant_part)
    break

       

    # break


                
# term_names = [t.name for t in get_terms()]   

# counted = Counter(term_names)

# print(counted)
        
# print(s)
    
# TODO: Filter table on palmatipartite - see how chars and traits should be setup    
    
    
# trait_char_tuples = [(f'trait{i}', f'character{i}') for i in range(1, 4)]
# cols = list(sum(trait_char_tuples, ()))

# df[cols] = df[cols].applymap(normalise_text)

# df.groupby('trait2')['category'].apply(set)

# traits = set(itertools.chain.from_iterable([df[col] for col in trait_cols]))



# df['trait1'].head()

leaf


In [128]:
def get_trait_terms():
    """Map each trait name to the set of term names found under its characters.

    Returns:
        Dict keyed by trait name; each value is the set of ``Term.name`` values
        gathered from all of that trait's characters.
    """
    return {
        trait.name: {
            term.name
            for character in trait.characters.values()
            for term in character.terms.values()
        }
        for trait in traits.values()
    }

trait_terms = get_trait_terms()

# Count, in one pass, how many traits each term name occurs in. A term overlaps
# with another trait exactly when this count exceeds one, which avoids
# rebuilding the union of all other traits' terms for every trait.
term_trait_counts = Counter(
    name for names in trait_terms.values() for name in names
)

for trait_name, terms in trait_terms.items():

    print('--')

    print(trait_name)

    overlapping_terms = {name for name in terms if term_trait_counts[name] > 1}

    # Flag every term of this trait that also appears under a different trait.
    if overlapping_terms:
        for char in traits[trait_name].characters.values():
            for term in char.terms.values():
                if term.name in overlapping_terms:
                    term.is_unique = False

    

--
leaf architecture
Term(name='palmlike', synonyms=set(), is_unique=False)
Term(name='subauriculate_semiamplexicaul', synonyms=set(), is_unique=False)
Term(name='subamplexicaul', synonyms=set(), is_unique=False)
Term(name='asymmetric', synonyms=set(), is_unique=False)
Term(name='asymmetrical', synonyms=set(), is_unique=False)
Term(name='oblong_lenticellate', synonyms=set(), is_unique=False)
Term(name='repand_pinnatilobate', synonyms=set(), is_unique=False)
Term(name='scale', synonyms=set(), is_unique=False)
Term(name='runcinate_pinnatilobate', synonyms=set(), is_unique=False)
Term(name='runcinate_pinnatipartite', synonyms=set(), is_unique=False)
Term(name='cuneate_palmatisect', synonyms=set(), is_unique=False)
Term(name='involute_tubulose', synonyms=set(), is_unique=False)
Term(name='serrate_pinnatilobate', synonyms=set(), is_unique=False)
Term(name='peltate', synonyms=set(), is_unique=False)
{'involute_tubulose', 'asymmetrical', 'scale', 'cuneate_palmatisect', 'subauriculate_semiampl

In [228]:
# Inspect the distinct values present in the 'hasSyn' column.
df['hasSyn'].unique()

array([ 0.,  1., nan])

In [144]:
# import numpy as np    
# traits.remove(np.nan)

# Lower-cased column names of the trait list spreadsheet, as a set.
cols = {c.lower() for c in trait_list.columns}

# for col in cols:
#     if col in traits:
#         print(col)
#         break

# trait_list.columns

# traits - cols

gynoecium arrangement
leaf arrangement
life form
fruit colour
dispersal mode
flower shape
reproduction architecture
clonality
flower colour
leaf margin
flower architecture
fruit shape
flower sex
petal fusion
dispersion axillary
habit
reproduction system
leaf position
seed colour
leaf base
fruit type
leaf shape
flower merosity
fruit structure
spinescence
fruit dehiscence
leaf architecture
pollination
petal colour
perennial organ
indumentum
leaf apex
habitat
stamen arrangement
flower symmetry
succulence
inflorescence arrangement


In [41]:
# NOTE(review): absolute, machine-specific path - this cell only runs where that
# file exists; consider deriving it from a project path constant.
df = pd.read_csv('/Users/ben/Projects/NaturalHistoryMuseum/ADEPT/adept/adept/corpus/descriptions.csv')

# Columns holding the description text from each source.
source_names = ['ecoflora', 'wikipedia']

# df[~df[source_names].isnull()].assign(any_na=1)

# df['any'] = pd.isnull(df[['ecoflora', 'wikipedia']]).any(1)

# True when at least one of the sources has a description for the row.
df['any'] = ~pd.isnull(df[source_names]).all(axis=1)

# Turn every False in the frame into NaN (presumably so that a later count()
# tallies only the True flags). Uses the canonical np.nan spelling, and assigns
# the result instead of passing inplace=True and the deprecated method= argument.
df = df.replace(to_replace=False, value=np.nan)


# df[df['any'] == False].head(n=200)

# print(df[source_names + ['Species name or special variable', 'Major group', 'any']].groupby('Major group').count())   
 

# (df[['ecoflora', 'wikipedia']].isnull())

# df.head()

# print(df[source_names + ['Species name or special variable', 'Major group', 'any']].groupby('Major group').count())    

Unnamed: 0.1,Unnamed: 0,Major group,Family,Species name or special variable,ecoflora,wikipedia,any
14,14,Angiosperm,Rosaceae,Alchemilla vulgaris,,,False
15,15,Lichen,Parmeliaceae,Alectoria nigricans,,,False
16,16,Lichen,Parmeliaceae,Alectoria ochroleuca,,,False
23,23,Bryophyte,Anastrophyllaceae,Anastrepta orcadensis,,,False
24,24,Bryophyte,Anastrophyllaceae,Anastrophyllum donianum,,,False
...,...,...,...,...,...,...,...
581,581,Bryophyte,Sphagnaceae,Sphagnum strictum,,,False
582,582,Bryophyte,Sphagnaceae,Sphagnum subnitens,,,False
583,583,Bryophyte,Sphagnaceae,Sphagnum subsecundum,,,False
584,584,Bryophyte,Sphagnaceae,Sphagnum tenellum,,,False


In [27]:
# Preview the first rows of the descriptions frame.
df.head()

Unnamed: 0.1,Unnamed: 0,Major group,Family,Species name or special variable,ecoflora,wikipedia,any_na
0,0,Angiosperm,Compositae,Achillea millefolium,A strongly scented perennial herb with far-cre...,"\n\nAchillea millefolium is an erect, herbaceo...",False
1,1,Angiosperm,Compositae,Achillea ptarmica,"Infl a loose terminal corymb; Capitula few, 12...",,True
2,2,Angiosperm,Poaceae,Agrostis canina,A tufted and shortly rhizomatous or stolonifer...,"\nAgrostis canina is a perennial plant, with s...",False
3,3,Angiosperm,Poaceae,Agrostis capillaris,"A tufted, rhizomatous or shortly stoloniferous...",It forms a dense sward of fine leaves. The li...,False
4,4,Angiosperm,Poaceae,Agrostis curtisii,A densely tufted perennial 10-60 cm with numer...,,True
