In [1]:
from adept.utils.soup import Soup, RequestSoup
from adept.utils.request import CachedRequest


from adept.config import CACHE_DIR
from adept.config import logger

import urllib.parse
import numpy as np
import itertools
import pandas as pd
import re

In [81]:
import pickle
import requests
from abc import ABCMeta, abstractmethod, ABC
from adept.traits.accdb import ACCDBTraits



    

class DescriptionSource(metaclass=ABCMeta):
    @abstractmethod
    def get_taxon_description(self, taxon_name):
        pass
    
    @property
    @abstractmethod
    def base_url(self):
        pass    

    
import enum 

class Enum(enum.EnumMeta): 
    def __contains__(cls, item): 
        return any(x.value == item for x in cls.__members__.values())
    
    
class EflorasDescriptionSource(DescriptionSource):
    base_url = 'http://efloras.org'
    
    class Floras(enum.Enum, metaclass=Enum):
        FLORA_OF_NORTH_AMERICA = 1
        FLORA_OF_CHINA = 2    
        MOSS_FLORA_OF_CHINA = 4 
        
    def __init__(self):
        super().__init__()  
        
        accdb = ACCDBTraits()
        self.terms = accdb.get_terms()   
    
    def get_taxon_description(self, taxon_name):
        
        results = self.search(taxon_name)
        
        descriptions = {}
        
        for flora_id, taxon_id in results:
            if flora_id not in self.Floras:
                continue
                
            if description:= self._parse_description(flora_id, taxon_id):
                
                print(flora_id)
                print(self.Floras.values)
                
                descriptions[flora_id] = description
                return description                        
    
    def search(self, taxon_name):
        # for each page, is it a volume we want
        url =  f'{self.base_url}/browse.aspx'
        try:
            soup = RequestSoup(url, flora_id=0, name_str=taxon_name)
        except RequestException:
            return None   
        
        return self._parse_query_page(soup)
    
    def _parse_query_page(self, soup):
        
        div = soup.markup.find("div", {"id": "ucFloraTaxonList_panelTaxonList"})

        table = div.find('table')

        header = [td.text for td in table.find_all("td", class_="header")]

        name_col_idx = header.index('Name')

        for tr in table.find_all('tr'):
            td = tr.find_all('td')[name_col_idx]
            if a := td.find('a'):
                parsed_url = urllib.parse.urlparse(a.get('href'))
                qs = urllib.parse.parse_qs(parsed_url.query) 

                # Tidy up query params {'flora_id': ['11'], 'taxon_id': ['200023010']} -> {flora_id: 11, taxon_id: xxxx}
                params = {k: int(v[0]) for k,v in qs.items()}
                
                yield tuple(params.values())
                

        
    def _parse_description(self, flora_id, taxon_id):
        url = f'{self.base_url}/florataxon.aspx'  
        try:      
            soup = RequestSoup(url, flora_id=flora_id, taxon_id=taxon_id)
        except Exception as e:
            logger.error('Requests exception: %s', e)
            return
        
        taxon_treatment = soup.markup.find('div', {'id': 'panelTaxonTreatment'})   
        
        if not taxon_treatment:
            logger.error('No taxon treatment: %s', soup.parametised_url) 
            return          
        
        p_text = [p.get_text(strip=True) for p in taxon_treatment.find_all('p') if p.get_text(strip=True) and not p.find('table')]
        
        descriptions = [p for p in p_text if self._is_plant_description(p)]
        
        if descriptions:
            print(descriptions)
            if len(descriptions) > 1:
                logger.debug('Multiple treatment paragraphs %s', soup.parametised_url)
                
            return '\n'.join(descriptions)
            
        
    def _is_plant_description(self, text):
        plant_part_words = self.words_in_string(self.terms, text) 
        if plant_part_words:     
            perentage = len(plant_part_words) / len(text.split())
            # If the percentage of parts is greater than 5% (for short descriptions)
            if perentage >= 0.05 or len(plant_part_words) >= 3:
                return True
            
    @staticmethod        
    def words_in_string(word_list, description):
        # Remove punctuation and split string into words
        words = re.sub("[^\w\s]", "", description).lower().split()

        return set(word_list).intersection(words)         

              
    
    
# wikipedia = WikipediaDescription()    

# wikipedia.get_taxon_description('Achillea millefolium')


# sources = [cls() for cls in DescriptionSource.__subclasses__()]

taxa = 'Achillea millefolium'

efloras = EflorasDescriptionSource()

results = efloras.get_taxon_description(taxa)

print(results)

# print(list(results))

# div = soup.markup.find("div", {"id": "ucFloraTaxonList_panelTaxonList"})

# table = div.find('table')

# header = [td.text for td in table.find_all("td", class_="header")]

# name_col_idx = header.index('Name')

# for tr in table.find_all('tr'):
#     td = tr.find_all('td')[name_col_idx]
#     if a := td.find('a'):
#         parsed_url = urllib.parse.urlparse(a.get('href'))
#         qs = urllib.parse.parse_qs(parsed_url.query) 
        
#         params = {k: int(v[0]) for k,v in qs.items()}
        
        
#         print(params)



# # header = table.fetch('a', {'title': re.compile('Accepted Name')})

# table.find_all('a', attrs={"title": "Accepted Name"})

# print(header)



# for taxon in taxa:
#     for source in sources:
#         if description := source.get_taxon_description(taxon):
#             print(description)
    

['Perennials,6–65+ cm (usually rhizomatous, sometimes stoloniferous).Stems1(–4), erect, simple or branched, densely lanate-tomentose to glabrate.Leavespetiolate (proximally) or sessile (distally, weakly clasping and gradually reduced); blades oblong or lanceolate, 3.5–35+ cm × 5–35 mm, 1–2-pinnately lobed (ultimate lobes ± lanceolate, often arrayed in multiple planes), faces glabrate to sparsely tomentose or densely lanate.Heads10–100+, in simple or compound, corymbiform arrays.Phyllaries20–30 in ± 3 series, (light green, midribs dark green to yellowish, margins green to light or dark brown) ovate to lanceolate, abaxial faces tomentose.Receptaclesconvex; paleae lanceolate, 1.5–4 mm.Ray florets(3–)5–8, pistillate, fertile; corollas white or light pink to deep purple, laminae 1.5–3 × 1.5–3 mm.Disc florets10–20; corollas white to grayish white, 2–4.5 mm.Cypselae1–2 mm (margins broadly winged).2n= 18, 27, 36, 45, 54, 63, 72 (including counts from Europe).', 'Morphologic characters that hav

AttributeError: values

In [71]:
descriptions

['EflorasDescriptionSource',
 'WikipediaDescriptionSource',
 'EcoflorasDescriptionSource']

In [177]:
df = pd.DataFrame([
    {'term': 'term1', 'termID': '1', 'g': '1', 'char1': 'c1', 'trait1': 't1', 'char2': 'c2', 'trait2': 't2'}, 
    {'term': 'term2', 'termID': '2', 'g': '1', 'char1': 'c1.2', 'trait1': 't1.2', 'char2': 'c2.2', 'trait2': 't2.2'},
    {'term': 'term3', 'termID': '3', 'g': '2', 'char1': 'c1.3', 'trait1': 't1.3', 'char2': 'c2.3', 'trait2': 't2.3'},
    # {'term': 'term3', 'termID': '3', 'char1': 'c1.4', 'trait1': 't1.4', 'char2': 'c2.4', 'trait2': 't2.4'},
])

# df = df.rename(columns={'char1': 'char', 'char2': 'char', 'trait1': 'trait', 'trait2': 'trait'})

# res = pd.DataFrame({
#         'char': df['char'].values.T.ravel(),
#         'trait': df['trait'].values.T.ravel(),
#         # 'term': df['term']
    
# })

# df.melt('term', value_name='key').drop('variable', 1)

pd.wide_to_long(df, ["char", "trait"], i="term", j="a")

# df = pd.melt(df, id_vars=["term", "termID"])

# df.head(n=100)

# pd.melt(df, id_vars=['term'], value_vars=['char', 'trait']).pivot(columns='variable', values=['term', 'value'])

# pd.melt(df, id_vars=['term', 'termID'], value_vars=['char', 'trait'])

# df.head()

# df.stack()

# df.groupby(['term', 'termID']).resetindex()

# pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')

# pd.wide_to_long(df, ['char', 'trait'], i='termID', j=['term'], sep='-')


Unnamed: 0_level_0,Unnamed: 1_level_0,termID,g,char,trait
term,a,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
term1,1,1,1,c1,t1
term2,1,2,1,c1.2,t1.2
term3,1,3,2,c1.3,t1.3
term1,2,1,1,c2,t2
term2,2,2,1,c2.2,t2.2
term3,2,3,2,c2.3,t2.3


In [174]:
# df.pivot_table('variable', index=['term', 'termID'], columns=['variable', 'value'])

# df.set_index(["term", "termID", "variable"], drop=True).unstack("variable")


# df.reset_index().groupby(['term', 'termID'])['variable'].aggregate('first').unstack()

df.pivot(columns='variable')['value']

variable,char,trait
0,c1,
1,c1.2,
2,c1.3,
3,,t1
4,,t1.2
5,,t1.3
6,c2,
7,c2.2,
8,c2.3,
9,,t2


In [119]:
res.head()

Unnamed: 0,char,trait
0,c1,t1
1,c1.2,t1.2
2,c1.3,t1.3
3,c2,t2
4,c2.2,t2.2


In [98]:
# df['c'] = df.apply()

df = pd.DataFrame([{'a': 1, 'c1': 1, 'c2': 2}, {'a': 2, 'c1': 3, 'c2': 4}, {'a': 3, 'c1': 3, 'c2': 4}])
df = df.rename(columns={'c2': 'c1'})

df.head()

# df.set_index(['a'])

# df = df.stack()

# df = df.reset_index()

Unnamed: 0,a,c1,c1.1
0,1,1,2
1,2,3,4
2,3,3,4


In [99]:
df.head()

Unnamed: 0,a,c1,c1.1
0,1,1,2
1,2,3,4
2,3,3,4


Unnamed: 0,a,variable,value
0,1,c1,1
1,2,c1,3
2,3,c1,3
3,1,c1,2
4,2,c1,4
5,3,c1,4


In [178]:
from adept.traits.accdb import ACCDBTraits

In [255]:
accdb = ACCDBTraits()

accdb._df['Plants Group'].unique()

# char_cols = [f'character{i}' for i in range(1, 5)]
# trait_cols = [f'trait{i}' for i in range(1, 5)]

# cols = char_cols + trait_cols + ['term', 'Plants Group']

# df2 = accdb._df[cols]

# import uuid
# df2['uuid'] = df2.apply(lambda _: uuid.uuid4(), axis=1)

array(['Angiosperms', 'Pteridophyte', nan, 'Gymnosperms', 'Bryophyte'],
      dtype=object)

In [258]:
df4 = df[:10]

df4['char1'] = 'a'

df4.head()

Unnamed: 0,term,termID,g,char1,trait1,char2,trait2
0,term1,1,1,a,t1,c2,t2
1,term2,2,1,a,t1.2,c2.2,t2.2
2,term3,3,2,a,t1.3,c2.3,t2.3


In [250]:
# import uuid
# df2['uuid'] = df2.apply(lambda _: uuid.uuid4(), axis=1)

df3 = pd.wide_to_long(df2, ["character", "trait"], i='uuid', j="x")

In [251]:
df3.head()
# df3[df3.duplicated()]

Unnamed: 0_level_0,Unnamed: 1_level_0,term,Plants Group,character,trait
uuid,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
22ccb8d4-b040-440a-b7a0-8ae9cbd599da,1,palmate,Angiosperms,palmate,leaf architecture
a96e656c-32f5-4a03-b93d-13639727ff63,1,aerial_rooted,Angiosperms,climber/scrambler/tree,habit
0758066e-9b37-4c7d-8219-10f5fda6c017,1,agamospermous,Angiosperms,agamospermy,clonality
073d13f9-5d14-49db-9964-face85ffe462,1,aggregated,Angiosperms,schizocarp/mericarps,fruit type
3b1c7d2c-b973-40a1-bb25-55a7805f8908,1,amplexicaul,Angiosperms,amplexicaul,leaf architecture


In [252]:
df3[df3.term == 'androgynous'].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,term,Plants Group,character,trait
uuid,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
76203182-eab0-4d2e-8a26-326c01e59c61,1,androgynous,Angiosperms,monoecious,reproduction system
5b64c83b-d871-4c45-9a3b-512bf7fe650b,1,androgynous,,monoecious,reproduction system
ef816823-ac27-4bc5-910f-465e98bcb0a6,1,androgynous,,,
76203182-eab0-4d2e-8a26-326c01e59c61,2,androgynous,Angiosperms,bisexual,flower sex
5b64c83b-d871-4c45-9a3b-512bf7fe650b,2,androgynous,,bisexual,flower sex


In [233]:
df.head()

Unnamed: 0,term,termID,g,char1,trait1,char2,trait2
0,term1,1,1,c1,t1,c2,t2
1,term2,2,1,c1.2,t1.2,c2.2,t2.2
2,term3,3,2,c1.3,t1.3,c2.3,t2.3


In [85]:
import enum



class Floras(enum.Enum, metaclass=Enum):
    FLORA_OF_NORTH_AMERICA = 1
    FLORA_OF_CHINA = 2    
    MOSS_FLORA_OF_CHINA = 4 
    
    # def __contains__(cls, item): 
    #     return item in cls.__members__.values()     
    
Floras(1).name

'FLORA_OF_NORTH_AMERICA'

In [2]:
import re

In [50]:
t = 'Stems1(-4), 111erect, simple or branched, densely'

In [51]:
# t = re.sub(r'\s(?<number>\d+)', r'\g<number>', t)
# re.sub(r'\s([\w]+)([\d]+)', r' \1 \2', t)

re.sub("([a-z])([\d])", r"\1 \2", t)

'Stems 1(-4), 111erect, simple or branched, densely'

In [5]:
t

'Ray florets8-10(-13), styliferous and sterile'

In [90]:
t = 'Perennials,6-65+ cm (usually rhizomatous, sometimes stoloniferous). Stems1(-4), erect, simple or branched, densely lanate-tomentose to glabrate.Leavespetiolate (proximally) or sessile (distally, weakly clasping and gradually reduced); blades oblong or lanceolate, 3.5-35+ cm √ó 5-35 mm, 1-2-pinnately lobed (ultimate lobes ¬± lanceolate, often arrayed in multiple planes), faces glabrate to sparsely tomentose or densely lanate.Heads10-100+, in simple or compound, corymbiform arrays.Phyllaries20-30 in ¬± 3 series, (light green, midribs dark green to yellowish, margins green to light or dark brown) ovate to lanceolate, abaxial faces tomentose.Receptaclesconvex; paleae lanceolate, 1.5-4 mm.Ray florets(3-)5-8, pistillate, fertile; corollas white or light pink to deep purple, laminae 1.5-3 √ó 1.5-3 mm.Disc florets1 0-20; corollas white to grayish white, 2-4.5 mm.Cypselae1-2 mm (margins broadly winged).2n= 1 8, 2 7, 3 6, 4 5, 5 4, 6 3, 7 2 (including counts from Europe). Morphologic characters that have been used to segregate these populations into species and/or varieties include: (1) degree and persistence of tomentum; (2) phyllaries with greenish, light brown, or dark brown margins; (3) shapes of capitulescences (rounded or flat-topped); and (4) degrees of leaf dissection and shapes of lobes.'

In [91]:
# Add a space after a dot or comma, if it doesn't exist and is not part of a digit
t = re.sub("(?<=\D)([,.])(?=\S)", r"\1 ", t)

In [118]:
import spacy
from adept.preprocess import preprocessors

UnboundLocalError: local variable 't' referenced before assignment

In [93]:
nlp = spacy.load("en_core_web_trf")

In [114]:
t = 'Hello (there). Ben'

doc = nlp(t)

In [115]:
for sent in doc.sents:
    print(sent)

Hello (there). Ben


In [107]:
t[67:75]

''

In [104]:
ord(t[67])

46