In [115]:
from pathlib import Path
from adept.config import DATA_DIR, ROOT_DIR, logger, ASSETS_DIR

import pandas as pd
import re
import enum
import requests
import urllib.parse
import re
from adept.utils.soup import RequestSoup 
from adept.descriptions.efloras import EflorasDescriptionSource

In [116]:
# Directory holding the input CSV, given relative to the notebook's working directory.
data_dir = Path('./genus-species')

# Echo the path so the configured location is visible in the cell output.
print(data_dir)

genus-species


In [159]:
# Input species list. Later cells build their output file names from this string
# (dropping its last whitespace-separated token), so renaming it changes those too.
file_name = "Susie's Master's species[36].csv"

# Load the species list; the cells below read its 'genusName' and 'familyName' columns.
df = pd.read_csv(data_dir / file_name)

In [118]:
# Distinct genus names present in the input list (these drive the scrape below).
df['genusName'].unique()

array(['Pyrus', 'Rosa', 'Urtica', 'Prunus', 'Malus', 'Amygdalus',
       'Cotoneaster', 'Crataegus', 'Sibiraea', 'Sorbaria', 'Armeniaca',
       'Cerasus', 'Ulmus', 'Sorbus', 'Celtis', 'Ficus', 'Micromeles',
       'Photinia', 'Eriobotrya'], dtype=object)

In [19]:
# df.group.unique()

# Liquidambar

# http://www.efloras.org/browse.aspx?flora_id=0&start_taxon_id=118701

array(['Angiosperm', 'Bryophyte', 'Pteridophyte', 'Gymnosperm', 'Alga'],
      dtype=object)

In [119]:
def parse_query_page(soup):
    """Collect accepted-name taxon ids from a taxon-list results page.

    Args:
        soup: Object exposing the parsed page as ``soup.markup`` (queried with
            ``find`` / ``find_all``) and the requested address as
            ``soup.parametised_url`` (used only in the log message).

    Returns:
        dict[int, int]: Mapping of ``flora_id`` to ``taxon_id``, read from the
        query string of every ``florataxon...`` link titled "Accepted Name"
        inside the taxon list panel. If several links share a ``flora_id`` the
        last one wins. An empty dict is returned when the page title reads
        "No taxa found" or when the taxon list panel is absent.

    Raises:
        KeyError: If a matching link lacks ``flora_id`` or ``taxon_id``.
        ValueError: If either of those parameters is not an integer.
    """
    title_span = soup.markup.find("span", {"id": "ucFloraTaxonList_lblListTitle"})

    # find() returns None when the element is missing, so only inspect the
    # title text when the span is actually present.
    if title_span is not None and title_span.get_text(strip=True) == 'No taxa found':
        logger.info('No taxa found: %s', soup.parametised_url)
        return {}

    div = soup.markup.find("div", {"id": "ucFloraTaxonList_panelTaxonList"})
    if div is None:
        # No taxon list panel on the page: there is nothing to parse.
        return {}

    search_results = {}

    # Filtering on title="Accepted Name" excludes synonyms.
    for a in div.find_all("a", href=re.compile("^florataxon"), title="Accepted Name"):
        parsed_url = urllib.parse.urlparse(a.get('href'))
        qs = urllib.parse.parse_qs(parsed_url.query)
        search_results[int(qs['flora_id'][0])] = int(qs['taxon_id'][0])

    return search_results

In [120]:
# Value sent as the flora_id query parameter on every request in this notebook.
FLORA_OF_CHINA = 2

# Endpoint used for both name searches and taxon-list browsing; the query
# parameters are supplied per request. Plain literal: there is nothing to interpolate.
url = 'http://efloras.org/browse.aspx'

def search(taxon_name):
    """Look up a taxon name and return its taxon id in the configured flora.

    Requests ``url`` with ``flora_id=FLORA_OF_CHINA`` and ``name_str=taxon_name``
    and parses the result with ``parse_query_page``.

    Args:
        taxon_name: Name to search for (the notebook passes genus names).

    Returns:
        The ``taxon_id`` that ``parse_query_page`` recorded under
        ``FLORA_OF_CHINA``, or ``None`` when there is no such entry.
    """
    soup = RequestSoup(url, flora_id=FLORA_OF_CHINA, name_str=taxon_name)
    results = parse_query_page(soup)
    # dict.get already defaults to None for a missing key.
    return results.get(FLORA_OF_CHINA)



In [122]:
def parse_taxa(soup):
    """Yield the link text of every accepted-name taxon on a taxon-list page.

    Args:
        soup: Object exposing the parsed page as ``soup.markup``.

    Yields:
        str: ``a.text`` for each ``florataxon...`` link titled "Accepted Name"
        inside the taxon list panel. Nothing is yielded when the panel is absent.
    """
    div = soup.markup.find("div", {"id": "ucFloraTaxonList_panelTaxonList"})

    # find() returns None when the panel is missing; end the generator instead
    # of raising AttributeError when the caller starts iterating.
    if div is None:
        return

    for a in div.find_all("a", href=re.compile("^florataxon"), title="Accepted Name"):
        yield a.text
        
def get_pages(soup):
    """Return the page numbers referenced by pagination links on a page.

    Args:
        soup: Object exposing the parsed page as ``soup.markup``.

    Returns:
        set[int]: The ``page`` query-string value of every link whose href
        matches ``page=<digit>``. Empty when there are no such links.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    page_links = soup.markup.find_all('a', {'href': re.compile(r"page=\d")})
    pages = set()
    for a in page_links:
        parsed_url = urllib.parse.urlparse(a.get('href'))
        qs = urllib.parse.parse_qs(parsed_url.query)
        # The pattern is unanchored, so it can also match a differently named
        # parameter that merely ends in "page"; skip links with no real
        # ``page`` parameter instead of raising KeyError.
        if 'page' not in qs:
            continue
        pages.add(int(qs['page'][0]))
    return pages

# genus name -> list of accepted taxon names scraped for that genus.
data = {}

for genus in df['genusName'].unique():
    print(f'PROCESSING {genus}')
    taxon_id = search(genus)
    if not taxon_id:
        # search() found no entry for this genus; leave it out of `data`.
        print(f'GENUS NOT FOUND {genus}')
        continue

    # First page of the taxon list under this genus.
    soup = RequestSoup(url, flora_id=FLORA_OF_CHINA, start_taxon_id=taxon_id)
    data[genus] = list(parse_taxa(soup))

    # get_pages() returns a set, whose iteration order is not guaranteed; sort it
    # so follow-up pages are fetched (and rows appended) in ascending page order.
    # An empty set simply skips the loop, so no separate emptiness check is needed.
    for page in sorted(get_pages(soup)):
        print(f'PARSING PAGE {page}')
        page_soup = RequestSoup(url, flora_id=FLORA_OF_CHINA, start_taxon_id=taxon_id, page=page)
        data[genus].extend(parse_taxa(page_soup))

# for a in soup.markup.find_all("a", href=re.compile("^florataxon"), title="Accepted Name"):
#     print(a.text)


# http://www.efloras.org/browse.aspx?flora_id=0&start_taxon_id=118701

PROCESSING Pyrus
PROCESSING Rosa
PROCESSING Urtica
PROCESSING Prunus
PROCESSING Malus
PROCESSING Amygdalus
PROCESSING Cotoneaster
PROCESSING Crataegus
PROCESSING Sibiraea
PROCESSING Sorbaria
PROCESSING Armeniaca
PROCESSING Cerasus
PROCESSING Ulmus
PROCESSING Sorbus
PROCESSING Celtis
PROCESSING Ficus


INFO     No taxa found: http://efloras.org/browse.aspx?flora_id=2&name_str=Micromeles


PROCESSING Micromeles
GENUS NOT FOUND Micromeles
PROCESSING Photinia
PROCESSING Eriobotrya


In [123]:
# One record per scraped taxon, tagged with the genus it was listed under.
# 'Major group' is set to the same constant for every row.
flattened = [
    {
        'Species name': taxon,
        'Genus': genus,
        'Major group': 'Angiosperm',
    }
    for genus, taxa in data.items()
    for taxon in taxa
]

out_df = pd.DataFrame(flattened)



In [124]:
# Build the output name from the input name: drop its last whitespace-separated
# token, append the new suffix, hyphen-join, lowercase, and strip apostrophes.
name_parts = file_name.split()[:-1]
name_parts.append('genus-species.csv')
new_file_name = '-'.join(name_parts).lower().replace("'", "")

out_df.to_csv(ASSETS_DIR / new_file_name, index=False)

'cindys-masters-genus-species.csv'

In [160]:
# One row per distinct (genus, family) pair from the input list.
genus_df = df[['genusName', 'familyName']].drop_duplicates()

In [161]:
# Rename the input columns to the headers used in the exported CSV.
column_names = {'genusName': 'Species name', 'familyName': 'family'}
genus_df = genus_df.rename(columns=column_names)

In [162]:
# Add a constant 'Major group' column (same value for every row) as the last column.
genus_df = genus_df.assign(**{'Major group': 'Angiosperm'})

genus_df.head()

Unnamed: 0,Species name,family,Major group
0,Pyrus,ROSACEAE,Angiosperm
2,Rosa,ROSACEAE,Angiosperm
7,Urtica,URTICACEAE,Angiosperm
8,Prunus,ROSACEAE,Angiosperm
9,Malus,ROSACEAE,Angiosperm


In [163]:
# Build the output name from the input name: drop its last whitespace-separated
# token, append the new suffix, hyphen-join, lowercase, and strip apostrophes.
name_parts = file_name.split()[:-1]
name_parts.append('genus.csv')
new_file_name = '-'.join(name_parts).lower().replace("'", "")

print(new_file_name)

genus_df.to_csv(ASSETS_DIR / new_file_name, index=False)

susies-masters-genus.csv
