In [6]:
import re
import urllib.parse

def generate_wfo_search_url(scientific_name):
    """
    Constructs a World Flora Online search URL for a given plant name. 
    Args: search_term (str): The plant name to search for.
    Returns: str: A properly encoded URL string for World Flora Online search.
    """
    # Truncate anything after a closing parenthesis followed by author abbreviation
    match = re.match(r"^(.*\))", scientific_name)
    if match:
        truncated_name = match.group(1)
    else:
        truncated_name = scientific_name  # fallback

    query = urllib.parse.quote_plus(truncated_name.strip())
    url = f"https://www.worldfloraonline.org/search?query={query}&limit=24&start=0&sort="
    return url

In [7]:
term = "Sesuvium hydaspicum (Edgew.) Gon√É¬ß."
search_url = generate_wfo_search_url(term)
print(search_url)

https://www.worldfloraonline.org/search?query=Sesuvium+hydaspicum+%28Edgew.%29&limit=24&start=0&sort=


In [7]:
import pandas as pd

# Set the file path
file_path = "~/Downloads/classification.csv"

wfo_data = pd.read_csv(
    file_path,
    chunksize=10000,
    encoding='latin1',
    on_bad_lines='skip',
    engine='python',
    sep='\t',
    quoting=3,
)

wfo_data = pd.concat(wfo_data)
wfo_columns = wfo_data.columns.tolist()   ## load content into list
wfo_data.head()

Unnamed: 0,taxonID,scientificNameID,localID,scientificName,taxonRank,parentNameUsageID,scientificNameAuthorship,family,subfamily,tribe,...,acceptedNameUsageID,originalNameUsageID,nameAccordingToID,taxonRemarks,created,modified,references,source,majorGroup,tplID
0,wfo-0001302010,,9905237,"""Schoenoxiphium ecklonii var. ecklonii""",variety,,,Cyperaceae,,,...,,,,"""Source in seed data: tro More details could b...",2022-04-16,2023-05-29,,"""The Cyperaceae TEN""",A,tro-9905237
1,wfo-0001302011,,9905253,"""Cyperus violifolia""",species,,"""Rodriguez & Alfonso""",Cyperaceae,,,...,,,,"""Source in seed data: tro More details could b...",2022-04-16,2022-04-20,,"""The Cyperaceae TEN""",A,tro-9905253
2,wfo-0001302012,,9905297,"""Carex viridula var. viridula""",variety,,,Cyperaceae,,,...,,,,"""Source in seed data: tro More details could b...",2022-04-16,2023-05-29,,"""The Cyperaceae TEN""",A,tro-9905297
3,wfo-0001302013,urn:lsid:ipni.org:names:310980-1,9905332,"""Mariscus phleoides""",species,,Nees,Cyperaceae,,,...,,,,"""Source in seed data: tro Updated namePublishe...",2022-04-16,2024-06-04,,"""The Cyperaceae TEN""",A,tro-9905332
4,wfo-0001302014,urn:lsid:ipni.org:names:315251-1,9905359,"""Tetraria compar""",species,,"""P.Beauv. ex T.Lestib.""",Cyperaceae,,,...,,,,"""Source in seed data: tro Updated Author from ...",2022-04-16,2024-06-04,,"""The Cyperaceae TEN""",A,tro-9905359


In [9]:
mali_taxon = pd.read_csv("Mali_spp_list.csv")  # assumes a column 'taxon'
mali_taxon_list = mali_taxon['Taxon'].dropna().unique().tolist()

In [13]:
def fix_mojibake(name):
    try:
        return name.encode("latin1").decode("utf-8")
    except UnicodeEncodeError:
        return name  # Return unchanged if it doesn't trigger

In [14]:
mali_taxon_list_fixed = [fix_mojibake(name) for name in mali_taxon_list]

In [34]:
from pygnparser import gnparser

def parse_taxon_names(name_list):
    records = []
    for name in name_list:
        res = gnparser(name)
        records.append({
            "original": name,
            "scientific_name": res.canonical()["full"],
            "authorship": res.authorship()
        })
    return pd.DataFrame(records)

In [44]:
from tqdm import tqdm
import time
def parse_taxon_names_safe(name_list):
    records = []
    for name in tqdm(name_list):
        res = gnparser(name)
        records.append({
            "original": name,
            "scientific_name": res.canonical()["full"],
            "authorship": res.authorship()
        })
        time.sleep(0.1)  # avoid rate limit
    return pd.DataFrame(records)

In [45]:
mali_taxon_df = parse_taxon_names_safe(mali_taxon_list_fixed)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 329/329 [02:51<00:00,  1.92it/s]


In [49]:
mali_taxon_df.to_csv("mali/mali_taxon_list.csv")

In [52]:
from rapidfuzz import process, fuzz

def fuzzy_match_taxa(mali_df, wfo_df, threshold=90):
    # Create a copy of the original DataFrame
    matched_df = mali_df.copy()
    matched_taxon_ids = []

    # Combine name + authorship in WFO for matching
    wfo_tuples = list(zip(wfo_df['scientificName'], wfo_df['scientificNameAuthorship']))
    wfo_lookup = {f"{name} {auth}".strip(): tid for (name, auth), tid in zip(wfo_tuples, wfo_df['taxonID'])}
    wfo_keys = list(wfo_lookup.keys())

    # Add tqdm to the iteration
    for _, row in tqdm(mali_df.iterrows(), total=len(mali_df), desc="Matching taxa"):
        query = f"{row['scientific_name']} {row['authorship']}".strip()
        match, score, _ = process.extractOne(query, wfo_keys, scorer=fuzz.token_sort_ratio)

        if score >= threshold:
            matched_taxon_ids.append(wfo_lookup[match])
        else:
            matched_taxon_ids.append(None)

    matched_df['matched_taxonID'] = matched_taxon_ids
    return matched_df

In [53]:
result_df = fuzzy_match_taxa(mali_taxon_df, wfo_data, threshold=90)

Matching taxa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 329/329 [02:23<00:00,  2.29it/s]


In [60]:
def fuzzy_match_taxa_smart(mali_df, wfo_df, name_threshold=90, author_threshold=80):
    matched_df = mali_df.copy()

    taxon_ids = []
    name_scores = []
    author_scores = []
    match_methods = []

    # Build lookup: scientificName ‚Üí list of full rows
    wfo_name_map = {}
    for _, row in wfo_df.iterrows():
        name = row['scientificName']
        wfo_name_map.setdefault(name, []).append(row)

    wfo_name_keys = list(wfo_name_map.keys())

    for _, row in tqdm(mali_df.iterrows(), total=len(mali_df), desc="Matching taxa"):
        query_name = row['scientific_name']
        query_auth = row['authorship']

        # Fuzzy match scientific name
        matches = process.extract(query_name, wfo_name_keys, scorer=fuzz.token_sort_ratio, score_cutoff=name_threshold, limit=5)

        best_match = None
        best_score = 0
        best_auth_score = 0

        for name_candidate, name_score, _ in matches:
            for wfo_row in wfo_name_map[name_candidate]:
                wfo_auth = wfo_row.get('scientificNameAuthorship', '') or ''
                auth_score = fuzz.token_sort_ratio(query_auth or '', wfo_auth)

                if (
                    auth_score > best_auth_score or
                    (auth_score == best_auth_score and name_score > best_score)
                ):
                    best_match = wfo_row
                    best_score = name_score
                    best_auth_score = auth_score

        if best_match is not None:
            taxon_ids.append(best_match['taxonID'])
            name_scores.append(best_score)
            author_scores.append(best_auth_score)
            match_methods.append(
                "name+authorship" if best_auth_score >= author_threshold else "name_only"
            )
        else:
            taxon_ids.append(None)
            name_scores.append(None)
            author_scores.append(None)
            match_methods.append(None)

    matched_df['matched_taxonID'] = taxon_ids
    matched_df['name_match_score'] = name_scores
    matched_df['authorship_match_score'] = author_scores
    matched_df['match_method'] = match_methods

    return matched_df


In [61]:
result_df = fuzzy_match_taxa_smart(mali_taxon_df, wfo_data, name_threshold=90, author_threshold=80)

Matching taxa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 329/329 [01:39<00:00,  3.30it/s]


In [62]:
result_df

Unnamed: 0,original,scientific_name,authorship,matched_taxonID,name_match_score,authorship_match_score,match_method
0,Sesuvium hydaspicum (Edgew.) Gon√ß.,Sesuvium hydaspicum,(Edgew.) Gon√ß.,wfo-0000432990,95.000000,83.870968,name+authorship
1,Trianthema portulacastrum L.,Trianthema portulacastrum,L.,wfo-0000020781,96.153846,100.000000,name+authorship
2,Burnatia enneandra Micheli,Burnatia enneandra,Micheli,wfo-0000762556,94.736842,100.000000,name+authorship
3,Cyathula achyranthoides (Kunth) Moq.,Cyathula achyranthoides,(Kunth) Moq.,wfo-0000631165,95.833333,92.307692,name+authorship
4,Nothosaerva brachiata (L.) Wight,Nothosaerva brachiata,(L.) Wight,wfo-0000379913,95.454545,66.666667,name_only
...,...,...,...,...,...,...,...
324,Vangueria madagascariensis,Vangueria madagascariensis,,wfo-0000331269,96.296296,0.000000,name_only
325,Vernonia madagascariensis,Vernonia madagascariensis,,wfo-0000006018,96.153846,0.000000,name_only
326,Voacanga africana,Voacanga africana,,wfo-0000333604,94.444444,0.000000,name_only
327,Wolffiella welwitschii,Wolffiella welwitschii,,wfo-0000334490,95.652174,0.000000,name_only


In [65]:
result_df.to_csv("mali/mali_taxon_wfo_list.csv")

In [12]:
import requests
from bs4 import BeautifulSoup
import urllib3

# üîá Disable SSL warnings (for local, trusted scraping only)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def scrape_wfo_paragraphs(wfo_taxon_id):
    url = f"https://www.worldfloraonline.org/taxon/{wfo_taxon_id}"
    
    try:
        response = requests.get(url, verify=False, timeout=10)  # disable cert check
    except requests.exceptions.SSLError as e:
        print(f"SSL Error for {wfo_taxon_id}: {e}")
        return []
    except Exception as e:
        print(f"Request error for {wfo_taxon_id}: {e}")
        return []

    if response.status_code != 200:
        print(f"‚ùå Failed to retrieve page for {wfo_taxon_id}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    section_ids = ['general', 'morphology', 'habit', 'ecology', 'habitat', 'distribution', 'use']

    records = []
    for sec_id in section_ids:
        section_div = soup.find("div", id=sec_id)
        if section_div:
            desc_blocks = section_div.find_all("div", class_="description-with-citations")
            for block in desc_blocks:
                paragraphs = block.find_all("p", class_="justified")
                for para in paragraphs:
                    records.append({
                        "wfo_taxon_id": wfo_taxon_id,
                        "section": sec_id,
                        "text": para.get_text(strip=True)
                    })
    return records


In [93]:
rows = scrape_wfo_paragraphs("wfo-0000209242")
import pandas as pd
pd.DataFrame(rows)

Unnamed: 0,wfo_taxon_id,section,text
0,wfo-0000209242,general,"Herbs or shrublets, erect, to 1.5 m tall. Bran..."
1,wfo-0000209242,morphology,Tallo: planta(s) alt. (m) ; ramo(s) porte ; ra...
2,wfo-0000209242,morphology,"Plante \n\t\t\t Annuelle' √† suffrutescente, √©r..."
3,wfo-0000209242,morphology,"Herbeannuelle ou vivace,atteignant 2,7 m de ha..."
4,wfo-0000209242,habit,Subarbusto
5,wfo-0000209242,ecology,"savanes herbac√©es, endroits humides, bords de ..."
6,wfo-0000209242,habitat,Terr√≠cola
7,wfo-0000209242,habitat,"Savane herbeuse ou bois√©e, humide; marais et b..."
8,wfo-0000209242,distribution,"Afrique tropicale, bien r√©pandue, sauf les r√©g..."
9,wfo-0000209242,use,Les fleurs sont consomm√©es par les indig√®nes c...


In [14]:
rows = scrape_wfo_paragraphs("wfo-0000432990")
import pandas as pd
pd.DataFrame(rows)

Unnamed: 0,wfo_taxon_id,section,text
0,wfo-0000432990,morphology,Perianth segments united for c. 1/3‚Äì1/2 their ...


In [15]:
from langdetect import detect

temp_df = pd.DataFrame(rows)
for index, row in temp_df.iterrows():
    print(detect(row['text']))

en


In [16]:
import langid

for index, row in temp_df.iterrows():
    lang, conf = langid.classify(row['text'])
    print(f"{lang} ({conf:.2f})")

en (-1383.78)


In [17]:
import numpy as np
def detect_langid(text):
    # Get top two language predictions and scores
    ranked = langid.rank(text)
    top_lang, top_score = ranked[0]
    second_lang, second_score = ranked[1]

    # Convert to softmax percentage confidence
    scores = np.array([top_score, second_score])
    probs = np.exp(scores - np.max(scores))  # for numerical stability
    probs = probs / probs.sum()

    return top_lang, float(probs[0]) * 100  # return as percentage

In [115]:
for index, row in temp_df.iterrows():
    lang, conf = detect_langid(row['text'])
    print(f"{lang} ({conf:.2f})")

en (100.00)
pt (100.00)
fr (100.00)
fr (100.00)
en (65.74)
fr (100.00)
es (84.13)
fr (100.00)
fr (100.00)
fr (100.00)
fr (100.00)


In [26]:
import os

# Called functions
# - scrape_wfo_paragraphs(wfo_taxon_id)
# - detect_langid(text)

def process_and_save_wfo_data(result_df, output_dir="wfo_mali"):
    os.makedirs(output_dir, exist_ok=True)
    
    for i, row in tqdm(result_df.iterrows(), total=len(result_df), desc="Processing WFO taxa"):
        original = row['original']
        taxonID = row['matched_taxonID']

        try:
            scrapped_out = scrape_wfo_paragraphs(taxonID)
            temp_df = pd.DataFrame(scrapped_out)
            output_rows = []

            for _, desc_row in temp_df.iterrows():
                text = desc_row['text']
                section = desc_row['section']
                lang, conf = detect_langid(text)

                output_rows.append({
                    "original": original,
                    "wfo_id": taxonID,
                    "section": section,
                    "text": text,
                    "lang": lang,
                    "confidence": round(conf, 2)
                })

            output_df = pd.DataFrame(output_rows)
            output_path = os.path.join(output_dir, f"{taxonID}.csv")
            output_df.to_csv(output_path, index=False)

        except Exception as e:
            print(f"‚ùå Failed to process {original}: {e}")


In [121]:
process_and_save_wfo_data(result_df, output_dir="wfo_mali")

Processing WFO taxa:  12%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè                                                          | 40/329 [00:36<03:54,  1.23it/s]

‚ùå Failed to retrieve page for wfo-0000611317


Processing WFO taxa:  12%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé                                                          | 41/329 [00:37<03:42,  1.29it/s]

‚ùå Failed to retrieve page for wfo-0000611439


Processing WFO taxa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 329/329 [05:24<00:00,  1.02it/s]


Cleome gynandra L. - wfo-0000611317

Cleome monophylla L. - wfo-0000611439

In [122]:
mali_match_list = result_df[['original', 'matched_taxonID']].copy()
mali_match_list = mali_match_list.rename(columns={
    'original': 'taxon',
    'matched_taxonID': 'wfo_id'
})
mali_match_list.to_csv("wfo_mali/mali_match_list.csv")

In [2]:
import pandas as pd

mali_taxon = pd.read_csv("WFOsynonym_Descriptions.csv")  # assumes a column 'taxon'

In [7]:
mali_taxon.iloc[0]['WFO link for synonym1']

'https://www.worldfloraonline.org/taxon/wfo-0000199192'

In [8]:
import re

def extract_wfo_ids_from_links(df):
    pattern = re.compile(r'wfo-\d+')
    
    for col in df.columns:
        if col.startswith("WFO link for synonym"):
            match = re.search(r'\d+', col)
            if match:
                n = match.group()
                new_col = f"wfo_synonym{n}_id"
                df[new_col] = df[col].apply(lambda x: pattern.search(x).group() if isinstance(x, str) and pattern.search(x) else None)
    
    return df

In [9]:
mali_taxon = extract_wfo_ids_from_links(mali_taxon)

In [18]:
new_mali_taxon = mali_taxon[['taxon','wfo_id','wfo_synonym1_id','wfo_synonym2_id','wfo_synonym3_id']].copy()

In [20]:
new_mali_taxon.columns

Index(['taxon', 'wfo_id', 'wfo_synonym1_id', 'wfo_synonym2_id',
       'wfo_synonym3_id'],
      dtype='object')

In [27]:
import os
import pandas as pd
from tqdm import tqdm

# Called functions must be defined elsewhere:
# - scrape_wfo_paragraphs(wfo_taxon_id)
# - detect_langid(text)

def process_and_save_wfo_data_synonym(mali_taxon_df, output_dir="wfo_mali_new"):
    os.makedirs(output_dir, exist_ok=True)
    
    for _, row in tqdm(mali_taxon_df.iterrows(), total=len(mali_taxon_df), desc="Processing WFO taxa"):
        taxon = row['taxon']
        primary_id = row['wfo_id']
        wfo_ids = [("wfo_id", primary_id)]
        
        for i in range(1, 4):
            colname = f"wfo_synonym{i}_id"
            wfo_id = row.get(colname)
            if pd.notna(wfo_id) and isinstance(wfo_id, str):
                wfo_ids.append((colname, wfo_id))
        
        output_rows = []

        for source_label, wfo_id in wfo_ids:
            try:
                scraped = scrape_wfo_paragraphs(wfo_id)
                if scraped is None or len(scraped) == 0:
                    continue

                for _, desc_row in pd.DataFrame(scraped).iterrows():
                    text = desc_row['text']
                    section = desc_row['section']
                    lang, conf = detect_langid(text)

                    output_rows.append({
                        "taxon": taxon,
                        "source": source_label,
                        "wfo_id": wfo_id,
                        "section": section,
                        "text": text,
                        "lang": lang,
                        "confidence": round(conf, 2)
                    })

            except Exception as e:
                print(f"‚ùå Failed to process {wfo_id} for taxon '{taxon}': {e}")

        if output_rows:
            output_df = pd.DataFrame(output_rows)
            safe_name = re.sub(r'[^\w\-]', '_', primary_id)
            output_path = os.path.join(output_dir, f"{safe_name}.csv")
            output_df.to_csv(output_path, index=False)


In [28]:
process_and_save_wfo_data_synonym(new_mali_taxon, output_dir="wfo_mali_new")

Processing WFO taxa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11/11 [00:20<00:00,  1.90s/it]
