# Assigning PFTs to TRY species

Explore the contents of the `TRY_Life_Forms` archive.

In [3]:
import os
from pathlib import Path

# Set working directory
os.chdir("../../data/try")
fp = Path("./try_life_forms_2025-09-26/0_raw/44089.txt")

Read the contents of `19233.txt` into memory (it's large, around 10GB)

In [None]:
import dask.dataframe as dd

dd.read_csv(fp, delimiter="\t", encoding="ISO-8859-1").columns

Index(['LastName', 'FirstName', 'DatasetID', 'Dataset', 'SpeciesName',
       'AccSpeciesID', 'AccSpeciesName', 'ObservationID', 'ObsDataID',
       'TraitID', 'TraitName', 'DataID', 'DataName', 'OriglName',
       'OrigValueStr', 'OrigUnitStr', 'ValueKindName', 'OrigUncertaintyStr',
       'UncertaintyName', 'Replicates', 'StdValue', 'UnitName',
       'RelUncertaintyPercent', 'OrigObsDataID', 'ErrorRisk', 'Reference',
       'Comment', 'StdValueStr', 'Unnamed: 28'],
      dtype='object')

In [5]:
import pandas as pd

pd.set_option("display.max_columns", None)

cols = {
    "AccSpeciesName": "string[pyarrow]",
    "TraitID": "uint32[pyarrow]",
    "OrigValueStr": "string[pyarrow]",
}
df = pd.read_csv(fp, delimiter="\t", encoding="ISO-8859-1", usecols=cols.keys()).astype(
    cols
)

df.head()

  df = pd.read_csv(fp, delimiter="\t", encoding="ISO-8859-1", usecols=cols.keys()).astype(


Unnamed: 0,AccSpeciesName,TraitID,OrigValueStr
0,Bartsia alpina,42,HEMI-PARASITE
1,Calamagrostis lapponica,42,GRAMINOID
2,Carex capitata,42,SEDGE
3,Carex rostrata,42,SEDGE
4,Carex saxatilis,42,SEDGE


In [6]:
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25974288 entries, 0 to 25974287
Data columns (total 3 columns):
 #   Column          Dtype          
---  ------          -----          
 0   AccSpeciesName  string         
 1   TraitID         uint32[pyarrow]
 2   OrigValueStr    string         
dtypes: string(2), uint32[pyarrow](1)
memory usage: 1.2 GB


Let's split out the traits into their own dataframes.

In [7]:
growth_form = (
    df.query("TraitID == 42")
    .drop(columns=["TraitID"])
    .dropna(subset=["OrigValueStr"])
    .rename(columns={"OrigValueStr": "X42"})
)
growth_form_simple = (
    df.query("TraitID == 3400")
    .drop(columns=["TraitID"])
    .dropna(subset=["OrigValueStr"])
    .rename(columns={"OrigValueStr": "X3400"})
)
pft = (
    df.query("TraitID == 197")
    .drop(columns=["TraitID"])
    .dropna(subset=["OrigValueStr"])
    .rename(columns={"OrigValueStr": "X197"})
)

Store the unique values for each dataframe so we can begin to compile a list of terms to match with.

In [11]:
print(f"growth_form: {growth_form.shape[0]:,}")
print(f"growth_form_simple: {growth_form_simple.shape[0]:,}")
print(f"pft: {pft.shape[0]:,}")


growth_form: 2,332,657
growth_form_simple: 213,372
pft: 100,337


Since `growth_form` (X42) has the most records, we should use that to try to get maximum coverage. We should also drop any duplicate entries by formatting the growth form strings in lower case and dropping duplicates.

In [13]:
growth_form_filt = growth_form.assign(X42=growth_form["X42"].str.lower()).drop_duplicates()
print(f"growth_form_filt: {growth_form_filt.shape[0]:,}")


growth_form_filt: 684,212


In [18]:
Path("../../tmp").mkdir(parents=True, exist_ok=True)
growth_form_filt.to_parquet("./try_growth_forms.parquet")

growth_form_species = growth_form_filt.drop(columns=["X42"]).drop_duplicates()
print(f"growth_form_species: {growth_form_species.shape[0]:,}")
growth_form_species.to_csv("../../tmp/try_growth_forms_species.csv", index=False)

growth_form_filt[["X42"]].drop_duplicates().to_csv("../../tmp/try_growth_forms.csv", index=False)

growth_form_species: 233,201


Define the search terms to encompass the different PFTs.

In [None]:
search_terms_tree = [
    # Basic tree terms
    "tree",
    "Tree",
    "TREE",
    "trees",
    "Trees",
    "t",
    "seedling",
    "seedlings",
    "sapling",
    "saplings",
    # Tree size/age
    "small tree",
    "Small_Tree",
    "small_tree",
    "large tree",
    "tree-like",
    "tre",
    "treen",
    # Canopy position
    "canopy tree",
    "canopy_tree",
    "top.canopy.tree",
    "top canopy",
    "mid canopy",
    "mid.canopy.tree",
    "midtree",
    "mid.canopy.tree",
    "understory tree",
    "understory",
    "midstory",
    # Treelet
    "treelet",
    "Tree/Treelet",
    "tree/treelet",
    # Conifer/gymnosperm
    "conifer",
    "conifers",
    "gymnosperm",
    "gymn",
    # Angiosperm
    "angiosperm",
    "angwood",
    # Palms
    "palm",
    "pam",
    "palm tree",
    "palmtree",
    "canopy_palm",
    "understory_palm",
    "palm (p)",
    "palm resp. palm",
    # Deciduous/evergreen
    "tree (deciduous)",
    "tree (evergreen)",
    "woody deciduous",
    "woody evergreen",
    "deciduous shrub or tree",
    "evergreen shrub or tree",
    "semi deciduous tree or shrub",
    # Ferns
    "tree fern",
    "treefern",
    "Tree_Fern",
    "arborescent_fern",
    "Tree_",
    "t (treefern)",
    # Woody
    "hardwood",
    "softwood",
    "Hardwood",
    "Softwood",
    "woody plant",
    "woody species",
    "woody plants",
    "woody",
    "w",
    # Mangrove
    "mangrove",
    "tree/mangrove/woody",
    "shrub/tree/mangrove/woody",
    # Hemiepiphytes
    "tree/hemiepiphyte",
    "hemiepiphyte",
    "hemiepiphite",
    "hemi-epiphyte",
    "hemi-epipjyte",
    "tree/hemiepiphyte/woody",
    # Tree forms/shapes
    "columnar",
    "conical",
    "pachicaul",
    # Crops
    "tree crop",
    # Spanish/other
    "arbol",
    # Tree combinations
    "tree/shrub",
    "tree / shrub",
    "tree shrub",
    "shrub/tree",
    "shrub / tree",
    "tree-shrub",
    "tree_shrub",
    "shrub_tree",
    "shrub_tree",
    "t/s",
    "t/tree",
    "tree/tree",
    "tree | shrub",
    "shrub | tree",
    "tree | tree",
    "tree|shrub",
    "tree shrub intermediate",
    "tree/large shrub",
    "tree/shrub/climber",
    "smtree",
    "tree/ /woody",
    "tree/woody",
    # TRY database patterns
    "trees/T/Tree",
    "Tree/Tree",
    "T/Tree",
    "T/tree/Tree",
    "tree/Tree",
    "trees/Shrub",
    "trees/T/tree/Tree",
    "trees/tree",
    "trees/tree/tree",
    "t/tree/tree",
    "Tree/Shrub",
    "trees/T",
    "trees/t",
    "trees/t/tree",
    # Roman numerals
    "Tree V",
    "Tree VII",
    "Tree IX",
    "Tree IV",
    "Tree III",
    "Tree II",
    "Tree I",
    # Abbreviations
    "tree (t)",
    "t resp. t",
    "a t",
    "st",
    "lt",
    "slt",
    # Forest types
    "USforestTrees",
]

search_terms_grass = [
    # Herb terms
    "herb",
    "Herb",
    "HERB",
    "herbs",
    "herb.",
    "herb (h)",
    "h",
    "herbaceous",
    "herbaceous monocot",
    "herbaceous dicot",
    "herbaceous dicotyl",
    "herbaceous monocotyl",
    "herbaceous forb",
    "herbaceous legume",
    "herbaceous plant",
    "herbaceous/terrestrial herb",
    "terrestrial herb",
    "terrestrial_herb",
    "angherb",
    "herb_erect",
    "hierba_",
    # Forb
    "forb",
    "forbs",
    "Forb",
    "forb/herb",
    "forbs",
    "frobs",
    "annual forb",
    "perennial forb",
    "variable forb",
    "leguminous forb",
    "forb-annual",
    "forb-biennial",
    # Graminoid/grass
    "graminoid",
    "Graminoid",
    "GRAMINOID",
    "graminoids",
    "gram",
    "gras",
    "grass",
    "Grass",
    "GRASS",
    "grasses",
    "g",
    "grass (poaceae only)",
    "c3 grass",
    "c4 grass",
    "c3.sedges",
    "grass (clonal)",
    "grass (tussock)",
    "tuss",
    "forage grass",
    "pasture grass",
    "prairie grass",
    "woody grass",
    "annual grass",
    "perennial grass",
    "Annual Grass",
    "Perennial Grass",
    "annual graminoid",
    "perennial graminoid",
    "variable graminoid",
    "graminoid-annual",
    "bunchgrasses",
    "rhizome grass",
    # Sedges
    "sedge",
    "SEDGE",
    "seges",
    # Grasses&sedges combined
    "Grasses&Sedges",
    "grasses&sedges",
    "g&s",
    "g&s resp. g&s",
    # Ferns
    "fern",
    "Fern",
    "FERN",
    "ferns",
    "fern ally",
    "fernally",
    "pteridophyte",
    "ferns and allies (lycophytes)",
    "fern or fern ally",
    "fern/non-woody",
    "fern/aquatic",
    "fern/aquatic/non-woody",
    "fern/woody",
    "fern/palmoid/woody",
    "fern/epiphyte/non-woody",
    "terrestrial_fern",
    "terrestrial fern",
    # Allies
    "clubmoss",
    "club moss",
    "horsetail",
    "lycopodiophyta",
    "selaginella",
    # Moss/bryophyte
    "moss",
    "turf moss",
    "bryophyte",
    "nonvascular",
    # Lichen
    "lichen",
    "lichenous",
    "lichen/non-woody",
    "foliose lichen",
    "fruticose lichen",
    # Life cycles
    "annual",
    "perennial",
    "therophyte",
    "hemicryptophyte",
    "annual-biennial",
    "perennial herb",
    "perennial herb/hemicryptophyte",
    "herbaceous perennial",
    "annual herb",
    "herbaceous annual",
    "herbaceous annual-biennial",
    "perennial graminoid",
    "perennial leguminous herb",
    "perennial grass/hemicryptophyte",
    # Geophyte
    "geophyte",
    "geop",
    "bulb",
    "rhizomatous",
    "rhiz",
    "rhizimatous",
    "rhizomatous/bulbs",
    "perennial, rhizomatous",
    # Bamboo
    "bamboo",
    # Aquatic herbs
    "aquatic",
    "aquatic forb",
    "hydrophyte",
    "hydrophytes",
    "waterplant",
    "aquatic fresh water",
    "aquatic, fresh water, floating",
    "submerged",
    "amphibiousubmerged",
    "emergent attached to the substrate",
    "submerged attached to the substrate",
    "floating leaves attached to the substrate",
    "free-floating plants",
    "herb/aquatic",
    "herb/aquatic/non-woody",
    "hydrophyte-annual",
    "hyd",
    "n hyd",
    # Rosette
    "rosette",
    "rosette plant",
    "semi-rosette",
    "rosette forb",
    # Rush
    "rush",
    # Grasslike
    "grasslike",
    # Legumes
    "legume",
    "legumes",
    # Cereals
    "cereal",
    # Weeds
    "weed",
    "weedy",
    "weed, sedge",
    # Grassland
    "Grassland",
    "grassland",
    # Abbreviations
    "herb resp. h",
    "b h",
    "hel",
    "m hel",
    "n",
    "a",
    "hs",
    "hl",
    "ha",
    "hsl",
    "hst",
    "hslt",
    "hsa",
    "el",
    # Crop
    "crops",
    "crop",
    # Other
    "extensive-stemmed herb",
    "small_herb_",
]

search_terms_shrub = [
    # Basic shrub terms
    "shrub",
    "Shrub",
    "SHRUB",
    "shrubs",
    "Shrubs",
    "s",
    "sh",
    "shru",
    "shrub (s)",
    "s resp. s",
    "srub",
    "shurb",
    "arbusto",
    # Subshrub
    "subshrub",
    "sub-shrub",
    "sub shrub",
    "subshrub (woody <1m)",
    "sub-shrub (chamaephyte)",
    "suffrutescent",
    "subshurb",
    # Dwarf shrub
    "dwarf shrub",
    "erect dwarf shrub",
    "prostrate dwarf shrub",
    "evergreen dwarf shrub",
    "drwarf shrub",
    "Dwarf Shrub community",
    # Shrub size
    "small shrub",
    "small_shrub",
    "large shrub",
    "low to high shrub",
    # Arborescent
    "arborescent shrubs",
    # Chamaephyte
    "chamaephyte",
    "chaemaephyte",
    "nano-chamaephyte",
    "shrub (chamaephyte)",
    "chaemaephyte | shrub",
    "shrub | chaemaephyte",
    "chaemaephyte | nano-chamaephyte",
    "chaemaephyte | vine",
    # Woody shrub
    "woody shrub",
    "shrub/woody",
    "shrub (woody 1-4m)",
    # Mallee
    "mallee",
    # Vine terms
    "vine",
    "Vine",
    "VINE",
    "v",
    "vines",
    "woody vine",
    "herbaceous vine",
    "liana/woody vine",
    "liana/woody vine|shrub",
    "climbing vine",
    "climbing_vine",
    "scandent_vine",
    "trailing_vine",
    "herbaceous vine|herb",
    "vine|herb",
    "herbaceous vine|liana/woody vine",
    "vine resp. climb",
    "shrub/vine",  # FIXED: Added missing comma
    "w climb resp. v",
    "climb resp. v",
    "climb resp. climb",
    # Liana
    "liana",
    "Liana",
    "LIANA",
    "lianas",
    "l",
    "lian",
    "lianna",
    "woody liana",
    "l/woody liana",
    "lianas/lianna/woody liana",
    "lianas/woody liana",
    "lianas and climbers",
    "lianas/climber",
    "lianas/lianna",
    "lianas (wody climbers)",
    # Climber/creeper
    "climber",
    "climb",
    "climbing",
    "climber or creeper",
    "creeper",
    "climber/vine",
    "climber/liana",
    "climber/non-woody",
    "climber/woody",
    "climber/non-woody/woody",
    "twiner/climber.",
    "twiner/climber",
    # Epiphyte
    "epiphyte",
    "epiphytic",
    "epiphytes",
    "ep",
    "e",
    "epiphyte (e)",
    "epiphyte (mistletoe)",
    "epiphytic herb",
    "epiphytic_herb",
    "epiphyte_herb",
    "hemiepiphyte",
    "hemiepiphytes",
    "hemi-epiphyte",
    "epiphyte/hemiepiphyte",
    # Parasite
    "parasite",
    "hemi-parasite",
    "hemiparasite",
    "hemi-parasitic",
    "stem parasite",
    "root parasite",
    "woody parasite",
    "parasitic climber",
    "holoparasitic",
    "mistletoe",
    "parasite_epiphyte",
    # Succulent
    "succulent",
    "succulents",
    "succulent leaves",
    "succulent stems",
    "stem succulent",
    "leaf succulent",
    "rosette leaf succulent",
    "tall stem succulent",
    "stem and leaf succulent",
    "forb-succulent",
    "l succ",
    "i succ",
    # Cactus
    "cactus",
    "cacti",
    "cact",
    "cacti (c)",
    "agaves&cacti",
    "agaves&cacti resp. agaves&cacti",
    # Carnivore
    "carnivore",
    "carnivorous",
    "carnivorous plant",
    "carnivorous plant resp. carnivorous plant",
    # Cushion
    "cushion",
    "cushion plant",
    "cushion forming",
    # Mat forming
    "mat forming",
    "mat-forming",
    # Aquatic
    "aquatic",
    "Shrub/Aquatic",
    "shrub/aquatic",
    "aquatic plants, submerged",
    "aquatic plants, floating",
    "semi-aquatic",
    "subaquatic",
    # Growth forms
    "stem erect",
    "erect",
    "stem ascending to prostrate",
    "ascending",
    "stem prostrate",
    "prostrate",
    "decumbent",
    "trailing",
    "semi-erect",
    "always climbing using tendrils",
    "always trailing",
    "always spread climbing",
    "always climbing using adhesive roots",
    "sometimes spread climbing",
    "trailing_herb",
    "trailing_herbaceous_vine",
    "trailing_plant",
    "terrestrial_trailing_plant",
    # Thicket
    "thicket forming",
    # Branching
    "bunch",
    "colonizing",
    "multiple stems",
    "multi-stem",
    "multiple stem",
    "single stem",
    "single stem or multi stemed",
    "branchy",
    # Rosette
    "ros",
    "rosette",
    # Scapose
    "scap",
    "scapose",
    # Caespitose
    "caesp",
    "caespitose",
    # Reptant
    "rept",
    "reptant",
    # Stoloniferous
    "stoloniferous",
    # Rhizomatous
    "rhizomatous",
    "rhiz",
    # Shrub combinations
    "shrub/tree",
    "tree/shrub",
    "tree / shrub",
    "shrub / tree",
    "shrub_tree",
    "tree_shrub",
    "shrub/tree intermediate",
    "shrub | tree",
    "tree | shrub",
    "shrub|tree",
    "tree|shrub",
    "Terrestrial_Shrub",
    "terrestrial_shrub",
    # Shrub land
    "Shrubland",
    "shrubland",
    "Shrub forest belt",
    # Shrub, Subshrub
    "Shrub, Subshrub",
    "shrubs and sub-shrubs",
    # Shrub/Parasite
    "Shrub/Parasite",
    # Abbreviations
    "halfshrubs resp. s",
    "shrub-like_clumps",
    "shrub-like_herb",
    "se",
    "rus",
    "various",
    # Desert
    "desert sub-shrubs",
    # Free
    "free",
    "free-standing",
]

Convert all search terms to lowercase and remove redundant terms.

In [23]:
search_terms_tree = set([t.lower() for t in search_terms_tree])
search_terms_grass = set([t.lower() for t in search_terms_grass])
search_terms_shrub = set([t.lower() for t in search_terms_shrub])

Add new standardized PFT column to `growth_form_filt`.

In [24]:
# Replace values in X42 column
growth_form_filt.loc[growth_form_filt["X42"].isin(search_terms_tree), "pft"] = "Tree"
growth_form_filt.loc[growth_form_filt["X42"].isin(search_terms_grass), "pft"] = "Grass"
growth_form_filt.loc[growth_form_filt["X42"].isin(search_terms_shrub), "pft"] = "Shrub"

In [25]:
growth_form_filt.head()

Unnamed: 0,AccSpeciesName,X42,pft
0,Bartsia alpina,hemi-parasite,Shrub
1,Calamagrostis lapponica,graminoid,Grass
2,Carex capitata,sedge,Grass
3,Carex rostrata,sedge,Grass
4,Carex saxatilis,sedge,Grass


In [29]:
unmatched = growth_form_filt[growth_form_filt["pft"].isna()]
unmatched_count = unmatched.shape[0]
total_count = growth_form_filt.shape[0]
print(
    f"Unmatched rows: {(unmatched_count / total_count):.2%} ({unmatched_count:,} of {total_count:,} rows)"
)

Unmatched rows: 24.02% (164,354 of 684,212 rows)


In [32]:
unmatched[["X42"]].to_csv("../../tmp/try_growth_forms_unmatched.csv", index=False)

### Try enhanced matching approach

In [None]:
import logging
import re

def assign_pft_priority(growth_form: str, search_terms: dict) -> str | None:
    """
    Assign PFT using priority-based matching.

    Priority order: Tree > Shrub > Grass

    For compound forms like "herb_tree_shrub", this will:
    1. Split by common delimiters (_, /, |, comma, space)
    2. Check each token against search terms
    3. Return the highest priority match
    """
    if pd.isna(growth_form):
        return None

    # Filter out non-growth forms
    non_growth_forms = {
        "yes",
        "no",
        "absence",
        "presence",
        "unspecified",
        "?",
        "`",
        "rounded",
        "conical",
        "oval",
        "vase",
        "irregular",
        "single crown",
        "soli",
        "mult",
        "ss",
        "c+sc",
        "nd",
        "various",
    }

    # Check if it's a number or clearly not a growth form
    if growth_form in non_growth_forms:
        return None

    # Try to parse as float - if successful, it's not a growth form
    try:
        float(growth_form)
        return None
    except ValueError:
        pass

    # First try exact match (already lowercase)
    for pft, terms in search_terms.items():
        if growth_form in terms:
            return pft

    # Clean and tokenize the growth form
    # Split by common delimiters: underscore, slash, pipe, comma, space
    tokens = re.split(r"[_/|,\s]+", growth_form.strip('"').strip())

    # Priority order
    priority_order = ["Tree", "Shrub", "Grass"]

    # Check each token against search terms
    found_pfts = set()
    for token in tokens:
        token = token.strip()
        if not token:
            continue
        for pft, terms in search_terms.items():
            if token in terms:
                found_pfts.add(pft)
                break

    # Return highest priority match
    for pft in priority_order:
        if pft in found_pfts:
            return pft

    # If no match, try substring matching (contains)
    for pft in priority_order:
        for term in search_terms[pft]:
            if term in growth_form and len(term) > 2:  # avoid short false positives
                return pft

    return None


log = logging.getLogger(__name__)

log.info("Reading data...")

df = growth_form_filt.copy()

log.info("Processing data...")
df = df.assign(AccSpeciesName=lambda _df: _df["AccSpeciesName"].str.lower())

# Define the search terms (updated with additional terms)
search_terms_tree = [
    # Basic tree terms
    "tree",
    "Tree",
    "TREE",
    "trees",
    "Trees",
    "t",
    "seedling",
    "seedlings",
    "sapling",
    "saplings",
    # Tree size/age
    "small tree",
    "Small_Tree",
    "small_tree",
    "large tree",
    "tree-like",
    "tre",
    "treen",
    # Canopy position
    "canopy tree",
    "canopy_tree",
    "top.canopy.tree",
    "top canopy",
    "mid canopy",
    "mid.canopy.tree",
    "midtree",
    "mid.canopy.tree",
    "understory tree",
    "understory",
    "midstory",
    # Treelet
    "treelet",
    "Tree/Treelet",
    "tree/treelet",
    # Conifer/gymnosperm
    "conifer",
    "conifers",
    "gymnosperm",
    "gymn",
    # Angiosperm
    "angiosperm",
    "angwood",
    # Palms
    "palm",
    "pam",
    "palm tree",
    "palmtree",
    "canopy_palm",
    "understory_palm",
    "palm (p)",
    "palm resp. palm",
    # Deciduous/evergreen
    "tree (deciduous)",
    "tree (evergreen)",
    "woody deciduous",
    "woody evergreen",
    "deciduous shrub or tree",
    "evergreen shrub or tree",
    "semi deciduous tree or shrub",
    # Ferns
    "tree fern",
    "treefern",
    "Tree_Fern",
    "arborescent_fern",
    "Tree_",
    "t (treefern)",
    # Woody
    "hardwood",
    "softwood",
    "Hardwood",
    "Softwood",
    "woody plant",
    "woody species",
    "woody plants",
    "woody",
    "w",
    # Mangrove
    "mangrove",
    "tree/mangrove/woody",
    "shrub/tree/mangrove/woody",
    # Hemiepiphytes
    "tree/hemiepiphyte",
    "hemiepiphyte",
    "hemiepiphite",
    "hemi-epiphyte",
    "hemi-epipjyte",
    "tree/hemiepiphyte/woody",
    # Tree forms/shapes
    "columnar",
    "conical",
    "pachicaul",
    # Crops
    "tree crop",
    # Spanish/other
    "arbol",
    # Tree combinations
    "tree/shrub",
    "tree / shrub",
    "tree shrub",
    "shrub/tree",
    "shrub / tree",
    "tree-shrub",
    "tree_shrub",
    "shrub_tree",
    "t/s",
    "t/tree",
    "tree/tree",
    "tree | shrub",
    "shrub | tree",
    "tree | tree",
    "tree|shrub",
    "tree shrub intermediate",
    "tree/large shrub",
    "tree/shrub/climber",
    "smtree",
    "tree/ /woody",
    "tree/woody",
    # TRY database patterns
    "trees/T/Tree",
    "Tree/Tree",
    "T/Tree",
    "T/tree/Tree",
    "tree/Tree",
    "trees/Shrub",
    "trees/T/tree/Tree",
    "trees/tree",
    "trees/tree/tree",
    "t/tree/tree",
    "Tree/Shrub",
    "trees/T",
    "trees/t",
    "trees/t/tree",
    # Roman numerals
    "Tree V",
    "Tree VII",
    "Tree IX",
    "Tree IV",
    "Tree III",
    "Tree II",
    "Tree I",
    # Abbreviations
    "tree (t)",
    "t resp. t",
    "a t",
    "st",
    "lt",
    "slt",
    # Forest types
    "USforestTrees",
    # Successional stages
    "early-successional",
    "mid-successional",
    "late-successional",
]

search_terms_grass = [
    # Herb terms
    "herb",
    "Herb",
    "HERB",
    "herbs",
    "herb.",
    "herb (h)",
    "h",
    "herbaceous",
    "herbaceous monocot",
    "herbaceous dicot",
    "herbaceous dicotyl",
    "herbaceous monocotyl",
    "herbaceous forb",
    "herbaceous legume",
    "herbaceous plant",
    "herbaceous/terrestrial herb",
    "terrestrial herb",
    "terrestrial_herb",
    "angherb",
    "herb_erect",
    "hierba_",
    # Forb
    "forb",
    "forbs",
    "Forb",
    "forb/herb",
    "frobs",
    "annual forb",
    "perennial forb",
    "variable forb",
    "leguminous forb",
    "forb-annual",
    "forb-biennial",
    # Graminoid/grass
    "graminoid",
    "Graminoid",
    "GRAMINOID",
    "graminoids",
    "gram",
    "gras",
    "grass",
    "Grass",
    "GRASS",
    "grasses",
    "g",
    "grass (poaceae only)",
    "c3 grass",
    "c4 grass",
    "c3.sedges",
    "grass (clonal)",
    "grass (tussock)",
    "tuss",
    "forage grass",
    "pasture grass",
    "prairie grass",
    "woody grass",
    "annual grass",
    "perennial grass",
    "Annual Grass",
    "Perennial Grass",
    "annual graminoid",
    "perennial graminoid",
    "variable graminoid",
    "graminoid-annual",
    "bunchgrasses",
    "rhizome grass",
    # Sedges
    "sedge",
    "SEDGE",
    "seges",
    # Grasses&sedges combined
    "Grasses&Sedges",
    "grasses&sedges",
    "g&s",
    "g&s resp. g&s",
    # Ferns
    "fern",
    "Fern",
    "FERN",
    "ferns",
    "fern ally",
    "fernally",
    "pteridophyte",
    "ferns and allies (lycophytes)",
    "fern or fern ally",
    "fern/non-woody",
    "fern/aquatic",
    "fern/aquatic/non-woody",
    "fern/woody",
    "fern/palmoid/woody",
    "fern/epiphyte/non-woody",
    "terrestrial_fern",
    "terrestrial fern",
    # Allies
    "clubmoss",
    "club moss",
    "horsetail",
    "lycopodiophyta",
    "selaginella",
    # Moss/bryophyte
    "moss",
    "turf moss",
    "bryophyte",
    "nonvascular",
    # Lichen
    "lichen",
    "lichenous",
    "lichen/non-woody",
    "foliose lichen",
    "fruticose lichen",
    # Life cycles
    "annual",
    "perennial",
    "therophyte",
    "hemicryptophyte",
    "annual-biennial",
    "perennial herb",
    "perennial herb/hemicryptophyte",
    "herbaceous perennial",
    "annual herb",
    "herbaceous annual",
    "herbaceous annual-biennial",
    "perennial graminoid",
    "perennial leguminous herb",
    "perennial grass/hemicryptophyte",
    # Geophyte
    "geophyte",
    "geop",
    "bulb",
    "rhizomatous",
    "rhiz",
    "rhizimatous",
    "rhizomatous/bulbs",
    "perennial, rhizomatous",
    # Bamboo
    "bamboo",
    # Aquatic herbs
    "aquatic",
    "aquatic forb",
    "hydrophyte",
    "hydrophytes",
    "waterplant",
    "aquatic fresh water",
    "aquatic, fresh water, floating",
    "submerged",
    "amphibiousubmerged",
    "emergent attached to the substrate",
    "submerged attached to the substrate",
    "floating leaves attached to the substrate",
    "free-floating plants",
    "herb/aquatic",
    "herb/aquatic/non-woody",
    "hydrophyte-annual",
    "hyd",
    "n hyd",
    "macrophyte",
    "aquativ",  # Added macrophyte
    # Rosette
    "rosette",
    "rosette plant",
    "semi-rosette",
    "rosette forb",
    # Rush
    "rush",
    # Grasslike
    "grasslike",
    # Legumes
    "legume",
    "legumes",
    "annual legume",
    "perennial legume",  # Added annual/perennial legume
    # Cereals
    "cereal",
    # Weeds
    "weed",
    "weedy",
    "weed, sedge",
    # Grassland
    "Grassland",
    "grassland",
    # Abbreviations
    "herb resp. h",
    "b h",
    "hel",
    "m hel",
    "n",
    "a",
    "hs",
    "hl",
    "ha",
    "hsl",
    "hst",
    "hslt",
    "hsa",
    "el",
    # Crop
    "crops",
    "crop",
    # Other
    "extensive-stemmed herb",
    "small_herb_",
    "annuals",  # Added
]

search_terms_shrub = [
    # Basic shrub terms
    "shrub",
    "Shrub",
    "SHRUB",
    "shrubs",
    "Shrubs",
    "s",
    "sh",
    "shru",
    "shrub (s)",
    "s resp. s",
    "srub",
    "shurb",
    "arbusto",
    # Subshrub
    "subshrub",
    "sub-shrub",
    "sub shrub",
    "subshrub (woody <1m)",
    "sub-shrub (chamaephyte)",
    "suffrutescent",
    "subshurb",
    # Dwarf shrub
    "dwarf shrub",
    "erect dwarf shrub",
    "prostrate dwarf shrub",
    "evergreen dwarf shrub",
    "drwarf shrub",
    "Dwarf Shrub community",
    "dwarf semishrub",  # Added
    # Shrub size
    "small shrub",
    "small_shrub",
    "large shrub",
    "low to high shrub",
    # Arborescent
    "arborescent shrubs",
    # Chamaephyte
    "chamaephyte",
    "chaemaephyte",
    "nano-chamaephyte",
    "shrub (chamaephyte)",
    "chaemaephyte | shrub",
    "shrub | chaemaephyte",
    "chaemaephyte | nano-chamaephyte",
    "chaemaephyte | vine",
    "chasmophyte",  # Added chasmophyte
    # Woody shrub
    "woody shrub",
    "shrub/woody",
    "shrub (woody 1-4m)",
    # Mallee
    "mallee",
    # Vine terms
    "vine",
    "Vine",
    "VINE",
    "v",
    "vines",
    "woody vine",
    "herbaceous vine",
    "liana/woody vine",
    "liana/woody vine|shrub",
    "climbing vine",
    "climbing_vine",
    "scandent_vine",
    "trailing_vine",
    "herbaceous vine|herb",
    "vine|herb",
    "herbaceous vine|liana/woody vine",
    "vine resp. climb",
    "shrub/vine",  # FIXED - added this term that was missing due to syntax error
    "w climb resp. v",
    "climb resp. v",
    "climb resp. climb",
    # Liana
    "liana",
    "Liana",
    "LIANA",
    "lianas",
    "l",
    "lian",
    "lianna",
    "woody liana",
    "l/woody liana",
    "lianas/lianna/woody liana",
    "lianas/woody liana",
    "lianas and climbers",
    "lianas/climber",
    "lianas/lianna",
    "lianas (wody climbers)",
    # Climber/creeper
    "climber",
    "climb",
    "climbing",
    "climber or creeper",
    "creeper",
    "climber/vine",
    "climber/liana",
    "climber/non-woody",
    "climber/woody",
    "climber/non-woody/woody",
    "twiner/climber.",
    "twiner/climber",
    "climber/palmoid/woody",  # Added
    # Epiphyte
    "epiphyte",
    "epiphytic",
    "epiphytes",
    "ep",
    "e",
    "epiphyte (e)",
    "epiphyte (mistletoe)",
    "epiphytic herb",
    "epiphytic_herb",
    "epiphyte_herb",
    "hemiepiphyte",
    "hemiepiphytes",
    "hemi-epiphyte",
    "epiphyte/hemiepiphyte",
    # Parasite
    "parasite",
    "hemi-parasite",
    "hemiparasite",
    "hemi-parasitic",
    "stem parasite",
    "root parasite",
    "woody parasite",
    "parasitic climber",
    "holoparasitic",
    "mistletoe",
    "parasite_epiphyte",
    # Succulent
    "succulent",
    "succulents",
    "succulent leaves",
    "succulent stems",
    "stem succulent",
    "leaf succulent",
    "rosette leaf succulent",
    "tall stem succulent",
    "stem and leaf succulent",
    "forb-succulent",
    "l succ",
    "i succ",
    "caudiciform",  # Added
    # Cactus
    "cactus",
    "cacti",
    "cact",
    "cacti (c)",
    "agaves&cacti",
    "agaves&cacti resp. agaves&cacti",
    # Carnivore
    "carnivore",
    "carnivorous",
    "carnivorous plant",
    "carnivorous plant resp. carnivorous plant",
    # Cushion
    "cushion",
    "cushion plant",
    "cushion forming",
    # Mat forming
    "mat forming",
    "mat-forming",
    # Aquatic
    "aquatic",
    "Shrub/Aquatic",
    "shrub/aquatic",
    "aquatic plants, submerged",
    "aquatic plants, floating",
    "semi-aquatic",
    "subaquatic",
    "aquatic/semi-aquatic",  # Added
    # Growth forms
    "stem erect",
    "erect",
    "stem ascending to prostrate",
    "ascending",
    "stem prostrate",
    "prostrate",
    "decumbent",
    "trailing",
    "semi-erect",
    "always climbing using tendrils",
    "always trailing",
    "always spread climbing",
    "always climbing using adhesive roots",
    "sometimes spread climbing",
    "trailing_herb",
    "trailing_herbaceous_vine",
    "trailing_plant",
    "terrestrial_trailing_plant",
    # Thicket
    "thicket forming",
    # Branching
    "bunch",
    "colonizing",
    "multiple stems",
    "multi-stem",
    "multiple stem",
    "single stem",
    "single stem or multi stemed",
    "branchy",
    # Rosette
    "ros",
    "rosette",
    # Scapose
    "scap",
    "scapose",
    # Caespitose
    "caesp",
    "caespitose",
    # Reptant
    "rept",
    "reptant",
    # Stoloniferous
    "stoloniferous",
    # Rhizomatous
    "rhizomatous",
    "rhiz",
    # Shrub combinations
    "shrub/tree",
    "tree/shrub",
    "tree / shrub",
    "shrub / tree",
    "shrub_tree",
    "tree_shrub",
    "shrub/tree intermediate",
    "shrub | tree",
    "tree | shrub",
    "shrub|tree",
    "tree|shrub",
    "Terrestrial_Shrub",
    "terrestrial_shrub",
    # Shrub land
    "Shrubland",
    "shrubland",
    "Shrub forest belt",
    # Shrub, Subshrub
    "Shrub, Subshrub",
    "shrubs and sub-shrubs",
    # Shrub/Parasite
    "Shrub/Parasite",
    # Abbreviations
    "halfshrubs resp. s",
    "shrub-like_clumps",
    "shrub-like_herb",
    "se",
    "rus",
    # Desert
    "desert sub-shrubs",
    # Free
    "free",
    "free-standing",
    # Climber variations
    "climber/epiphyte",
    "climber/epiphyte/parasitic",
    "climber/epiphyte/succulent",
    "climber/fern",
    "climber/hemiepiphyte",
    "climber/herb",
    "climber/succulent",
    "climber/free/liana",
    "climber/free/shrub",
    "climber/free/understory",
    "climber/free/vine",
    "climber/parasitic",
    "climbing_epiphyte",
    "climibing_herb",  # Added (typo in original data)
]

# Convert all terms to lowercase and create a dictionary
search_terms = {
    "Tree": set([t.lower() for t in search_terms_tree]),
    "Shrub": set([t.lower() for t in search_terms_shrub]),
    "Grass": set([t.lower() for t in search_terms_grass]),
}

log.info("Assigning PFTs to species using improved matching...")
# Assign PFTs using the improved priority-based matching
df["pft"] = df["X42"].apply(lambda x: assign_pft_priority(x, search_terms))

# Drop rows without PFT assignment
df = df.dropna(subset=["pft"])

log.info("Grouping by species and selecting majority PFT...")
# Group by species and select the most common PFT (majority count)
df = (
    df.groupby("AccSpeciesName")
    .agg(
        pft=("pft", lambda x: x.mode()[0]),
    )
    .reset_index()
)


In [47]:
final_pfts_v1 = growth_form_filt.dropna(subset=["pft"]).groupby("AccSpeciesName").agg(
    pft=("pft", lambda x: x.mode()[0])
).reset_index()
final_pfts_v2 = df.copy()

In [48]:
print(f"final_pfts_v1.shape: {final_pfts_v1.shape}")
print(f"final_pfts_v2.shape: {final_pfts_v2.shape}")


final_pfts_v1.shape: (227995, 2)
final_pfts_v2.shape: (231903, 2)


In [51]:
# unmatched = df[df["pft"].isna()]
# unmatched_count = unmatched.shape[0]
v1_count = final_pfts_v1.shape[0]
v2_count = final_pfts_v2.shape[0]
original_sp_count = growth_form_filt.drop_duplicates(subset=["AccSpeciesName"]).shape[0]

print(
    f"V1 matched species: {(v1_count / original_sp_count):.2%} ({v1_count:,} of {original_sp_count:,} rows)"
)
print(
    f"V2 matched species: {(v2_count / original_sp_count):.2%} ({v2_count:,} of {original_sp_count:,} rows)"
)


V1 matched species: 97.77% (227,995 of 233,201 rows)
V2 matched species: 99.44% (231,903 of 233,201 rows)


Drop rows that weren't matched.

In [47]:
df = df.dropna(subset=["pft"])
df.head()

Unnamed: 0,AccSpeciesID,AccSpeciesName,OrigValueStr,pft
1,9228,Calamagrostis lapponica,graminoid,Grass
2,10400,Carex capitata,sedge,Grass
3,10647,Carex rostrata,sedge,Grass
4,10654,Carex saxatilis,sedge,Grass
5,10701,Carex vaginata,sedge,Grass


Group by `AccSpeciesID` and take the mode of each group to determine the appropriate PFT by majority count.

In [None]:
dat_pft = df.groupby("AccSpeciesID").agg(
    AccSpeciesName=("AccSpeciesName", "first"),
    pft=("pft", lambda x: x.mode()[0])
).reset_index()
dat_pft.head()

Unnamed: 0,AccSpeciesID,AccSpeciesName,pft
0,2,Abarema adenophora,Tree
1,3,Abarema barbouriana,Tree
2,5,Abarema curvicarpa,Tree
3,6,Abarema jupunba,Tree
4,7,Abarema laeta,Tree


Sanity-check the final PFT assignments.

In [50]:
print(dat_pft[dat_pft["AccSpeciesName"].str.contains("Quercus")])
print(dat_pft[dat_pft["AccSpeciesName"].str.contains("Taraxacum")])
print(dat_pft[dat_pft["AccSpeciesName"].str.contains("Carex")])
print(dat_pft[dat_pft["AccSpeciesName"].str.contains("Ilex")])

        AccSpeciesID           AccSpeciesName   pft
32706          45311            Quercus acuta  Tree
32707          45312       Quercus acutissima  Tree
32708          45313           Quercus afares  Tree
32709          45314        Quercus agrifolia  Tree
32710          45315             Quercus alba  Tree
...              ...                      ...   ...
117560        374251           Quercus x vaga  Tree
117561        374252       Quercus x venulosa  Tree
117562        374253        Quercus x wagneri  Tree
117563        374254     Quercus x walteriana  Tree
117564        374255  Quercus x willdenowiana  Tree

[380 rows x 3 columns]
        AccSpeciesID              AccSpeciesName    pft
37724          53053          Taraxacum absurdum  Grass
37725          53054       Taraxacum acervatulum  Grass
37726          53055       Taraxacum acroglossum  Grass
37727          53056        Taraxacum acrophorum  Grass
37728          53057       Taraxacum acutangulum  Grass
...             

In [2]:
import pandas as pd
old_pfts = pd.read_csv(
    "/mnt/gsdata/projects/panops/panops-data-registry/data/try/try_pft_v1.csv", 
    encoding="latin-1"
)
old_pfts.head()

Unnamed: 0,AccSpeciesID,AccSpeciesName,pft
0,9228,Calamagrostis lapponica,Grass
1,10400,Carex capitata,Grass
2,10647,Carex rostrata,Grass
3,10654,Carex saxatilis,Grass
4,10701,Carex vaginata,Grass
