# Assigning PFTs to TRY species

Explore the contents of the `TRY_Life_Forms` archive.

In [1]:
import os
from pathlib import Path
import zipfile

# Set working directory
os.chdir("../../data/try")
zf_path = Path("./TRY_Life_Forms.zip")

# Open the zip file and read contents
with zipfile.ZipFile(zf_path, 'r') as zip_ref:
    zip_ref.printdir()

File Name                                             Modified             Size
TRY_Life_Forms/                                2024-10-25 19:03:34            0
TRY_Life_Forms/19233.txt                       2024-10-25 19:02:40  10325171130
TRY_Life_Forms/19233_28012022062655.zip        2024-10-25 19:03:28    189898347
TRY_Life_Forms/TRY_Data_Release_Notes.pdf      2024-10-25 19:03:32       112010
TRY_Life_Forms/TRY_Intellectual_Property_Guidelines.pdf 2024-10-25 19:03:34       165948


Read the contents of `19233.txt` into memory (it's large, around 10GB)

In [34]:
import dask.dataframe as dd
import pandas as pd

pd.set_option("display.max_columns", None)

with zipfile.ZipFile(zf_path, "r") as zip_ref:
    with zip_ref.open("TRY_Life_Forms/19233.txt") as f:
        df = pd.read_csv(f, delimiter="\t", encoding="ISO-8859-1").drop(columns=["Unnamed: 27"])

df.head()

  df = pd.read_csv(f, delimiter="\t", encoding="ISO-8859-1").drop(columns=["Unnamed: 27"])


Unnamed: 0,LastName,FirstName,DatasetID,Dataset,SpeciesName,AccSpeciesID,AccSpeciesName,ObservationID,ObsDataID,TraitID,TraitName,DataID,DataName,OriglName,OrigValueStr,OrigUnitStr,ValueKindName,OrigUncertaintyStr,UncertaintyName,Replicates,StdValue,UnitName,RelUncertaintyPercent,OrigObsDataID,ErrorRisk,Reference,Comment
0,Cornelissen,Johannes,1,Abisko & Sheffield Database,Bartsia alpina,6930,Bartsia alpina,15149,432413,42.0,Plant growth form,47,Plant growth form,growth form,HEMI-PARASITE,,,,,,,,,,,"Cornelissen, J. H. C., H. M. Quested, D. Gwynn...","HEMI-PARASITE, GRAMINOID, SEDGE, FERN ALLY, CA..."
1,Cornelissen,Johannes,1,Abisko & Sheffield Database,Calamagrostis lapponica,9228,Calamagrostis lapponica,15152,432421,42.0,Plant growth form,47,Plant growth form,growth form,GRAMINOID,,,,,,,,,,,"Cornelissen, J. H. C., H. M. Quested, D. Gwynn...","HEMI-PARASITE, GRAMINOID, SEDGE, FERN ALLY, CA..."
2,Cornelissen,Johannes,1,Abisko & Sheffield Database,Carex capitata,10400,Carex capitata,15154,432426,42.0,Plant growth form,47,Plant growth form,growth form,SEDGE,,,,,,,,,,,"Cornelissen, J. H. C., H. M. Quested, D. Gwynn...","HEMI-PARASITE, GRAMINOID, SEDGE, FERN ALLY, CA..."
3,Cornelissen,Johannes,1,Abisko & Sheffield Database,Carex rostrata,10647,Carex rostrata,15155,432428,42.0,Plant growth form,47,Plant growth form,growth form,SEDGE,,,,,,,,,,,"Cornelissen, J. H. C., H. M. Quested, D. Gwynn...","HEMI-PARASITE, GRAMINOID, SEDGE, FERN ALLY, CA..."
4,Cornelissen,Johannes,1,Abisko & Sheffield Database,Carex saxatilis,10654,Carex saxatilis,15156,432430,42.0,Plant growth form,47,Plant growth form,growth form,SEDGE,,,,,,,,,,,"Cornelissen, J. H. C., H. M. Quested, D. Gwynn...","HEMI-PARASITE, GRAMINOID, SEDGE, FERN ALLY, CA..."


We'll only need a few columns, so drop the unneeded ones.

In [35]:
df = df[["AccSpeciesID", "AccSpeciesName", "OrigValueStr"]].assign(
    OrigValueStr=lambda _df: _df["OrigValueStr"].str.lower()
)
df.head()

Unnamed: 0,AccSpeciesID,AccSpeciesName,OrigValueStr
0,6930,Bartsia alpina,hemi-parasite
1,9228,Calamagrostis lapponica,graminoid
2,10400,Carex capitata,sedge
3,10647,Carex rostrata,sedge
4,10654,Carex saxatilis,sedge


Optimize the columns with pyarrow.

In [26]:
df["OrigValueStr"].nunique() / len(df)

0.06686574442643144

In [36]:
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21137804 entries, 0 to 21137803
Data columns (total 3 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   AccSpeciesID    int64 
 1   AccSpeciesName  object
 2   OrigValueStr    object
dtypes: int64(1), object(2)
memory usage: 3.0 GB


In [42]:
df = df.astype(
    {
        "AccSpeciesID": "uint32[pyarrow]",
        "AccSpeciesName": "string[pyarrow]",
        "OrigValueStr": "category",
    }
)
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21137804 entries, 0 to 21137803
Data columns (total 3 columns):
 #   Column          Dtype          
---  ------          -----          
 0   AccSpeciesID    uint32[pyarrow]
 1   AccSpeciesName  string         
 2   OrigValueStr    category       
dtypes: category(1), string(1), uint32[pyarrow](1)
memory usage: 810.5 MB


Define the search terms to encompass the different PFTs.

In [43]:
search_terms_tree = [
    "tree",
    "Tree",
    "TREE",
    "seedling",
    "hardwood",
    "softwood",
    "Hardwood",
    "Softwood",
    "Tree_Fern",
    "Tree_",
    "Small_Tree",
    "trees/T/Tree",
    "Tree/Tree",
    "Tree V",
    "Tree VII",
    "Tree IX",
    "Tree V",
    "Tree IV",
    "Tree III",
    "Tree II",
    "Tree I",
    "Tree/Treelet",
    "Treen",
    "T/Tree",
    "T/tree/Tree",
    "tree/Tree",
    "trees/Shrub",
    "trees/T/tree/Tree",
    "Tree/Shrub",
    "Trees",
    "trees",
    "Tree_Shrub",
    "Shrub_Tree",
    "Tree | Shrub",
    "Shrub | Tree",
    "Tree | Tree",
    "USforestTrees",
]

search_terms_grass = [
    "herb",
    "Herb",
    "HERB",
    "herbs",
    "graminoid",
    "Graminoid",
    "GRAMINOID",
    "Forb",
    "forb",
    "Grasses&Sedges",
    "Grass",
    "grass",
    "GRASS",
    "sedge",
    "SEDGE",
    "fern",
    "Fern",
    "FERN",
    "Grassland",
    "Annual Grass",
    "Perennial Grass",
    "grassland",
]

search_terms_shrub = [
    "shrub",
    "Shrub",
    "SHRUB",
    "seedling",
    "vine",
    "Vine",
    "VINE",
    "liana",
    "Liana",
    "LIANA",
    "Terrestrial_Shrub",
    "Shrub forest belt",
    "Dwarf Shrub community",
    "Shrub/Aquatic",
    "Shrub/Aquatic",
    "Shrub/Parasite",
    "Shrubs",
    "Shrubland",
    "shrubland",
    "Shrub, Subshrub",
    "Shrub",
]

Convert all search terms to lowercase and remove redundant terms.

In [44]:
search_terms_tree = set([t.lower() for t in search_terms_tree])
search_terms_grass = set([t.lower() for t in search_terms_grass])
search_terms_shrub = set([t.lower() for t in search_terms_shrub])

Add new standardized PFT column to `df`.

In [None]:
# Replace values in OrigValueStr column
df.loc[df["OrigValueStr"].isin(search_terms_tree), "pft"] = "Tree"
df.loc[df["OrigValueStr"].isin(search_terms_grass), "pft"] = "Grass"
df.loc[df["OrigValueStr"].isin(search_terms_shrub), "pft"] = "Shrub"

In [46]:
df.head()

Unnamed: 0,AccSpeciesID,AccSpeciesName,OrigValueStr,pft
0,6930,Bartsia alpina,hemi-parasite,
1,9228,Calamagrostis lapponica,graminoid,Grass
2,10400,Carex capitata,sedge,Grass
3,10647,Carex rostrata,sedge,Grass
4,10654,Carex saxatilis,sedge,Grass


Drop rows that weren't matched.

In [47]:
df = df.dropna(subset=["pft"])
df.head()

Unnamed: 0,AccSpeciesID,AccSpeciesName,OrigValueStr,pft
1,9228,Calamagrostis lapponica,graminoid,Grass
2,10400,Carex capitata,sedge,Grass
3,10647,Carex rostrata,sedge,Grass
4,10654,Carex saxatilis,sedge,Grass
5,10701,Carex vaginata,sedge,Grass


Group by `AccSpeciesID` and take the mode of each group to determine the appropriate PFT by majority count.

In [None]:
dat_pft = df.groupby("AccSpeciesID").agg(
    AccSpeciesName=("AccSpeciesName", "first"),
    pft=("pft", lambda x: x.mode()[0])
).reset_index()
dat_pft.head()

Unnamed: 0,AccSpeciesID,AccSpeciesName,pft
0,2,Abarema adenophora,Tree
1,3,Abarema barbouriana,Tree
2,5,Abarema curvicarpa,Tree
3,6,Abarema jupunba,Tree
4,7,Abarema laeta,Tree


Sanity-check the final PFT assignments.

In [50]:
print(dat_pft[dat_pft["AccSpeciesName"].str.contains("Quercus")])
print(dat_pft[dat_pft["AccSpeciesName"].str.contains("Taraxacum")])
print(dat_pft[dat_pft["AccSpeciesName"].str.contains("Carex")])
print(dat_pft[dat_pft["AccSpeciesName"].str.contains("Ilex")])

        AccSpeciesID           AccSpeciesName   pft
32706          45311            Quercus acuta  Tree
32707          45312       Quercus acutissima  Tree
32708          45313           Quercus afares  Tree
32709          45314        Quercus agrifolia  Tree
32710          45315             Quercus alba  Tree
...              ...                      ...   ...
117560        374251           Quercus x vaga  Tree
117561        374252       Quercus x venulosa  Tree
117562        374253        Quercus x wagneri  Tree
117563        374254     Quercus x walteriana  Tree
117564        374255  Quercus x willdenowiana  Tree

[380 rows x 3 columns]
        AccSpeciesID              AccSpeciesName    pft
37724          53053          Taraxacum absurdum  Grass
37725          53054       Taraxacum acervatulum  Grass
37726          53055       Taraxacum acroglossum  Grass
37727          53056        Taraxacum acrophorum  Grass
37728          53057       Taraxacum acutangulum  Grass
...             

In [2]:
import pandas as pd
old_pfts = pd.read_csv(
    "/mnt/gsdata/projects/panops/panops-data-registry/data/try/try_pft_v1.csv", 
    encoding="latin-1"
)
old_pfts.head()

Unnamed: 0,AccSpeciesID,AccSpeciesName,pft
0,9228,Calamagrostis lapponica,Grass
1,10400,Carex capitata,Grass
2,10647,Carex rostrata,Grass
3,10654,Carex saxatilis,Grass
4,10701,Carex vaginata,Grass
