In [1]:
!pip install polars



In [2]:
import polars as pl
import seaborn as sns

# Explore Taxa in [TreeOfLife-10M](https://huggingface.co/datasets/imageomics/TreeOfLife-10M)

Exploring the taxa in the BeetlePalooza dataset that are in the TreeOfLife-10M training set. The CSVs (from [here](https://github.com/Imageomics/BeetlePalooza-2024/issues/10#issuecomment-2272214418)) were created by filtering the TreeOfLife-10M [catalog](https://huggingface.co/datasets/imageomics/TreeOfLife-10M/blob/main/metadata/catalog.csv) for the `<genus species>` pairs, and for just the `<genus>`, that were in the [Beetle CSV](https://huggingface.co/datasets/imageomics/2018-NEON-beetles/blob/main/BeetleMeasurements.csv).

In [3]:
# Catalog rows matched to the beetle data on genus alone.
df_genus = pl.read_csv(
    "data/Beetle_genus_overlap_tol.csv",
    low_memory=False,
)
# Catalog rows matched to the beetle data on the full "<genus species>" name.
df_sciName = pl.read_csv(
    "data/Beetle_sciName_overlap_tol.csv",
    low_memory=False,
)

In [4]:
# Peek at the first five rows of the genus-level overlap table.
df_genus.head(5)

split,treeoflife_id,eol_content_id,eol_page_id,bioscan_part,bioscan_filename,inat21_filename,inat21_cls_name,inat21_cls_num,kingdom,phylum,class,order,family,genus,species,common,sciName
str,str,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""train""","""7a5dcb0c-0bc9-4d20-8f07-64340a…",27776258.0,1014588.0,,,,,,"""Animalia""","""Arthropoda""","""Insecta""","""Coleoptera""","""Carabidae""","""Carabus""","""nemoralis""","""Wood Ground-beetle""","""Carabus nemoralis"""
"""train""","""5dd09d72-1aec-418f-b035-7203a9…",27813566.0,1015759.0,,,,,,"""Animalia""","""Arthropoda""","""Insecta""","""Coleoptera""","""Carabidae""","""Calathus""","""opaculus""","""Calathus opaculus""","""Calathus opaculus"""
"""train""","""65796502-0993-4f79-95cc-19221d…",20380197.0,1036420.0,,,,,,"""Animalia""","""Arthropoda""","""Insecta""","""Coleoptera""","""Carabidae""","""Cicindela""","""amargosae""","""Nye Tiger Beetle""","""Cicindela amargosae"""
"""train""","""ca4dbb32-2d98-449e-98c6-57d025…",20806174.0,1035962.0,,,,,,"""Animalia""","""Arthropoda""","""Insecta""","""Coleoptera""","""Carabidae""","""Dicaelus""","""teter""","""Dicaelus teter""","""Dicaelus teter"""
"""train""","""42711a3e-2f76-4ba8-a65b-3b8f62…",28558708.0,1037168.0,,,,,,"""Animalia""","""Arthropoda""","""Insecta""","""Coleoptera""","""Carabidae""","""Amara""","""gibba""","""Amara gibba""","""Amara gibba"""


In [6]:
# Taxonomic rank columns (plus the common name) inspected in the cells below.
TAXA_COLS = [
    "kingdom",
    "phylum",
    "class",
    "order",
    "family",
    "genus",
    "species",
    "common",
]

In [9]:
# Non-null entries per taxonomic column in the genus-level overlap table.
df_genus.select(TAXA_COLS).count()

kingdom,phylum,class,order,family,genus,species,common
u32,u32,u32,u32,u32,u32,u32,u32
27485,27485,27485,27485,27485,27577,27260,27577


In [12]:
# Distinct values per taxonomic column. Note that Series.n_unique() treats
# null as a value of its own, so columns with missing entries count one extra.
for taxon in TAXA_COLS:
    distinct_total = df_genus[taxon].n_unique()
    print(f"{taxon} has {distinct_total} unique values")

kingdom has 2 unique values
phylum has 2 unique values
class has 2 unique values
order has 2 unique values
family has 2 unique values
genus has 34 unique values
species has 1501 unique values
common has 1703 unique values


Why are there 2 kingdoms? Interesting.

In [15]:
# List the distinct kingdom values to see where the second "kingdom" comes from.
print(df_genus.get_column("kingdom").unique())

shape: (2,)
Series: 'kingdom' [str]
[
	"Animalia"
	null
]


In [19]:
# Distinct values per taxonomic column, dropping nulls first so that missing
# entries are not counted as a category.
for taxon in TAXA_COLS:
    distinct_non_null = df_genus[taxon].drop_nulls().n_unique()
    print(f"{taxon} has {distinct_non_null} unique values")

kingdom has 1 unique values
phylum has 1 unique values
class has 1 unique values
order has 1 unique values
family has 1 unique values
genus has 34 unique values
species has 1500 unique values
common has 1703 unique values


In [21]:
# Distinct family values (including null, if any rows lack a family).
df_genus.get_column("family").unique()

family
str
""
"""Carabidae"""


In [23]:
# Print every genus present in the genus-level overlap table, one per line.
for genus_name in df_genus.get_column("genus").unique():
    print(genus_name)

Amphasia
Sphaeroderus
Poecilus
Promecognathus
Cymindis
Omus
Discoderus
Euryderus
Cratacanthus
Calathus
Tetracha
Amara
Selenophorus
Chlaenius
Loxandrus
Bembidion
Pasimachus
Pterostichus
Elaphropus
Apristus
Axinopalpus
Agonum
Carabus
Syntomus
Scarites
Scaphinotus
Cyclotrachelus
Synuchus
Harpalus
Tetragonoderus
Cicindela
Calosoma
Dicaelus
Anisodactylus


[This](https://huggingface.co/datasets/imageomics/2018-NEON-beetles/blob/main/group_images/A00000003356.jpg) is the one genus not represented (Paraclivina), which is only labeled to genus level in the biorepository and has only one individual. The other "missing genus" comes from specimens labeled only to "Carabidae": that value wound up in the genus label, but it is not actually a genus; it's the family. These are from images A00000095905.jpg, A00000095368.jpg, and A00000055119.jpg, each of which has only two annotations (suggesting just one beetle per image).

In [25]:
# Report how many catalog rows (images) each genus has.
# value_counts() tallies rows per genus in a single pass, instead of filtering
# the whole frame once per genus and counting distinct rows.
# Assumes one catalog row per image (one treeoflife_id each) -- TODO confirm.
for genus, num_images in df_genus["genus"].value_counts().rows():
    print(f"{genus} has {num_images} images in training set")

Chlaenius has 1156 images in training set
Apristus has 44 images in training set
Pterostichus has 1871 images in training set
Calathus has 421 images in training set
Loxandrus has 224 images in training set
Scarites has 371 images in training set
Agonum has 1070 images in training set
Elaphropus has 150 images in training set
Synuchus has 76 images in training set
Cratacanthus has 23 images in training set
Dicaelus has 256 images in training set
Tetragonoderus has 110 images in training set
Bembidion has 1959 images in training set
Carabus has 3718 images in training set
Sphaeroderus has 142 images in training set
Cymindis has 168 images in training set
Omus has 141 images in training set
Poecilus has 412 images in training set
Pasimachus has 312 images in training set
Calosoma has 1337 images in training set
Promecognathus has 51 images in training set
Anisodactylus has 437 images in training set
Scaphinotus has 608 images in training set
Euryderus has 27 images in training set
Cicind

Let's save this info into a CSV for easier reference.

In [42]:
# Map each genus to its number of catalog rows (images).
# value_counts() yields (genus, count) pairs in one pass over the column;
# .rows() returns them as plain Python tuples, so dict() builds the mapping
# directly without re-filtering the frame for every genus.
# Assumes one catalog row per image -- TODO confirm.
genus_count = dict(df_genus["genus"].value_counts().rows())

In [43]:
# Show the genus names, then the matching image counts, as a quick sanity check.
print([*genus_count])
print(genus_count.values())

['Calathus', 'Axinopalpus', 'Loxandrus', 'Omus', 'Apristus', 'Amphasia', 'Cyclotrachelus', 'Sphaeroderus', 'Poecilus', 'Scarites', 'Tetracha', 'Euryderus', 'Cymindis', 'Dicaelus', 'Synuchus', 'Cratacanthus', 'Carabus', 'Anisodactylus', 'Pterostichus', 'Amara', 'Tetragonoderus', 'Promecognathus', 'Elaphropus', 'Selenophorus', 'Chlaenius', 'Cicindela', 'Pasimachus', 'Scaphinotus', 'Calosoma', 'Harpalus', 'Bembidion', 'Agonum', 'Discoderus', 'Syntomus']
dict_values([421, 50, 224, 141, 44, 75, 229, 142, 412, 371, 411, 27, 168, 256, 76, 23, 3718, 437, 1871, 1507, 110, 51, 150, 152, 1156, 8515, 312, 608, 1337, 1408, 1959, 1070, 52, 94])


In [44]:
# Two-column table: one row per genus with its image count.
count_df = pl.DataFrame(
    {
        "genus": [*genus_count],
        "num_images": [*genus_count.values()],
    }
)
count_df.head(5)

genus,num_images
str,i64
"""Calathus""",421
"""Axinopalpus""",50
"""Loxandrus""",224
"""Omus""",141
"""Apristus""",44


In [47]:
# Persist the per-genus image counts so they can be referenced without re-running the tally.
count_df.write_csv("data/genus_counts_inToL.csv")

In [13]:
# Non-null entries per taxonomic column in the scientific-name overlap table.
df_sciName.select(TAXA_COLS).count()

kingdom,phylum,class,order,family,genus,species,common
u32,u32,u32,u32,u32,u32,u32,u32
2968,2968,2968,2968,2968,2968,2968,2968


In [20]:
# Distinct non-null values per taxonomic column in the scientific-name overlap table.
for taxon in TAXA_COLS:
    distinct_non_null = df_sciName[taxon].drop_nulls().n_unique()
    print(f"{taxon} has {distinct_non_null} unique values")

kingdom has 1 unique values
phylum has 1 unique values
class has 1 unique values
order has 1 unique values
family has 1 unique values
genus has 30 unique values
species has 66 unique values
common has 67 unique values


# Let's look at the representation of the family (Carabidae) in the full catalog

In [29]:
# Read the full TreeOfLife-10M catalog, then keep only Carabidae rows that are
# not part of the "train_small" split.
df = (
    pl.read_csv(
        "https://huggingface.co/datasets/imageomics/TreeOfLife-10M/resolve/main/metadata/catalog.csv",
        low_memory=False,
    )
    .filter(pl.col("family") == "Carabidae")
    .filter(pl.col("split") != "train_small")
)

df.head(5)

split,treeoflife_id,eol_content_id,eol_page_id,bioscan_part,bioscan_filename,inat21_filename,inat21_cls_name,inat21_cls_num,kingdom,phylum,class,order,family,genus,species,common
str,str,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str
"""train""","""7a5dcb0c-0bc9-4d20-8f07-64340a…",27776258.0,1014588.0,,,,,,"""Animalia""","""Arthropoda""","""Insecta""","""Coleoptera""","""Carabidae""","""Carabus""","""nemoralis""","""Wood Ground-beetle"""
"""train""","""9a6a23d5-9010-461e-a62a-7376dd…",20124362.0,64677644.0,,,,,,"""Animalia""","""Arthropoda""","""Insecta""","""Coleoptera""","""Carabidae""","""Trechus""","""clairville""","""Trechus clairville"""
"""train""","""384583c1-7780-4d28-823a-34e0db…",28636308.0,52609173.0,,,,,,"""Animalia""","""Arthropoda""","""Insecta""","""Coleoptera""","""Carabidae""","""Platynus""","""jamaicae""","""Platynus jamaicae"""
"""train""","""5dd09d72-1aec-418f-b035-7203a9…",27813566.0,1015759.0,,,,,,"""Animalia""","""Arthropoda""","""Insecta""","""Coleoptera""","""Carabidae""","""Calathus""","""opaculus""","""Calathus opaculus"""
"""train""","""9014f4e1-721f-4153-ac58-529e20…",13606588.0,32200886.0,,,,,,"""Animalia""","""Arthropoda""","""Insecta""","""Coleoptera""","""Carabidae""","""Mecyclothorax""","""tutei""","""Mecyclothorax tutei"""


In [30]:
# Number of distinct genus values among Carabidae rows (null counts as one value).
df.get_column("genus").n_unique()

688

In [31]:
# Non-null entries per column for all Carabidae rows in the catalog.
df.count()

split,treeoflife_id,eol_content_id,eol_page_id,bioscan_part,bioscan_filename,inat21_filename,inat21_cls_name,inat21_cls_num,kingdom,phylum,class,order,family,genus,species,common
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
49220,49220,42424,42424,672,672,6124,6124,6124,49220,49220,49220,49220,49220,48581,47067,49220


In [32]:
# Non-null entries per column for the genus-level overlap table, for comparison
# with the full Carabidae counts.
df_genus.count()

split,treeoflife_id,eol_content_id,eol_page_id,bioscan_part,bioscan_filename,inat21_filename,inat21_cls_name,inat21_cls_num,kingdom,phylum,class,order,family,genus,species,common,sciName
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
27577,27577,21417,21417,36,36,6124,6124,6124,27485,27485,27485,27485,27485,27577,27260,27577,27260


In [33]:
# Carabidae rows with no genus label: non-null counts per column for that subset.
(
    df
    .filter(pl.col("genus").is_null())
    .count()
)

split,treeoflife_id,eol_content_id,eol_page_id,bioscan_part,bioscan_filename,inat21_filename,inat21_cls_name,inat21_cls_num,kingdom,phylum,class,order,family,genus,species,common
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
639,639,170,170,469,469,0,0,0,639,639,639,639,639,0,0,639


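This suggests that there are about 21K images representing genera not in the BeetleMeasurements CSV: the catalog has 48,581 Carabidae images labeled to genus, of which roughly 27.5K belong to genera that overlap with the beetle data (48,581 − 27,577 ≈ 21K).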