In [1]:
import os
import pandas as pd
from pprint import pprint
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
gold_csv = "./resources/testing_dataset/lightfootcat/lightfootcat_9_mu_cleaned.csv"
eval_csv = "./outputs/lightfootcat_with_extract/lightfootcat_0.csv"

In [3]:
df_gold = pd.read_csv(gold_csv)
df_eval = pd.read_csv(eval_csv)

In [4]:
pprint(df_gold.columns)
pprint(df_eval.columns)

Index(['Unnamed: 0', 'division', 'family_name', 'species_name',
       'number_of_folders', 'number_of_sheets', 'description'],
      dtype='object')
Index(['Unnamed: 0', 'division', 'family_name', 'species_name',
       'number_of_folders', 'number_of_sheets', 'description',
       'folders_and_sheets', 'species'],
      dtype='object')


In [5]:
eval_columns = ['family_name', 'species_name','description']

In [6]:
len(df_gold["family_name"])

1382

In [7]:
len(df_eval["family_name"])

1404

In [8]:
import unicodedata, re
def clean(x, is_species=False):

    if pd.isna(x):
        return x
    x = unicodedata.normalize("NFKD", str(x)).strip().lower().replace("\\", "")
    if is_species:
        return " ".join(x.split(" ")[:3])
    return x

species_clean = lambda x: clean(x, is_species=True)
df_gold["family_name"] = df_gold["family_name"].map(clean)
df_gold["species_name"] = df_gold["species_name"].map(species_clean)
df_gold["description"] = df_gold["description"].map(clean)

df_eval["family_name"] = df_eval["family_name"].map(clean)
df_eval["species_name"] = df_eval["species_name"].map(species_clean)
df_eval["description"] = df_eval["description"].map(clean)

In [9]:
df_gold.head()

Unnamed: 0.1,Unnamed: 0,division,family_name,species_name,number_of_folders,number_of_sheets,description
0,0,Dicotyledones,aceraceae,acer campestre l.,1.0,0.0,1 folder. acer campestre l.
1,1,Dicotyledones,aceraceae,acer pseudoplatanus l.,2.0,0.0,"folder 1. acer pseudo-platanus l. [g]. i. ""map..."
2,2,Dicotyledones,aceraceae,acer pseudoplatanus l.,2.0,0.0,folder 2. acer pseudo-platanus [ta]
3,3,Dicotyledones,amaranthaceae,"amaranthus lividus l.,",1.0,0.0,1 folder. amaranthus blitum [ta].
4,4,Dicotyledones,araliaceae,hedera helix l.,1.0,0.0,1 folder. hedera helix [ta]


In [10]:
" ".join(df_gold["species_name"][0].split(" ")[:3])

'acer campestre l.'

#### Precision, Recall and F1

Precision = TP / (TP + FP)
Recall = TP / (TP + FN)

F1 = 2 x Precision x Recall / (Precision + Recall)


### Family

Compare (family, species) pairs as a set

### Species Name

Compare species occurences as a set

In [11]:
def eval(gold_set, eval_set):
    tp = len(gold_set & eval_set)
    fp = len(eval_set - gold_set)
    fn = len(gold_set - eval_set)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1

In [12]:
species_gold = set(df_gold["species_name"].dropna().unique())
species_eval = set(df_eval["species_name"].dropna().unique())

sp_precision, sp_recall, sp_f1 = eval(species_gold, species_eval)
pprint(f"Species Name - Precision: {sp_precision:.4f}, Recall: {sp_recall:.4f}, F1 Score: {sp_f1:.4f}")

'Species Name - Precision: 0.8751, Recall: 0.8741, F1 Score: 0.8746'


In [13]:
family_gold = set(tuple(x) for x in df_gold[["family_name", "species_name"]].dropna().values)
family_eval = set(tuple(x) for x in df_eval[["family_name", "species_name"]].dropna().values)
fam_precision, fam_recall, fam_f1 = eval(family_gold, family_eval)
pprint(f"Family Name - Precision: {fam_precision:.4f}, Recall: {fam_recall:.4f}, F1 Score: {fam_f1:.4f}")

'Family Name - Precision: 0.8630, Recall: 0.8630, F1 Score: 0.8630'


In [14]:
df_gold.head()

Unnamed: 0.1,Unnamed: 0,division,family_name,species_name,number_of_folders,number_of_sheets,description
0,0,Dicotyledones,aceraceae,acer campestre l.,1.0,0.0,1 folder. acer campestre l.
1,1,Dicotyledones,aceraceae,acer pseudoplatanus l.,2.0,0.0,"folder 1. acer pseudo-platanus l. [g]. i. ""map..."
2,2,Dicotyledones,aceraceae,acer pseudoplatanus l.,2.0,0.0,folder 2. acer pseudo-platanus [ta]
3,3,Dicotyledones,amaranthaceae,"amaranthus lividus l.,",1.0,0.0,1 folder. amaranthus blitum [ta].
4,4,Dicotyledones,araliaceae,hedera helix l.,1.0,0.0,1 folder. hedera helix [ta]


In [15]:
len(species_gold)

882

In [16]:
len(species_eval)

881

In [17]:
species_eval

{'acer campestre l.',
 'acer pseudoplatanus l.',
 'aceras anthropophorum (l.)',
 'acinos arvensis (lam.)',
 'actaea spicata l.',
 'adonis annua l.',
 'aegopodium podagraria l.',
 'agrimonia eupatoria l.',
 'agropyron caninum (l.)',
 'agropyron junceiforme (a.&d.löve)',
 'agropyron pungens (pers.)',
 'agropyron repens (l.)',
 'aira praecox l.',
 'ajuga chamaepitys (l.)',
 'ajuga reptans l.',
 'alchemilla alpina l.',
 'alchemilla glabra neygenfind',
 'alchemilla vestita (buser)',
 'alchemilla xanthochlora rothm.',
 'alliaria petiolata (bieb.)',
 'allium oleraceum l.',
 'allium schoenoprasum l.',
 'allium scorodoprasum l.',
 'allium ursinum l.',
 'althaea officinalis l.',
 'amaranthus lividus l.,',
 'ammophila arenaria (l.)',
 'anacamptis pyramidalis (l.)',
 'anagallis minima (l.)',
 'anagallis monelli l.',
 'anaphalis margaritacea (l.)',
 'anemone nemorosa l.',
 'antennaria dioica (l.)',
 'antennaria neodioica greene',
 'anthemis arvensis l.',
 'anthemis cotula l.',
 'anthoceros spp.',


In [18]:
species_gold

{'acer campestre l.',
 'acer pseudoplatanus l.',
 'aceras anthropophorum (l.)',
 'acinos arvensis (lam.)',
 'actaea spicata l.',
 'adonis annua l.',
 'aegopodium podagraria l.',
 'agrimonia eupatoria l.',
 'agropyron caninum (l.)',
 'agropyron junctiforme -',
 'agropyron pungens (pers.)',
 'agropyron repens (l.)',
 'aira praecox l.',
 'ajuga chamaepitys (l.)',
 'ajuga reptans l.',
 'alchemilla alpina l.',
 'alchemilla glabra neygenfind',
 'alchemilla vestita (buser)',
 'alchemilla xanthochlora rothm.',
 'alliaria petiolata (bieb.)',
 'allium oleraceum l.',
 'allium schoenoprasum l.',
 'allium scorodoprasum l.',
 'allium ursinum l.',
 'alnus glutinosa (l.)',
 'althaea officinalis l.',
 'amaranthus lividus l.,',
 'ammophila arenaria (l.)',
 'amostris minima (l.)',
 'anacamptis pyramidalis (l.)',
 'anagallis minima (l.)',
 'anagallis monelli l.',
 'anaphalis margaritacea (l.)',
 'anemone nemorosa l.',
 'antennaria dioica (l.)',
 'anthemis arvensis l.',
 'anthemis cotula l.',
 'anthoceros 

In [None]:
def get_description_pairs(gold_set, eval_set, pairs):

    desc_pairs = ()

    for pair in pairs:
        if pair[0] in gold_set and pair[1] in eval_set:
            yield pair

In [43]:
pairs = (family_gold & family_eval)

gold_desc = []
eval_desc = []

for pair in pairs:
    fam_ = pair[0]
    sp_ = pair[1]
    desc_gold = df_gold[(df_gold["family_name"] == fam_) & (df_gold["species_name"] == sp_)]["description"].values[0]
    desc_eval = df_eval[(df_eval["family_name"] == fam_) & (df_eval["species_name"] == sp_)]["description"].values[0]

    if pd.isna(desc_gold) or pd.isna(desc_eval):
        continue
    gold_desc.append(desc_gold)
    eval_desc.append(desc_eval)

In [44]:
gold_desc

['teucrium - scorodonia ta.',
 'folder 1. potentilla verna aschers., flora scotica ed. 1, 270 (1777). "vincents rock & gloddaeth" [jl]. "potentill: vern: scotland. mr. yald" [jl]. "ne 26. torment. var. with 5 pet.-found near the top of creg-chaillech" [stuart]; *potentilla verna [perennial, may and june]" [jl]. [probably contains a mixture of p. tabernaemontani htfoot and p. crantzit (crantz) g.beck ex fritsch.]',
 '1 folder. ononis spinosa ta.',
 'folder 1. orchis bifolia [g], flora scotica ed. 1, 512 (1777)',
 'folder 1. tormentilla reptans l., flora scotica ed. 1, 273 (1777). "tormentilla reptans near newent. solander" [jl]. "pentaphyllum reptans, in the road to percy lodge" [jl]. [specimens marked *x" in pencil are p. anglica or hybrids].',
 'triticum repens [ta], flora scotica ed. 1, 109 (1777). i. cites lin.; huds.; ray\'s syn. 390. \r\n ii. "dogs-grass or couch-grass. in fields & hedges almost everywhere, from june to august. it appears from the sample marked, à [var. aristatum

In [45]:
from jiwer import wer, cer

In [46]:
wer(gold_desc, eval_desc)

0.23712227121195828

In [47]:
cer(gold_desc, eval_desc)

0.18260892485305358

In [56]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2','rougeL'], use_stemmer=True)
scores = [scorer.score(g, e)["rougeL"] for g, e in zip(gold_desc, eval_desc)]


precision = []
recall = []
f1 = []

for g, e in zip(gold_desc, eval_desc):
    score = scorer.score(g, e)["rougeL"]
    precision.append(score.precision)
    recall.append(score.recall)
    f1.append(score.fmeasure)

precision = sum(precision) / len(precision)
recall = sum(recall) / len(recall)
f1 = sum(f1) / len(f1)

In [57]:
print(f"Description - Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

Description - Precision: 0.8995, Recall: 0.9266, F1 Score: 0.8914


In [58]:
print("Final Summary:")
print(f"Description - Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
print(f"Species Name - Precision: {sp_precision:.4f}, Recall: {sp_recall:.4f}, F1 Score: {sp_f1:.4f}")
print(f"Family Name - Precision: {fam_precision:.4f}, Recall: {fam_recall:.4f}, F1 Score: {fam_f1:.4f}")

Final Summary:
Description - Precision: 0.8995, Recall: 0.9266, F1 Score: 0.8914
Species Name - Precision: 0.8751, Recall: 0.8741, F1 Score: 0.8746
Family Name - Precision: 0.8630, Recall: 0.8630, F1 Score: 0.8630
