In [2]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import json
import os
from matplotlib_venn import venn2
from pathlib import Path
from PIL import Image
from IPython.display import display
import random
import math

## Version 1

In [None]:
# Root of the lepinet data tree. It defaults to the original workstation layout;
# set the LEPINET_DATA_DIR environment variable to run the notebook elsewhere.
data_root = os.environ.get("LEPINET_DATA_DIR", "/home/george/codes/lepinet/data")
flemming_dir = f"{data_root}/flemming"

prd_path = f"{flemming_dir}/preds_04-lepi-prod_model1.csv"            # model predictions (CSV)
lbl_path = f"{flemming_dir}/normalized.csv"                           # ground truth labels (CSV)
prd_lbl_path = f"{flemming_dir}/preds_04-lepi-prod_model1_label.csv"  # predictions with the label column added
img_path = f"{flemming_dir}/images"                                   # image folders, listed below by folder name
name2id_path = f"{flemming_dir}/name2id.csv"                          # folder name -> GBIF keys table
hier_path = f"{data_root}/lepi/hierarchy_all.json"                    # model hierarchy (JSON)

Load ground truth labels

In [None]:
# Ground truth table, one row per labelled taxon.
lbl = pd.read_csv(filepath_or_buffer=lbl_path)

In [None]:
lbl.head(20)

In [None]:
"genus".upper()

In [None]:
# Manual probe of the GBIF match endpoint for a single genus.
name = "Orthosia"
rank = "genus"
# x=requests.get(f"https://api.gbif.org/v1/species/suggest?higherTaxonKey=797&q={name}&rank={rank.upper()}")
url = f"https://api.gbif.org/v1/species/match?order=Lepidoptera&{rank}={name}"
x = requests.get(url)
x.json()

In [None]:
def get_key(name, rank, higherTaxonKey='797'):
    """Return the GBIF taxon key of the first suggestion for ``name`` at ``rank``.

    Parameters
    ----------
    name : str
        Taxon name sent as the ``q`` query parameter.
    rank : str
        Taxonomic rank; it is upper-cased before being sent.
    higherTaxonKey : str, default '797'
        Key of the higher taxon the suggestions are restricted to
        (presumably the GBIF key of the order Lepidoptera -- TODO confirm).

    Returns
    -------
    The ``key`` field of the first element of the GBIF reply.

    Raises
    ------
    IndexError
        If GBIF returns no suggestion at all.
    requests.HTTPError
        If GBIF answers with an HTTP error status.

    Notes
    -----
    Requests the GBIF ``species/suggest`` API.

    If GBIF API returns more than one element, display a warning and return
    the first element from the list.
    """
    # The query is passed through `params` so that every value is URL-encoded
    # and `higherTaxonKey` is sent as the parameter *name* (the previous
    # f-string produced "?797=797", silently dropping the restriction).
    response = requests.get(
        "https://api.gbif.org/v1/species/suggest",
        params={"higherTaxonKey": higherTaxonKey, "q": name, "rank": rank.upper()},
        timeout=30,
    )
    response.raise_for_status()
    suggestions = response.json()
    if not suggestions:
        raise IndexError(f"GBIF returned no suggestion for {name!r} at rank {rank.upper()!r}.")
    if len(suggestions) > 1:
        print(f"Warning: more than one element found in GBIF reply: {suggestions}")
    return suggestions[0]['key']

# Resolve every labelled species to its accepted GBIF keys (family, genus, species).
name2id = {
    column: []
    for column in ('verbatimScientificName', 'familyKey', 'genusKey', 'speciesKey')
}
for i, row in lbl.iterrows():
    # Only species-level labels are kept; anything else is reported and skipped.
    if row['rank'] != 'SPECIES':
        print(f"Warning: wrong rank for row {i} : {row}.")
        continue

    # Prefer the accepted usage key when GBIF provides one.
    if pd.isna(row['acceptedUsageKey']):
        speciesKey = row['usageKey']
    else:
        speciesKey = row['acceptedUsageKey']
    genusKey = get_key(row['genus'], 'genus')
    familyKey = get_key(row['family'], 'family')

    resolved = {
        'verbatimScientificName': row['verbatimScientificName'],
        'familyKey': familyKey,
        'genusKey': genusKey,
        'speciesKey': speciesKey,
    }
    for column, value in resolved.items():
        name2id[column].append(value)

print(name2id)

New attempt: use the GBIF API's fuzzy name-matching endpoint (`species/match`) directly.

In [None]:
name = "Tethea or (Denis & Schiffermüller), 1776"
# Pass the query through `params` so that requests URL-encodes it: the name
# contains a raw '&', which in a hand-built URL ends the scientificName
# parameter after "(Denis " and sends the rest as a separate, bogus parameter.
x = requests.get(
    "https://api.gbif.org/v1/species/match",
    params={
        "order": "Lepidoptera",
        "scientificName": name,
        "strict": "True",
        "rank": "SPECIES",
    },
)
x.json()

In [None]:
def get_key(scientificName=None, usageKey=None, rank='SPECIES', order='Lepidoptera'):
    """Return the GBIF ``species/match`` reply for a scientific name or usage key.

    Parameters
    ----------
    scientificName : str, optional
        Name to match. At least one of ``scientificName`` and ``usageKey``
        must be given.
    usageKey : str or int, optional
        GBIF usage key to look up.
    rank : str or None, default 'SPECIES'
        Sent as the ``rank`` query parameter; omitted when None.
    order : str or None, default 'Lepidoptera'
        Sent as the ``order`` query parameter; omitted when None.

    Returns
    -------
    The decoded JSON body of the GBIF reply.

    Raises
    ------
    AssertionError
        If neither ``scientificName`` nor ``usageKey`` is given.
    requests.HTTPError
        If GBIF answers with an HTTP error status.

    Notes
    -----
    Requests GBIF API.
    """
    assert usageKey is not None or scientificName is not None, "One of scientificName or usageKey must be defined."

    # Only the arguments that were actually provided are sent. Going through
    # `params` (instead of string concatenation) URL-encodes the values, which
    # matters for names containing '&', spaces or non-ASCII characters.
    candidates = {
        "usageKey": usageKey,
        "scientificName": scientificName,
        "rank": rank,
        "order": order,
    }
    params = {key: value for key, value in candidates.items() if value is not None}

    response = requests.get("https://api.gbif.org/v1/species/match", params=params, timeout=30)
    response.raise_for_status()
    return response.json()

# Bug fix with GBIF Species API: matching the folder name 'Tethea or' gives the
# order instead of the species, so such folders are looked up by usage key.
usage_key_overrides = {'Tethea or': "5142971"}

# list folder names (sorted, because os.listdir returns them in arbitrary order)
foldernames = sorted(os.listdir(img_path))

name2id = {
    'verbatimScientificName':[],
    'familyKey':[],
    'genusKey':[],
    'speciesKey':[]
}

for f in foldernames:
    # Apply the override *before* querying by name: this avoids a wasted
    # request and a spurious "Wrong rank" warning for the overridden folders.
    if f in usage_key_overrides:
        k = get_key(usageKey=usage_key_overrides[f])
    else:
        k = get_key(scientificName=f)
    # .get(): a reply without a 'rank' field is reported here rather than
    # raising KeyError before the warning can be printed.
    if k.get('rank') != 'SPECIES':
        print(f"Wrong rank for {f} : {k}")
    name2id['verbatimScientificName'].append(f)
    name2id['familyKey'].append(k['familyKey'])
    name2id['genusKey'].append(k['genusKey'])
    # Prefer the accepted usage key when the reply provides one.
    speciesKey = k['acceptedUsageKey'] if 'acceptedUsageKey' in k else k['usageKey']
    name2id['speciesKey'].append(speciesKey)

In [None]:
# Tabulate the folder-name -> GBIF key mapping and persist it.
df = pd.DataFrame(data=name2id)
df.to_csv(path_or_buf=name2id_path, index=False)

In [None]:
# Show the first 20 rows of the mapping table.
df.head(n=20)

Let's draw Venn diagrams comparing the target vocabulary with the model vocabulary.

In [None]:
with open(hier_path, "r") as hier_file:
    hier = json.load(hier_file)

# The hierarchy is nested as family -> genus -> iterable of species.
model_families = list(hier)
model_genuses = [genus for genera in hier.values() for genus in genera]
model_species = [
    species
    for genera in hier.values()
    for members in genera.values()
    for species in members
]

In [None]:
# Vocabulary sizes: (families, genera, species).
tuple(map(len, (model_families, model_genuses, model_species)))

In [None]:
# Peek at one model species key -- presumably to compare its format with the
# target keys shown in the next cell. Only valid while model_species is still
# a list: a later cell rebinds the name to a set, which cannot be indexed.
model_species[0]

In [None]:
# Peek at one target species key (first entry of the name2id mapping).
name2id['speciesKey'][0]

In [None]:
# Species overlap: model keys are converted to int before being compared with the target keys.
model_species = {int(key) for key in model_species}
target_species = set(name2id['speciesKey'])
venn2((model_species, target_species), set_labels=("Model vocab", "Target vocab"))
plt.show()

In [None]:
# Genus overlap, built the same way as the species diagram.
model_genuses = {int(key) for key in model_genuses}
target_genuses = set(name2id['genusKey'])
venn2((model_genuses, target_genuses), set_labels=("Model vocab", "Target vocab"))
plt.show()

In [None]:
# Target species that are absent from the model vocabulary.
forgotten_species = target_species.difference(model_species)

def gbif_match(id):
    """Return the decoded GBIF ``species/match`` reply for the usage key ``id``."""
    response = requests.get(f"https://api.gbif.org/v1/species/match?usageKey={id}")
    reply = response.json()
    return reply

# Look up every missing species on GBIF to see what it is.
list(map(gbif_match, forgotten_species))


Get the model predictions

A model prediction is a tuple (level, prediction, confidence).

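The level is one of (None, species, genus, family); in the predictions table used below, the three taxonomic levels are encoded as 0 = species, 1 = genus, 2 = family.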

None means that the model did not find anything in the images.

In [None]:
# Raw model predictions.
prd = pd.read_csv(filepath_or_buffer=prd_path)

In [None]:
# Save the first six prediction rows as a small sample file.
prd.iloc[:6].to_csv(
    "/home/george/codes/lepinet/data/flemming/preds_04-lepi-prod_model1-sample.csv",
    index=False,
)

Let's add the label column

In [None]:
def filename2taxa(filename: str, df: pd.DataFrame) -> dict:
    """Return the name2id row matching the folder that contains ``filename``.

    The name of the file's parent folder is looked up in the
    ``verbatimScientificName`` column of ``df``; the first matching row is
    returned as a flat ``{column: value}`` dict.

    Example:
    In: 'flemming/images/Orthosia incerta/crop_TRAPNAME.jpg', name2id DataFrame
    Out: {'verbatimScientificName': 'Orthosia incerta', 'familyKey': 7015, 'genusKey': 1798902, 'speciesKey': 1799135}

    Raises:
    IndexError: if no row of ``df`` matches the folder name.
    """
    folder = Path(filename).parent.name
    matches = df[df['verbatimScientificName'] == folder]
    if matches.empty:
        # Same exception type as the bare .iloc[0] would raise, but naming the culprit.
        raise IndexError(
            f"No row with verbatimScientificName == {folder!r} (from filename {filename!r})."
        )
    return matches.iloc[0].to_dict()

# Which name2id column holds the ground truth for each prediction level.
level2rank = {
    0: 'speciesKey',
    1: 'genusKey',
    2: 'familyKey',
}

# One label per prediction row: the key of the image's folder taxon at the row's level.
lbls = [
    filename2taxa(filename, df=df)[level2rank[level]]
    for filename, level in zip(prd['filename'], prd['level'])
]

lbls[:10]

In [None]:
# Attach the ground truth labels to the predictions (in place).
prd["label"] = lbls

In [None]:
# Last six rows, restricted to the columns relevant for evaluation.
prd.loc[:, ['level', 'prediction', 'confidence', 'label']].tail(n=6)

In [None]:
# Cache guard: write the labelled predictions once, then always reload them
# from disk. This keeps the cell working on a fresh kernel even when the file
# has not been produced by an earlier session.
if not os.path.exists(prd_lbl_path):
    prd.to_csv(prd_lbl_path, index=False)
prd = pd.read_csv(prd_lbl_path)

Let's get some metrics!

In [None]:
def get_accuracy(df, threshold=0.5):
    """Print and return the instance-level accuracy at a confidence threshold.

    Every 3 consecutive rows of ``df`` (by index) form one instance. For each
    instance, the row with the lowest ``level`` whose ``confidence`` reaches
    ``threshold`` is the prediction; it counts as correct when its
    ``prediction`` equals its ``label``. Instances where no level reaches the
    threshold count as incorrect.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``level``, ``confidence``, ``prediction`` and
        ``label``. An ``instance_id`` column (``index // 3``) is added to it
        in place.
    threshold : float, default 0.5
        Minimum confidence for a level to be used as the prediction.

    Returns
    -------
    float
        Fraction of correct instances; NaN when ``df`` has no rows.
    """
    if len(df) == 0:
        # Nothing to evaluate: avoid dividing by zero instances.
        print("Accuracy is undefined: no predictions to evaluate.")
        return float('nan')

    # Add an instance ID for grouping every 3 rows as one instance
    df['instance_id'] = df.index // 3

    total_instances = df['instance_id'].nunique()
    correct_count = 0

    # Group by each instance
    for _, group in df.groupby('instance_id'):
        ranked = group.sort_values('level')  # Ensure levels are sorted from 0 to 2
        confident = ranked[ranked['confidence'] >= threshold]
        if confident.empty:
            continue  # No level passed threshold: considered incorrect
        chosen = confident.iloc[0]  # Only consider the *lowest* confident level
        correct_count += int(chosen['prediction'] == chosen['label'])

    accuracy = correct_count / total_instances
    print(f"Accuracy at confidence threshold {threshold}: {accuracy:.2%}")
    return accuracy
# Baseline accuracy on the labelled predictions.
acc = get_accuracy(df=prd)

🔹 1. Coverage

Definition: Proportion of instances where the model made any prediction (i.e., had confidence ≥ threshold at some level).

Rationale: Measures how often the model is confident enough to make a prediction.

Formula:

coverage = num_instances_with_prediction / total_instances

🔹 2. Correct @ Level

Definition: Accuracy at each level (0, 1, 2), conditional on that level being the one used for the prediction.

Rationale: Helps diagnose at which levels the model tends to be more or less reliable.

🔹 3. Average Prediction Level

Definition: The average level (0 being lowest) at which a prediction was made.

Rationale: Reflects how "deep" the model usually goes before being confident enough — useful for understanding how fine-grained predictions are.

Lower is better (if lower levels are more specific and desirable).

🔹 4. Threshold-based Precision/Recall (Hierarchical Precision/Recall)

For multi-level classification, define:

Hierarchical Precision: Did the model predict the correct label at any level above the threshold?

Hierarchical Recall: Out of all levels, how many did it get right when confidence allowed a prediction?

Can be defined flexibly based on your use case (e.g., allowing partial credit for parent-level matches).

🔹 5. No-Prediction Rate

Definition: Proportion of instances where no prediction was made due to all confidence values being below the threshold.

Helps quantify abstention behavior.

🔹 6. Mean Confidence of Correct vs Incorrect Predictions

Helps to see whether confidence is calibrated — i.e., are confident predictions actually more likely to be correct?

🔹 7. Hierarchical Distance (Optional)

If you have a tree/graph structure of the labels, compute the distance between predicted and true labels in the hierarchy.

This allows "almost correct" predictions to be graded more gracefully than flat accuracy.

In [None]:
import pandas as pd
import numpy as np

# Core evaluation function to identify predictions above threshold per instance
def evaluate_predictions(df, threshold=0.5, merge=True):
    """
    Returns a summary dataframe with one row per instance:
    - prediction_level: level where prediction was made (lowest with confidence >= threshold),
      NaN when no level reaches the threshold
    - correct: whether the prediction was correct (1/0; 0 when no prediction was made)
    - prediction_made: whether any prediction was made (1/0)

    Parameters:
    - df: predictions with columns 'level', 'confidence', 'prediction' and 'label'.
      If it has no 'instance_id' column, one is added in place as ``index // 3``
      (every 3 consecutive rows form one instance).
    - threshold: minimum confidence for a level to be used as the prediction.
    - merge: when True, the summary columns are merged back onto ``df`` on
      'instance_id', so the result has one row per row of ``df`` instead of
      one row per instance.
    """
    if 'instance_id' not in df.columns:
        df['instance_id'] = df.index // 3

    # Fixed column list: an empty input still yields a well-formed summary.
    summary_columns = ['instance_id', 'prediction_level', 'correct', 'prediction_made']
    records = []

    for instance_id, group in df.groupby('instance_id'):
        for _, row in group.sort_values('level').iterrows():
            if row['confidence'] >= threshold:
                # Lowest level that is confident enough: this is the prediction.
                records.append({
                    'instance_id': instance_id,
                    'prediction_level': row['level'],
                    'correct': int(row['prediction'] == row['label']),
                    'prediction_made': 1
                })
                break
        else:
            # No level reached the threshold: the model abstained.
            records.append({
                'instance_id': instance_id,
                'prediction_level': np.nan,
                'correct': 0,
                'prediction_made': 0
            })

    summary = pd.DataFrame(records, columns=summary_columns)
    if merge:
        summary = df.merge(summary, on='instance_id')
    return summary

# Accuracy
def accuracy(summary_df):
    """Return the mean of the 'correct' column of ``summary_df``."""
    correct_flags = summary_df['correct']
    return correct_flags.mean()

# Coverage
def coverage(summary_df):
    """Return the mean of the 'prediction_made' column of ``summary_df``."""
    made_flags = summary_df['prediction_made']
    return made_flags.mean()

# Coverage per level
def coverage_per_level(summary_df):
    """Return the share of rows at each 'prediction_level' (NaN kept as its own entry), sorted by level."""
    level_counts = summary_df['prediction_level'].value_counts(dropna=False)
    ordered_counts = level_counts.sort_index()
    return ordered_counts / len(summary_df)

# Correct @ Level
def correct_at_each_level(summary_df):
    """Return ``{prediction_level: mean of 'correct'}`` for the rows where a prediction was made.

    Only rows whose 'prediction_level' is NaN (no prediction) are excluded.
    Missing values in any *other* column must not remove a row, which is what a
    bare ``dropna()`` did on a summary merged with the prediction columns.
    """
    predicted = summary_df.dropna(subset=['prediction_level'])
    level_accuracy = predicted.groupby('prediction_level')['correct'].mean()
    return level_accuracy.to_dict()

# Average Prediction Level
def average_prediction_level(summary_df):
    """Return the mean of the 'prediction_level' column of ``summary_df``."""
    levels = summary_df['prediction_level']
    return levels.mean()

# No Prediction Rate
def no_prediction_rate(summary_df):
    """Return the complement of ``coverage(summary_df)``."""
    covered = coverage(summary_df)
    return 1 - covered

# Mean Confidence of Correct vs Incorrect Predictions
def confidence_stats(summary_df):
    """Return the mean 'confidence' over the rows of incorrect (0) and correct (1) instances.

    For each outcome, the instances having at least one row with that value of
    'correct' are selected, and the mean is taken over all rows of those
    instances. The keys are 'mean_confidence_correct_0' and
    'mean_confidence_correct_1'.
    """
    stats = {}
    for outcome in (0, 1):
        has_outcome = summary_df['correct'] == outcome
        matching_ids = summary_df.loc[has_outcome, 'instance_id']
        in_matching_instance = summary_df['instance_id'].isin(matching_ids)
        stats[f'mean_confidence_correct_{outcome}'] = summary_df.loc[in_matching_instance, 'confidence'].mean()
    return stats

# Run all metrics in one call
def evaluate_all_metrics(summary):
    """Return a dict gathering every metric computed from ``summary``."""
    metrics = {}
    metrics['accuracy'] = accuracy(summary)
    metrics['coverage'] = coverage(summary)
    metrics['coverage_per_level'] = coverage_per_level(summary)
    metrics['average_prediction_level'] = average_prediction_level(summary)
    metrics['correct_at_each_level'] = correct_at_each_level(summary)
    # Adds the two mean_confidence_correct_* entries.
    metrics.update(confidence_stats(summary))
    return metrics

In [None]:
# Per-instance outcome merged back onto every prediction row.
summary = evaluate_predictions(prd, threshold=0.5, merge=True)

In [None]:
# First rows of the merged summary.
summary.head(n=5)

In [None]:
# Save the first six summary rows as a small sample file.
summary.iloc[:6].to_csv(
    "/home/george/codes/lepinet/data/flemming/preds_04-lepi-prod_model1_label-sample.csv",
    index=False,
)

In [None]:
# All metrics at once.
evaluate_all_metrics(summary=summary)

Show some predictions

In [None]:
def show_image_examples(summary_df, n=5, random_state=None):
    """Show up to ``n`` random example images for three kinds of outcome.

    The categories are: correct predictions made at level 0, incorrect level-0
    predictions with confidence above 0.9, and instances where no prediction
    was made.

    Parameters
    ----------
    summary_df : pandas.DataFrame
        Summary merged with the prediction rows: it needs the columns
        'instance_id', 'level', 'filename', 'prediction', 'label',
        'confidence', 'correct', 'prediction_level' and 'prediction_made'.
    n : int, default 5
        Maximum number of examples per category.
    random_state : optional
        Passed to ``Series.sample`` to make the selection reproducible.
    """
    def get_instance_ids(filter_fn):
        subset = summary_df[filter_fn(summary_df)]
        # The merged summary holds several rows per instance: deduplicate so
        # the same image is never sampled twice.
        unique_ids = subset['instance_id'].drop_duplicates()
        return unique_ids.sample(n=min(n, len(unique_ids)), random_state=random_state).values

    categories = [
        ("✅ Good prediction (correct at level 0)", lambda s: (s['correct'] == 1) & (s['prediction_level'] == 0)),
        # `level == 0` makes the confidence test apply to the row of the level-0
        # prediction itself, not to another level of the same instance.
        ("❌ Incorrect prediction despite high confidence (> 0.9)", lambda s: (s['correct'] == 0) & (s['prediction_level'] == 0) & (s['level'] == 0) & (s['confidence'] > 0.9)),
        ("🤔 Model abstained (no confidence ≥ threshold)", lambda s: (s['prediction_made'] == 0)),
    ]

    for title, filter_fn in categories:
        instance_ids = get_instance_ids(filter_fn)
        if len(instance_ids) == 0:
            print(f"\n--- {title} ---\nNo examples found.\n")
            continue

        print(f"\n--- {title} ({len(instance_ids)} example(s)) ---")
        fig, axes = plt.subplots(1, len(instance_ids), figsize=(4 * len(instance_ids), 4))
        if len(instance_ids) == 1:
            axes = [axes]  # plt.subplots returns a bare Axes for a single panel

        for ax, instance_id in zip(axes, instance_ids):
            ax.axis('off')
            instance_rows = summary_df[summary_df['instance_id'] == instance_id].sort_values('level')
            first_row = instance_rows.iloc[0]  # lowest-level row of the instance
            img_path = first_row['filename']
            if not os.path.exists(img_path):
                print(f"Image not found: {img_path}")
                continue

            # Context manager: the image file is closed once it has been drawn.
            with Image.open(img_path) as img:
                ax.imshow(img)

            pred_level = first_row['prediction_level']
            correct = first_row['correct']
            pred = first_row['prediction']
            label = first_row['label']
            ax.set_title(f"Level: {pred_level}\nCorrect: {bool(correct)}\nPrediction: {pred}\nLabel: {label}")

        plt.tight_layout()
        plt.show()

In [None]:
# Five examples per outcome category.
show_image_examples(summary_df=summary, n=5)

In [None]:
def display_prediction_and_samples(input_image_path, predicted_class, true_class, dataset_root, n_samples=6):
    """
    Displays:
    1. The input image
    2. n_samples images from the predicted class
    3. n_samples images from the true class

    Parameters:
    - input_image_path (str): Path to the input image.
    - predicted_class (str): Predicted class name.
    - true_class (str): Ground truth class name.
    - dataset_root (str): Root directory containing class subfolders.
    - n_samples (int): Number of sample images to show for each class.

    Raises:
    - ValueError: if a class folder is missing under dataset_root or contains
      no image file.
    """

    def get_sample_images(class_name):
        """Return n_samples image paths picked at random from the class folder."""
        class_dir = os.path.join(dataset_root, class_name)
        if not os.path.isdir(class_dir):
            raise ValueError(f"Class directory '{class_dir}' not found.")
        image_files = [f for f in os.listdir(class_dir)
                       if os.path.isfile(os.path.join(class_dir, f)) and
                       f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff'))]
        if not image_files:
            raise ValueError(f"No image files found in class directory '{class_dir}'.")
        if len(image_files) < n_samples:
            # Too few images: draw with replacement so n_samples are still returned.
            selected = random.choices(image_files, k=n_samples)
        else:
            selected = random.sample(image_files, n_samples)
        return [os.path.join(class_dir, f) for f in selected]

    def plot_images(image_paths, titles, title_prefix):
        """Plot the images on a grid of at most 5 columns, one title per image."""
        total = len(image_paths)
        if total == 0:
            # Nothing to draw (e.g. n_samples=0): skip instead of dividing by zero columns.
            return
        cols = min(total, 5)
        rows = math.ceil(total / cols)
        plt.figure(figsize=(3.5 * cols, 3.5 * rows))
        for i, (img_path, title) in enumerate(zip(image_paths, titles)):
            plt.subplot(rows, cols, i + 1)
            # Context manager: the image file is closed once it has been drawn.
            with Image.open(img_path) as img:
                plt.imshow(img)
            plt.axis('off')
            plt.title(f"{title_prefix}: {title}", fontsize=9)
        plt.tight_layout()
        plt.show()

    # 1. Input image (titled with the name of its parent folder)
    plot_images([input_image_path], [os.path.basename(os.path.dirname(input_image_path))], "Input")

    # 2. Predicted class samples
    pred_images = get_sample_images(predicted_class)
    plot_images(pred_images, [predicted_class] * n_samples, "Predicted")

    # 3. True class samples
    true_images = get_sample_images(true_class)
    plot_images(true_images, [true_class] * n_samples, "True")

In [None]:
def filter_fn(s):
    """Mask of the level-0 rows of incorrect level-0 predictions with confidence above 0.7."""
    is_wrong = s['correct'] == 0
    predicted_at_level_0 = s['prediction_level'] == 0
    is_confident = s['confidence'] > 0.7
    is_level_0_row = s['level'] == 0
    return is_wrong & predicted_at_level_0 & is_confident & is_level_0_row

subset = summary[filter_fn(summary)]

In [None]:
# Pick one confident mistake at random and compare it with samples of both classes.
row = subset.sample(n=1).iloc[0]

print(row)

display_prediction_and_samples(
    row['filename'],
    str(row['prediction']),
    str(row['label']),
    "/home/george/codes/lepinet/data/lepi/images",
    n_samples=10,
)

## Version 2

In [41]:
# Version 2 input: predictions CSV for the resized images (raw string, so backslashes stay literal).
prd_path = Path(
    r"C:\Users\au761367\OneDrive - Aarhus universitet\Codes\python\lepinet\data\preds\flemming_resized.csv"
)
# lbl_path = "/home/george/codes/lepinet/data/flemming/normalized.csv"
# prd_lbl_path = "/home/george/codes/lepinet/data/flemming/preds_04-lepi-prod_model1_label.csv" 
# img_path = "/home/george/codes/lepinet/data/flemming/images"
# name2id_path = "/home/george/codes/lepinet/data/flemming/name2id.csv"
# hier_path = "/home/george/codes/lepinet/data/lepi/hierarchy_all.json"

In [None]:
# read the raw predictions
raw_predictions = pd.read_csv(prd_path)

df = raw_predictions.assign(
    # every 3 consecutive rows belong to one instance
    instance_id=np.repeat(np.arange(len(raw_predictions) // 3), 3),
    # swap level 0 and level 2
    level=raw_predictions['level'].replace({0: 2, 2: 0}),
    # constant confidence threshold
    threshold=0.5,
    # the label is the name of the folder containing the file
    label=raw_predictions['filename'].map(lambda x: Path(x).parts[-2]),
)

# save next to the input, with "_reordered" appended to the file stem
reordered_path = prd_path.with_name(f"{prd_path.stem}_reordered{prd_path.suffix}")
df.to_csv(reordered_path, index=False)