In [1]:
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve, average_precision_score
import matplotlib.pyplot as plt
import numpy as np
import json

In [22]:
# Build the per-species "true frequency" table: for every species listed in
# the mapping file, the share of rows in true_freq.csv labelled with it.
truth = pd.read_csv('true_freq.csv')

# Context manager so the JSON file handle is closed deterministically.
with open('species_vector_mapping.json') as mapping_file:
    species = json.load(mapping_file)['species_order']

# Count every label once instead of re-filtering the whole frame per species.
label_counts = truth['species_name'].value_counts()
n_labels = len(truth)

# (frequency, name) pairs, most frequent first. Ties on frequency fall back to
# reverse-alphabetical species name, because whole tuples are compared.
s = sorted(
    ((int(label_counts.get(sp, 0)) / n_labels, sp) for sp in species),
    reverse=True,
)

df = pd.DataFrame({
    'Species': [name for _, name in s],
    'True Frequency': [freq for freq, _ in s],
})

                      Species  True Frequency
0          Azadirachta indica        0.333106
1          Vachellia nilotica        0.227304
2          Prosopis cineraria        0.104437
3           Ailanthus excelsa        0.095563
4          prosopis juliflora        0.044369
..                        ...             ...
205             Albizia amara        0.000000
206         Alangium chinense        0.000000
207          Aglaia tomentosa        0.000000
208      Adenanthera pavonina        0.000000
209  Acrocarpus fraxinifolius        0.000000

[210 rows x 2 columns]


In [24]:
# For each k-shot run, add a column to `df` holding the fraction of that run's
# eval predictions assigned to each species in df['Species'].
# NOTE(review): the original comment here said "load s2 data", but the files
# are read from a `gsv/` directory — confirm which data source is meant.
root = 'figures/fig2_bioclip'
kshots = [0, 1000, 5000, 11700]

for k in kshots:
    file = f'{root}/subsample_{k}/bio_clip/gsv/100000000000000shots/seed1/predictions/eval_predictions.csv'
    gsv = pd.read_csv(file)
    if len(gsv) == 0:
        # Fail with the offending path rather than a bare ZeroDivisionError.
        raise ValueError(f'no predictions found in {file}')

    # Count each predicted class once; species that were never predicted are
    # absent from the counts and therefore map to NaN -> 0.
    predicted_counts = gsv['predicted_class'].value_counts()
    df[f'{k} Shot Frequency'] = (
        df['Species'].map(predicted_counts).fillna(0) / len(gsv)
    )

In [27]:
# Persist the species-frequency table without the integer row index.
# NOTE(review): a later cell opens this same path with mode 'w' and writes a
# differently-shaped table (threshold sweep), so running the notebook top to
# bottom overwrites this file — confirm whether the two outputs should share
# a filename.
df.to_csv(path_or_buf='data/annotation_efficiency.csv', index=False)

In [33]:
# Reload the ground-truth labels and the species mapping, then build a
# filename -> species-name lookup for the threshold sweep.
truth = pd.read_csv('true_freq.csv')

# Context manager so the JSON file handle is closed deterministically.
with open('species_vector_mapping.json') as mapping_file:
    vector_data = json.load(mapping_file)

print(truth['image_filename'])

# Pair the two columns row by row; if a filename appears more than once, the
# last row wins (same as the dict-comprehension form).
actual = dict(zip(truth['image_filename'], truth['species_name']))

print(actual)
print(len(actual))

0       tree_Pali_Pali_0_10_low_density_far_north_0_20...
1       tree_Pali_Pali_0_72_low_density_near_south_0_2...
2       tree_Alwar_Alwar_0_118_low_density_near_southe...
3       tree_Pali_Pali_0_117_low_density_far_south_0_2...
4       tree_Pali_Pali_0_117_low_density_near_west_0_2...
                              ...                        
1460    tree_Sikar_Sikar_296_1_low_density_near_northe...
1461    tree_Sikar_Sikar_305_1_low_density_near_north_...
1462    tree_Sikar_Sikar_306_1_low_density_near_north_...
1463    tree_Sikar_Sikar_313_0_low_density_near_south_...
1464    tree_Sikar_Sikar_393_0_low_density_near_north_...
Name: image_filename, Length: 1465, dtype: object
{'tree_Pali_Pali_0_10_low_density_far_north_0_20250526_055530_617380.jpg': 'Aegle Marmelos', 'tree_Pali_Pali_0_72_low_density_near_south_0_20250526_055625_853991.jpg': 'Azadirachta indica', 'tree_Alwar_Alwar_0_118_low_density_near_southeast_0_20250526_055706_494863.jpg': 'Azadirachta indica', 'tree_Pali_Pali_0_

In [58]:
# Confidence-threshold sweep: for each k-shot run, each selected species and
# each threshold, count the images whose score for that species is at least
# the threshold, and the fraction of those whose ground-truth label is that
# species. One CSV row is written per (species, k, threshold) with >= 1 image.
kshots = [0, 1000, 5000, 11700]
species_ids = [23, 176, 192, 146, 194]  # indices into vector_data['species_order']

# Keys in the guesses files contain this marker; only the part after it is
# compared against the filenames in `actual`.
path_marker = '/keep_test/'

# NOTE(review): this path is also the target of the earlier
# `df.to_csv('data/annotation_efficiency.csv', ...)` cell, so this cell
# overwrites that table — confirm whether the outputs should be separate files.
with open('data/annotation_efficiency.csv', 'w') as f:
    f.write("species,k-shots,confidence threshold,left after filtering,percent true in filtered\n")

    for k in kshots:
        with open(f'guesses/guesses_{k}.json') as guesses_file:
            guesses = json.load(guesses_file)
        # str.index raises ValueError if a key lacks the marker.
        guesses_cleaned = {
            path[path.index(path_marker) + len(path_marker):]: scores
            for path, scores in guesses.items()
        }

        for species_id in species_ids:
            # Looked up once and kept out of the f-string below: reusing the
            # same quote character inside an f-string only parses on 3.12+.
            species_name = vector_data['species_order'][species_id]

            # (score, label matches) per image, computed once per species
            # instead of once per threshold. Raises KeyError if an image in
            # `actual` has no entry in this guesses file.
            scored = [
                (guesses_cleaned[key][species_id], actual[key] == species_name)
                for key in actual
            ]

            # Thresholds 0.0047 ... 0.0069 in steps of 0.0001.
            for t in range(47, 70):
                thresh = t / 10000
                kept = [is_match for score, is_match in scored if score >= thresh]
                den = len(kept)
                num = sum(kept)

                if den > 0:
                    f.write(f'{species_name},{k},{thresh},{den},{num / den}\n')