In [9]:
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve, average_precision_score
import matplotlib.pyplot as plt
import numpy as np
import json
from pathlib import Path

In [16]:
# Score the k-shot prediction files under `root` and write one CSV row per
# (species, shot size) to data/kshot_machine_labeled.csv.
root = 'figures/kshot_full_machine_label'
species_id = [24, 177, 193, 147, 195]
# These names are used verbatim (spaces -> underscores) to build the prediction
# file paths below and are written to the CSV, so spelling and casing matter.
species_map = {
    24: "Azadirachta indica",
    177: "Vachellia nilotica",
    193: "Ailanthus excelsa",
    147: "Prosopis cineraria",
    195: "prosopis juliflora"
}

# Per-species upper bound on k: shot sizes above this value are skipped.
num_instances = {
    24: 15000,
    177: 13000,
    193: 2700,
    147: 5300,
    195: 3100
}

# Shot sizes: 100..900 (step 100), 1000..4500 (step 500), 5000..15000 (step 1000).
# The leading 50000 is larger than every bound in num_instances, so the guard
# in the loop skips it for all species listed here.
shots = [50000]
shots.extend(range(100, 1000, 100))
shots.extend(range(1000, 5000, 500))
shots.extend(range(5000, 16000, 1000))

# The context manager closes the CSV even if a prediction file is missing or a
# metric call raises part-way through the loop.
with open('data/kshot_machine_labeled.csv', 'w') as f:
    f.write('species,shots,ROC AUC,AUPRC,Accuracy,# human labels BIO-CLIP trained on\n')

    # `sid` rather than `id`, so the builtin id() is not shadowed in later cells.
    for sid in species_id:
        s = species_map[sid]
        # Directory/file-name form of the species name, computed once. Keeping
        # it out of the f-string also avoids reusing the outer quote inside
        # `{...}`, which is only valid syntax from Python 3.12 onwards.
        slug = s.replace(' ', '_')

        for k in shots:
            if num_instances[sid] < k:
                continue

            file = f'{root}/{k}shot/{slug}/species_{slug}/{slug}_predictions.csv'
            s2 = pd.read_csv(file)

            y_true = s2['true_label']
            y_score = s2['probability']
            roc_auc = roc_auc_score(y_true, y_score)
            auprc = average_precision_score(y_true, y_score)
            # Fraction of rows whose predicted label equals the true label.
            acc = np.sum(s2['true_label'] == s2['predicted']) / len(s2['true_label'])

            # The last column is the constant 10000 for every row.
            f.write(f'{s},{k},{roc_auc},{auprc},{acc},10000\n')

In [13]:
# Score the prediction files under `root` for each (species, k) pair that
# exists on disk and write one CSV row per pair to data/fig3_data.csv.
root = 'figures/s2_shots_human_data'
# Values of k; each is used as a directory name under `root` and is written to
# the last CSV column ("# human labels BIO-CLIP trained on").
shots = [0, 1000, 5000, 11700]
species_id = [24, 177, 193, 147, 195]
# These names are used verbatim (spaces -> underscores) to build the prediction
# file paths below and are written to the CSV, so spelling and casing matter.
species_map = {
    24: "Azadirachta indica",
    177: "Vachellia nilotica",
    193: "Ailanthus excelsa",
    147: "Prosopis cineraria",
    195: "prosopis juliflora"
}

# The context manager closes the CSV even if reading a prediction file or
# computing a metric raises part-way through the loop.
with open('data/fig3_data.csv', 'w') as f:
    f.write('species,ROC AUC,AUPRC,Accuracy,# human labels BIO-CLIP trained on\n')

    # `sid` rather than `id`, so the builtin id() is not shadowed in later cells.
    for sid in species_id:
        s = species_map[sid]
        # Directory/file-name form of the species name, computed once. Keeping
        # it out of the f-string also avoids reusing the outer quote inside
        # `{...}`, which is only valid syntax from Python 3.12 onwards.
        slug = s.replace(' ', '_')

        for k in shots:
            file = Path(root) / str(k) / slug / f'species_{slug}' / f'{slug}_predictions.csv'
            # Combinations with no predictions file are skipped silently.
            if not file.exists():
                continue

            s2 = pd.read_csv(file)

            y_true = s2['true_label']
            y_score = s2['probability']
            roc_auc = roc_auc_score(y_true, y_score)
            auprc = average_precision_score(y_true, y_score)
            # Fraction of rows whose predicted label equals the true label.
            acc = np.sum(s2['true_label'] == s2['predicted']) / len(s2['true_label'])

            f.write(f'{s},{roc_auc},{auprc},{acc},{k}\n')