In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from PIL import Image
from fastai.vision.all import *

# Kept after the star import so these names win over any fastai re-export.
from inspect import getmembers, getmembers_static
from pprint import pprint
from sklearn.metrics import f1_score

In [None]:
# Single dataset root; every other location is derived from it so that moving the
# data only requires editing this one line.
root_path = Path("/home/george/codes/lepinet/data/mini")
parquet_path = root_path / "0013397-241007104925546_processing_metadata_postprocessed.parquet"
images_path = root_path / "images"
export_path = root_path / "models"

In [None]:
# Load the metadata table stored at `parquet_path` into a DataFrame.
df=pd.read_parquet(parquet_path)

In [None]:
# Location of the exported learner inside the models directory.
model_path = export_path / "00_lepi_mini_model2"

learn = load_learner(model_path)
# Inference only: put the model in eval mode and move it to the GPU when one is
# available, falling back to the CPU instead of raising on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
learn.model.eval().to(device)

In [None]:
def prepare_df(df, remove_in=None, keep_in=None):
    """Build the table of image paths, labels and validation flags for the data loaders.

    Args:
        df: Metadata frame providing the columns ``set``, ``speciesKey``,
            ``filename``, ``familyKey`` and ``genusKey``. It is never modified.
        remove_in: Values of ``set`` whose rows are dropped. ``None`` or an empty
            collection disables this filter.
        keep_in: Values of ``set`` whose rows are kept (applied after
            ``remove_in``). ``None`` or an empty collection disables this filter.

    Returns:
        A new DataFrame, indexed like the selected rows, with the columns
        ``image_path`` (``Path(str(speciesKey)) / filename``),
        ``hierarchy_labels`` (family, genus and species keys joined by spaces) and
        ``is_valid`` (True where ``set == '0'``).
    """
    # Taxonomic levels, from coarsest to finest, that make up the label string.
    hierarchy_levels = ["familyKey", "genusKey", "speciesKey"]

    if remove_in is not None and len(remove_in) > 0:
        df = df[~df['set'].isin(remove_in)]
    if keep_in is not None and len(keep_in) > 0:
        df = df[df['set'].isin(keep_in)]

    # Plain comprehensions instead of row-wise ``apply``: they also work when the
    # selection is empty and never write into the caller's frame (or a slice of it).
    image_paths = [
        Path(str(species)) / filename
        for species, filename in zip(df['speciesKey'], df['filename'])
    ]
    hierarchy_labels = [
        ' '.join(map(str, keys))
        for keys in zip(*(df[level] for level in hierarchy_levels))
    ]

    # Assemble a fresh frame holding only the columns needed for ImageDataLoaders.
    prepared = pd.DataFrame(index=df.index)
    prepared['image_path'] = image_paths
    prepared['hierarchy_labels'] = hierarchy_labels
    prepared['is_valid'] = (df['set'] == '0').to_numpy()
    return prepared

# Build every split from the metadata frame loaded earlier instead of re-reading the
# parquet file four times. Each call receives its own copy so the splits stay independent.
df_val = prepare_df(df.copy(), keep_in=["0"])  # rows whose set is "0" only
df_train = prepare_df(df.copy(), remove_in=["test_ood"])  # everything except "test_ood" rows
df_ood = prepare_df(df.copy(), keep_in=["test_ood"])  # "test_ood" rows only
df_all = prepare_df(df.copy())  # no filtering

## Result analysis

In [None]:
# Preview the first rows of the validation table.
df_val.head()

In [None]:
# Sanity check: run the learner on the first image of the validation table.
first_val_image = images_path / df_val["image_path"].iloc[0]
pred = learn.predict(first_val_image)
# The prediction unpacks into three parts; the first one is displayed with its type.
pred_classes, pred_one_hot, pred_proba = pred
pred_classes, type(pred_classes)

In [None]:
# Ground-truth labels of the same image: the label string split into a list of tokens.
targs=df_val["hierarchy_labels"].iloc[0].split(" ")
targs

In [None]:
# Macro-averaged multi-label F1 metric with a 0.5 decision threshold.
f1_macro = F1ScoreMulti(thresh=0.5, average='macro')
# NOTE(review): `pred_classes` comes straight from `learn.predict` and `targs` is a list
# of label strings; confirm the metric accepts these directly rather than encoded tensors.
f1_macro(pred_classes, targs)

In [None]:
# Transforms: resize each item to 460, then augment and crop batches to 224.
item_tfms = Resize(460)
batch_tfms = aug_transforms(size=224)

# Data loaders built from the training table: `is_valid` marks the validation rows and
# each label string is split on spaces into several labels.
dls = ImageDataLoaders.from_df(
    df_train,
    path=images_path,
    valid_col='is_valid',
    label_delim=' ',
    item_tfms=item_tfms,
    batch_tfms=batch_tfms,
)

In [None]:
# List the members of the DataLoaders object (static lookup).
getmembers_static(dls)

In [None]:
# Same inspection for the first loader held by `dls`.
getmembers_static(dls.loaders[0])

In [None]:
# Show the concrete class of the validation loader.
type(dls.valid)

In [None]:
# Check that `one_batch` on the validation loader returns the same batch every time
# (i.e. it is not randomly regenerated): the two printed sums should be identical.
for i in range(2):
    batch=dls.valid.one_batch()
    pprint(batch[0][0].sum())

In [None]:
# Number of elements in the batch fetched in the previous cell.
len(batch)

In [None]:
# Split the batch into its inputs and targets and show their shapes.
batch_in, batch_targs = batch
batch_in.shape, batch_targs.shape

In [None]:
# Run the loaded learner's validation on the validation loader built above.
learn.validate(dl=dls.valid)


* F1 macro on the validation set
* F1 macro on the test set

In [None]:
# Sanity check of scikit-learn's macro F1 on a one-row example where both arrays are equal.
f1_score(np.array([[0,1]]), np.array([[0,1]]), average='macro')

### Evaluation on Out-of-distribution species

The evaluation function must be able to deal with two different vocabularies:
one for the test set and one for the training set.

In [None]:
def define_vocab(df):
    """Return the sorted list of distinct label tokens found in ``df``.

    Every entry of the ``hierarchy_labels`` column is split on whitespace; the
    resulting tokens are de-duplicated and returned in sorted order.
    """
    tokens = [token for labels in df["hierarchy_labels"] for token in labels.split()]
    distinct_tokens = np.unique(tokens).tolist()
    distinct_tokens.sort()
    return distinct_tokens

# The vocabulary rebuilt from the training table must equal the one stored in the learner.
test_eq(define_vocab(df_train), learn.dls.vocab)

In [None]:
# Vocabulary sizes for the validation, training, out-of-distribution and full tables.
len(define_vocab(df_val)), len(define_vocab(df_train)), len(define_vocab(df_ood)), len(define_vocab(df_all))

In [None]:
# Display the `o2i` attribute of the learner's vocabulary.
learn.dls.vocab.o2i

In [None]:
# TODO: unfinished sketch of a callback for out-of-distribution evaluation.
# class OODCallback(Callback):
#     run_valid = True
#     def after_pred(self,)

In [None]:
# Predict on the first image of the unfiltered table.
learn.predict(images_path/df_all["image_path"].iloc[0])

In [None]:
# `test_dl` needs the items to load; calling it without any raises a TypeError.
# Build the test loader from the out-of-distribution images, using full image paths
# like the single-image predictions above.
learn.dls.test_dl([images_path / rel_path for rel_path in df_ood["image_path"]])

In [None]:
# Display the unfiltered table.
df_all