In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import os
from PIL import Image
from fastai.vision.all import *

In [None]:
# Root of the "mini" dataset. This is a machine-specific absolute path: change it here
# (and only here) when running the notebook on another machine.
root_path = Path("/home/george/codes/lepinet/data/mini")
# Everything else is derived from the root so the four locations cannot drift apart.
parquet_path = root_path / "0013397-241007104925546_processing_metadata_postprocessed.parquet"
images_path = root_path / "images"
export_path = root_path / "models"

## First model training

In [None]:
# Load the metadata table stored at parquet_path into a DataFrame.
df=pd.read_parquet(parquet_path)

In [None]:
# Number of rows in the metadata table.
df.shape[0]

In [None]:
# Take the first record as a sample and list the fields it carries.
row = df.iloc[0]
row.index

In [None]:
# Images are laid out as <images_path>/<speciesKey>/<filename>.
# speciesKey is cast to str so the join also works when the key is stored as a number
# (a Path cannot be joined with an int); prepare_df builds its paths the same way.
image_path = images_path / str(row["speciesKey"]) / row["filename"]

In [None]:
# Show the resolved sample path together with whether a regular file exists there.
image_path, os.path.isfile(image_path)

In [None]:
# Open the sample image with PIL; the resulting object is kept in `image`.
image = Image.open(image_path)

In [None]:
# Row counts by `set` label: both labels combined, 'test_ood' alone, '0' alone.
tuple(
    int(df['set'].isin(labels).sum())
    for labels in (['test_ood', '0'], ['test_ood'], ['0'])
)

In [None]:
def prepare_df(df, remove_in=None, keep_in=None):
    """Turn the raw metadata table into the three-column frame used for training.

    The input frame is never modified; a new frame is returned.

    Args:
        df: DataFrame with at least the columns 'set', 'familyKey', 'genusKey',
            'speciesKey' and 'filename'.
        remove_in: optional collection of 'set' values; rows whose 'set' is in it
            are dropped. None or empty means "drop nothing".
        keep_in: optional collection of 'set' values; when non-empty, only rows
            whose 'set' is in it are kept (applied after `remove_in`).

    Returns:
        DataFrame indexed like the surviving input rows, with columns
        - 'image_path': Path '<speciesKey>/<filename>' (relative),
        - 'hierarchy_labels': 'familyKey genusKey speciesKey' joined by spaces,
        - 'is_valid': True where 'set' equals '0'.
    """
    # Row filtering on the 'set' column; each filter is skipped when not requested.
    if remove_in is not None and len(remove_in) > 0:
        df = df[~df['set'].isin(remove_in)]
    if keep_in is not None and len(keep_in) > 0:
        df = df[df['set'].isin(keep_in)]

    # Taxonomic levels, from coarsest to finest, that make up the multi-label target.
    hierarchy_levels = ["familyKey", "genusKey", "speciesKey"]

    # Column-wise iteration instead of DataFrame.apply(axis=1): much faster, and it
    # also works when the filters leave no rows at all.
    image_paths = [
        Path(str(species)) / filename
        for species, filename in zip(df['speciesKey'], df['filename'])
    ]
    hierarchy_labels = [
        ' '.join(map(str, keys))
        for keys in zip(*(df[level] for level in hierarchy_levels))
    ]

    # assign() works on a copy, so neither the caller's frame nor a filtered slice is
    # written to (avoids SettingWithCopyWarning and hidden mutation).
    prepared = df.assign(
        image_path=image_paths,
        hierarchy_labels=hierarchy_labels,
        is_valid=df['set'] == '0',
    )
    # Keep only the columns needed for ImageDataLoaders
    return prepared[['image_path', 'hierarchy_labels', 'is_valid']]

# Drop the 'test_ood' rows and build the training frame.
# NOTE: this rebinds `df` to the three-column frame returned by prepare_df, which no
# longer has the 'set' column that prepare_df reads. Re-running this cell without
# reloading the parquet file first will therefore fail.
df=prepare_df(df.copy(), remove_in=['test_ood'])

In [None]:
# Preview the prepared frame (columns: image_path, hierarchy_labels, is_valid).
df.head()

In [None]:
# Build the DataLoaders directly from the prepared frame. The image paths in the frame
# are relative ('<speciesKey>/<filename>'), so images_path is passed alongside it.
# No file/label column is named, so from_df's defaults are relied on
# (presumably column 0 = file path, column 1 = labels — TODO confirm in the fastai docs).
dls = ImageDataLoaders.from_df(
    df,
    images_path,
    valid_col='is_valid',  # boolean column created by prepare_df
    label_delim=' ',  # hierarchy_labels holds several space-separated labels per row
    item_tfms=Resize(460),
    batch_tfms=aug_transforms(size=224))

In [None]:
# Show a sample batch from the DataLoaders to check images and labels.
dls.show_batch()

In [None]:
# Two multi-label F1 metrics that differ only in their averaging mode, each given an
# explicit display name.
f1_macro, f1_samples = (
    F1ScoreMulti(thresh=0.5, average=averaging) for averaging in ('macro', 'samples')
)
f1_macro.name, f1_samples.name = 'F1(macro)', 'F1(samples)'

# Multi-label accuracy uses the same 0.5 threshold as the F1 metrics.
multi_accuracy = partial(accuracy_multi, thresh=0.5)
learn = vision_learner(dls, resnet50, metrics=[multi_accuracy, f1_macro, f1_samples])

In [None]:
# Run the learner's learning-rate finder and keep its result in `res`.
res=learn.lr_find()

In [None]:
# Inspect the `valley` value reported by the learning-rate finder.
res.valley

In [None]:
# Fine-tune the learner (presumably 10 epochs at a base learning rate of 2e-2 — confirm
# the argument order against the fastai docs).
# NOTE(review): 2e-2 is a literal and is not taken from the lr_find() result
# (`res.valley`) computed earlier — confirm this is intentional.
learn.fine_tune(10, 2e-2)

In [None]:
# Show a sample of results from the fine-tuned learner.
learn.show_results()

In [None]:
# Make sure the export directory exists (creating parents if needed), then export
# the learner into it.
export_path.mkdir(parents=True, exist_ok=True)
model_path = export_path / "00_lepi_mini_model1"
learn.export(model_path)

In [None]:
!ls -alh /home/george/codes/lepinet/data/mini/models

In [None]:
# model_path is defined again here, so this cell only needs export_path to be set.
model_path = export_path / "00_lepi_mini_model1"

# NOTE: this rebinds `learn`; the learner trained earlier is replaced by the one
# loaded from disk.
learn = load_learner(model_path)

### I need to specify the vocab of the MultiCategoryBlock

So I need to go one level down in fastai's layered API and build the `DataBlock` myself instead of using `ImageDataLoaders.from_df`.

In [None]:
# Let's redefine the dataloader
# The label vocabulary is taken from the reloaded learner's DataLoaders and passed
# explicitly to MultiCategoryBlock, so it is fixed here rather than derived from the
# data this DataBlock is later applied to.

datablock = DataBlock(
    blocks=(ImageBlock, MultiCategoryBlock(vocab=list(learn.dls.vocab))),
    splitter=ColSplitter(),  # no column named — presumably defaults to 'is_valid'; TODO confirm
    get_x=ColReader(0, pref=images_path),  # column 0: image path (relative in prepare_df's output), read with images_path as prefix
    get_y=ColReader(1, label_delim=' '),  # column 1: space-separated labels (as in prepare_df's output)
    item_tfms=Resize(460),
    batch_tfms=aug_transforms(size=224)
)