In [None]:
import csv
import os
import random
from collections import defaultdict
from os import listdir
from os.path import isdir, join
from pathlib import Path
from pprint import pprint
from PIL import Image

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from mini_trainer import predict
from mini_trainer.builders import BaseBuilder
from mini_trainer.utils.io import ImageLoader
from torch.utils.data import DataLoader
from mini_trainer.utils.logging import BaseResultCollector

In [None]:
# Root image folder; it is passed to split_dataset as the dataset root below.
data_dir = "/home/george/codes/gbifxdl/data/cleaner/images"
# CSV file that receives the fold assignment of every image.
csv_path = "/home/george/codes/gbifxdl/data/cleaner/images/fold.csv"

In [None]:
# List the files of every class folder.
# Only sub-directories are treated as classes: plain files stored next to them
# (e.g. the fold CSV, whose path lies inside `data_dir`) must not show up as
# empty classes in the statistics computed from `filenames`.
filenames = {}
for d in sorted(listdir(data_dir)):
    sub_dir = join(data_dir, d)
    if isdir(sub_dir):
        # Sorted so that the listing is stable from one run to the next.
        filenames[d] = sorted(listdir(sub_dir))
pprint(filenames["1"][:2])

Explore data.

In [None]:
# Number of files per class, printed and drawn as a bar chart.

x = [class_name for class_name in filenames]
height = [len(filenames[class_name]) for class_name in x]

print(x, height)

plt.bar(x, height)

In [None]:
# Size of each class relative to the largest class, as a percentage.

height = list(map(len, filenames.values()))
for count in height:
    ratio = count/max(height)*100
    print(f"Imbalance degree: {ratio}%")

Split the dataset into train/val/test

In [None]:
def hold_out(X, y, split_rate, seed=42):
    """
    Marks a class-balanced fraction of the samples as the test (hold-out) set.

    Samples are grouped by label and, inside each class,
    ``round(n_samples * split_rate)`` of them are drawn at random without
    replacement.

    Args:
        X (list): List of inputs (e.g., filenames). It is not read here; only the labels drive the split.
        y (list): List of labels corresponding to X.
        split_rate (float): Fraction of samples to assign as test per class, in [0, 1].
        seed (int): Seed of the random generator used for the draw. Defaults to 42.

    Returns:
        list: A binary list of length len(y), where 1 indicates a test (hold-out) sample.

    Raises:
        ValueError: If split_rate is outside [0, 1].
    """
    if not 0 <= split_rate <= 1:
        raise ValueError(f"split_rate must be in [0, 1], got {split_rate}")

    # A private generator gives the same draws as seeding the `random` module
    # with `seed`, without resetting the global random stream of the caller.
    rng = random.Random(seed)

    hold_out_mask = [0] * len(y)
    # Group indices by class label.
    class_indices = defaultdict(list)
    for i, label in enumerate(y):
        class_indices[label].append(i)

    # For each class, randomly sample indices for the test set.
    for indices in class_indices.values():
        n_test = int(round(len(indices) * split_rate))
        for idx in rng.sample(indices, n_test):
            hold_out_mask[idx] = 1

    return hold_out_mask

def stratified_kfold(X, y, n_splits, hold_out_mask=None, seed=42):
    """
    Performs stratified k-fold assignment on the dataset.

    For each class, training samples (or all samples if hold_out_mask is None)
    are randomly shuffled and assigned fold numbers (0 to n_splits-1) in a round-robin fashion.
    If hold_out_mask is provided, only samples whose mask value is 0 receive a fold;
    every other sample keeps the fold -1.

    Args:
        X (list): List of inputs. It is not read here; only the labels drive the assignment.
        y (list): List of labels.
        n_splits (int): Number of folds, at least 1.
        hold_out_mask (list, optional): List of 0s and 1s indicating training/test. Defaults to None.
        seed (int): Seed of the random generator used for shuffling. Defaults to 42.

    Returns:
        list: A list of fold assignments (length equal to len(y)). For training samples, the fold is an integer
              in the range [0, n_splits-1]. Test samples get a fold assignment of -1.

    Raises:
        ValueError: If n_splits is smaller than 1, or if hold_out_mask and y differ in length.
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be at least 1, got {n_splits}")
    if hold_out_mask is not None and len(hold_out_mask) != len(y):
        raise ValueError(
            f"hold_out_mask has {len(hold_out_mask)} entries but y has {len(y)}"
        )

    # A private generator gives the same shuffles as seeding the `random` module
    # with `seed`, without resetting the global random stream of the caller.
    rng = random.Random(seed)

    folds = [-1] * len(y)  # default: -1 for test samples
    # Group training indices by class.
    class_indices = defaultdict(list)
    for i, label in enumerate(y):
        if hold_out_mask is None or hold_out_mask[i] == 0:
            class_indices[label].append(i)

    # Assign fold numbers in a round-robin fashion within each class.
    for indices in class_indices.values():
        rng.shuffle(indices)
        for j, idx in enumerate(indices):
            folds[idx] = j % n_splits
    return folds

def save_stratified_kfold_csv(X, y, split_rate, n_splits, csv_filename, seed=42):
    """
    Combines hold-out splitting and stratified k-folding, then writes the results to a CSV file.

    The CSV has a header row and two columns:
      - filename (taken from X)
      - fold (fold number for training samples; -1 for hold-out/test samples)

    The hold-out mask is not written as a separate column: a fold of -1 is what
    identifies a test sample.

    Args:
        X (list): List of filenames (or input identifiers).
        y (list): List of labels.
        split_rate (float): Fraction of samples per class to mark as test.
        n_splits (int): Number of folds for stratified k-folding.
        csv_filename (str): Path to the CSV file to be saved.
        seed (int): Seed forwarded to both the hold-out split and the fold assignment. Defaults to 42.

    Raises:
        ValueError: If X and y do not have the same length.
    """
    # Rows are built by pairing X with the folds; a length mismatch would
    # silently drop or misalign rows, so it is rejected up front.
    if len(X) != len(y):
        raise ValueError(f"X has {len(X)} entries but y has {len(y)}")

    # First, compute the hold-out mask.
    hold_out_mask = hold_out(X, y, split_rate, seed=seed)
    # Next, compute stratified k-fold assignments (training samples get a fold number, test samples get -1).
    folds = stratified_kfold(X, y, n_splits, hold_out_mask, seed=seed)

    # Write the results to a CSV file. The encoding is explicit so that file
    # names with non-ASCII characters are written the same way on every platform.
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['filename', 'fold'])
        writer.writerows(zip(X, folds))

def split_dataset(root_dir, split_rate, n_splits, csv_filename, abs_path=False):
    """
    Processes the dataset stored in a folder structure where each subfolder represents a class.

    It collects image files and their corresponding class labels (the subfolder names),
    performs a balanced hold-out split and stratified k-folding, and saves the splits into a CSV file.

    The resulting CSV contains:
      - filename: the bare file name by default, or the file path built from
        root_dir when abs_path is True
      - fold: the fold number for training samples (0 to n_splits-1) or -1 for test samples.

    Note that bare file names are not prefixed with their class, so two classes
    containing a file with the same name produce identical `filename` entries.

    Args:
        root_dir (str): Path to the root directory of the dataset.
        split_rate (float): Fraction of images per class to mark as test.
        n_splits (int): Number of folds for stratified k-folding.
        csv_filename (str): Path to the CSV file to be saved.
        abs_path (bool): If True, store ``root_dir/class/file`` paths instead of bare
            file names (absolute only when root_dir itself is absolute). Defaults to False.
    """
    X = []
    y = []
    # Each subdirectory in root_dir is assumed to be a class.
    # os.listdir returns entries in arbitrary order; sorting makes the seeded
    # split identical from one run or machine to the next.
    for class_name in sorted(os.listdir(root_dir)):
        class_dir = os.path.join(root_dir, class_name)
        if not os.path.isdir(class_dir):
            continue
        # List all files in the class directory.
        for file in sorted(os.listdir(class_dir)):
            file_path = os.path.join(class_dir, file)
            if os.path.isfile(file_path):
                X.append(file_path if abs_path else file)
                y.append(class_name)

    # Save the stratified k-fold and hold-out splits to CSV.
    save_stratified_kfold_csv(X, y, split_rate, n_splits, csv_filename)
    print(f"CSV with stratified k-fold splits saved to {csv_filename}")

In [None]:
# Hold out 20% of every class as the test set and spread the rest over 5 folds.
split_dataset(root_dir=data_dir, split_rate=0.2, n_splits=5, csv_filename=csv_path)

First try: training on the imbalanced dataset.

In [None]:
# Load the fold assignments stored at csv_path.
df = pd.read_csv(filepath_or_buffer=csv_path)

In [None]:
# Preview the first five rows of the fold table.
df.head(n=5)

Second try: use Asger's mini trainer.

In [None]:
# Name of the lepi_small class folder that holds the most entries.
l = os.listdir("/home/george/codes/gbifxdl/data/lepi_small/images")
al = [
    os.listdir(os.path.join("/home/george/codes/gbifxdl/data/lepi_small/images", name))
    for name in l
]
lal = list(map(len, al))
# Index of the first largest count, then look up the matching folder name.
l[lal.index(max(lal))]

In [None]:
# Collect the prediction arguments through mini_trainer's `predict.cli`.
# NOTE(review): presumably this parses command-line style options — confirm how
# it behaves when called from a notebook kernel.
kwargs = predict.cli(
    description="Mini trainer.",
    models="efficientnet_v2_s",
    class_index="/home/george/codes/gbifxdl/data/cleaner/models/class_index.json",
    weights="/home/george/codes/gbifxdl/data/cleaner/models/efficientnet_v2_s_full_e15.pt",
)

In [None]:
# Run mini_trainer's prediction entry point on the folder of class "5".
predict.main(
    input="/home/george/codes/gbifxdl/data/cleaner/images/5",
    output="/home/george/codes/gbifxdl/data/cleaner/tests",
    class_index="/home/george/codes/gbifxdl/data/cleaner/models/class_index.json",
    model_builder_kwargs=dict(
        model_type="efficientnet_v2_s",
        weights="/home/george/codes/gbifxdl/data/cleaner/models/efficientnet_v2_s_full_e15.pt",
    ),
)

In [None]:
# Run the prediction entry point on every class folder of the image root.
main_path = "/home/george/codes/gbifxdl/data/cleaner/images"
# Keep sub-directories only: the image root also holds plain files (the fold
# CSV path used in this notebook lies inside it), which are not image folders.
# Sorted so that folders are processed in a stable order.
all_folders = sorted(
    path
    for path in (os.path.join(main_path, e) for e in os.listdir(main_path))
    if os.path.isdir(path)
)
# NOTE(review): every folder is sent to the same output directory — confirm
# that predict.main does not overwrite the results of the previous folder.
for folder in all_folders:
    res=predict.main(
        input=folder,
        output="/home/george/codes/gbifxdl/data/cleaner/tests",
        class_index="/home/george/codes/gbifxdl/data/cleaner/models/class_index.json",
        model_builder_kwargs={
            "model_type":"efficientnet_v2_s",
            "weights":"/home/george/codes/gbifxdl/data/cleaner/models/efficientnet_v2_s_full_e15.pt"
        }
    )
    print(res)

Rewrite `predict.main` so that it takes as input:
- image folder
- metadata file

and to output:
- either an edited metadata file.
- or an edited folder (non-1 classes removed).

There could be a loop that takes a batch of filenames/paths and outputs the
model predictions.

In [None]:
class Postprocessing:
    """Runs a classifier built with mini_trainer's ``BaseBuilder`` over lists of image paths."""

    # Model types accepted by the constructor.
    model_list = ['efficientnet_v2_s']

    def __init__(self,
                 class_index:str,
                 batch_size:int,
                 workers:int,
                 device:torch.device,
                 model_type:str,
                 weights_path:str,
                 ):
        """
        Build the model and the image loader used by `predict`.

        Parameters
        ----------

        class_index : str
            Path to the class index.
        batch_size : int
            Number of images per inference batch.
        workers : int
            Number of worker processes given to the DataLoader.
        device : torch.device
            Device where the model and the data batches will be placed.
            Anything accepted by ``torch.device(...)`` works, e.g. ``"cuda"``.
        model_type : str
            One of `Postprocessing.model_list`.
        weights_path : str
            Path toward the model weights.

        Raises
        ------
        AssertionError
            If `model_type` is not listed in `model_list`.
        """
        assert model_type in self.model_list, f"Model should be one of {self.model_list}"

        builder = BaseBuilder()

        extra_model_kwargs, self.extra_dataloader_kwargs = builder.spec_model_dataloader(
            path=class_index,
            dir=None,
        )

        # Inference runs in half precision.
        self.dtype : torch.dtype = torch.float16
        self.device : torch.device = torch.device(device)
        self.model, model_preprocess = builder.build_model(
            device=self.device,
            dtype=self.dtype,
            model_type = model_type,
            weights = weights_path,
            **extra_model_kwargs
        )
        self.model.eval()

        # The loader is given the CPU device; batches are moved to
        # `self.device` inside `predict`.
        self.image_loader = ImageLoader(
            model_preprocess,
            self.dtype,
            torch.device("cpu")
        )
        self.batch_size = batch_size
        self.workers = workers

    def predict(self, paths):
        """
        Predict from list of paths.

        Parameters
        ----------
        paths : list
            Image paths, processed in the given order.

        Returns
        -------
        The ``data`` attribute of the result collector. The notebook reads the
        keys ``"paths"``, ``"preds"`` and ``"confs"`` from it.
        """
        ds = self.image_loader(paths)
        dl = DataLoader(
            ds,
            batch_size=self.batch_size,
            num_workers=self.workers,
            pin_memory=True,
            pin_memory_device=self.device.type,
            shuffle=False,
            drop_last=False
        )

        results=BaseResultCollector(
            training_format=False,
            verbose=False,
            **self.extra_dataloader_kwargs)

        with torch.no_grad():
            # len(dl) is the integer number of batches (last partial batch included).
            for batch_i, batch in tqdm(enumerate(dl), desc="Running inference...", total=len(dl), leave=True):
                # Offset of the first image of this batch inside `paths`;
                # valid because the loader neither shuffles nor drops batches.
                i = batch_i * self.batch_size
                # A 3-D tensor is given a leading batch dimension.
                if batch.ndim == 3:
                    batch = batch.unsqueeze(0)
                with torch.autocast(device_type=self.device.type, dtype=self.dtype):
                    prediction = self.model(batch.to(self.device))
                results.collect(
                    paths = paths[i:(i+len(batch))],
                    predictions = prediction
                )

        return results.data

    def process(self):
        """
        Placeholder for the post-processing step; not implemented yet.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError("Postprocessing.process is not implemented yet.")

In [None]:
# Build the post-processing helper around the trained efficientnet_v2_s weights.
# `device` could also be passed as torch.device("cuda:0").
pp = Postprocessing(
    class_index="/home/george/codes/gbifxdl/data/cleaner/models/class_index.json",
    batch_size=16,
    workers=6,
    device="cuda",
    model_type="efficientnet_v2_s",
    weights_path="/home/george/codes/gbifxdl/data/cleaner/models/efficientnet_v2_s_full_e15.pt",
)

In [None]:
# Classify every entry found in one insectnet image folder.
f = "/home/george/codes/gbifxdl/data/insectnet/images/8049830"
l = os.listdir(f)
al = list(map(lambda e: os.path.join(f, e), l))

preds = pp.predict(al)

In [None]:
# Keep the predictions whose label is not '1' and display the matching images.
idx = [pos for pos, label in enumerate(preds["preds"]) if label != '1']
non1 = {key: [values[pos] for pos in idx] for key, values in preds.items()}

paths, ps, cnfs = (non1[key] for key in ("paths", "preds", "confs"))

# One figure per remaining image, titled with its prediction and confidence.
for img_path, label, conf in zip(paths, ps, cnfs):
    plt.imshow(Image.open(img_path))
    plt.title(f"Pred {label}, Conf {conf}")
    plt.show()