## refactoring_datasets_datamodules_22-05-18.ipynb

Added on: Wednesday May 18th, 2022  
Adapted by: Jacob Alexander Rose  

In [None]:
%load_ext autoreload
%autoreload 2


from imutils.ml.utils.etl_utils import ETL
from omegaconf import DictConfig, OmegaConf
import os
from rich import print as pp
import hydra


import numpy as np
from typing import *
import inspect
from tqdm.auto import tqdm
import pandas as pd
from pathlib import Path
import logging
from imutils.catalog_registry import available_datasets


# `get_logger` is not imported anywhere in this notebook, so the original line
# raised NameError. Use the standard-library logger and set the level explicitly.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [None]:
import imutils

from imutils.ml.data.datamodule import *

from imutils.ml.data.datamodule import ExtantLeavesDataModuleConfig, Herbarium2022DataModuleConfig

import dataclasses
from dataclasses import dataclass

In [None]:
# Build both dataset configs with their default field values.
herb_cfg = Herbarium2022DataModuleConfig()
extant_cfg = ExtantLeavesDataModuleConfig()

# Pretty-print the Herbarium config (pp is rich's print).
pp(herb_cfg)

In [None]:
# Instantiate both datamodules with no arguments.
# NOTE(review): these class names presumably come from the star import of
# imutils.ml.data.datamodule -- confirm.
herb_dm = Herbarium2022DataModule()

extant_dm = ExtantLeavesDataModule()

In [None]:
# List the attribute names available on the Herbarium datamodule instance.
dir(herb_dm)

In [None]:
from imutils.ml.aug.image.images import (instantiate_transforms,
										 DEFAULT_CFG as DEFAULT_TRANSFORM_CFG)

In [None]:
# Pretty-print the default transform config imported from imutils.ml.aug.image.images.
pp(DEFAULT_TRANSFORM_CFG)

In [None]:
from omegaconf import OmegaConf, DictConfig, ListConfig

# Absolute location of the light image-augmentation YAML config.
cfg_path = "/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/imutils/ml/conf/aug/light_image_aug.yaml"

DEFAULT_CFG = OmegaConf.load(cfg_path)

# Attach the `hp` node in a single assignment instead of filling an empty
# node key by key; the resulting keys, values and order are the same.
DEFAULT_CFG.hp = {
    "preprocess_size": 256,
    "resolution": 224,
    "to_grayscale": False,
    "num_channels": 3,
}

In [None]:
# Convert the config to plain Python containers with interpolations resolved,
# then pretty-print the result.
pp(OmegaConf.to_container(DEFAULT_CFG, resolve=True))

In [None]:
# Default-constructed configs for both datasets.
herb_cfg = Herbarium2022DataModuleConfig()
extant_cfg = ExtantLeavesDataModuleConfig()

# Print a heading followed by the (field name, default) pairs of each config,
# Extant first and Herbarium second.
for title, cfg_obj in (("Extant", extant_cfg), ("Herbarium", herb_cfg)):
    print(title)
    pp([(f.name, f.default) for f in dataclasses.fields(cfg_obj)])

@dataclass
class DataModuleConfig:
    """Default settings shared by the dataset-specific datamodule configs.

    The subclasses defined alongside this class override ``catalog_dir`` and
    ``label_col`` and add their own split-related field.
    """

    # No catalog location by default; annotated Optional so the None default
    # matches the declared type (a plain `str` annotation did not).
    catalog_dir: Optional[str] = None
    # Name of the label column (subclasses pick their own).
    label_col: str = "family"
    shuffle: bool = True
    seed: int = 14

    # NOTE(review): the four fields below look like data-loader settings;
    # confirm against the datamodule implementation.
    batch_size: int = 128
    num_workers: int = 4
    pin_memory: bool = True
    persistent_workers: Optional[bool] = False

    # Forward reference: "Config" is not defined in this block.
    transform_cfg: Optional["Config"] = None
    to_grayscale: bool = False
    num_channels: int = 3
    remove_transforms: bool = False



@dataclass
class ExtantLeavesDataModuleConfig(DataModuleConfig):
    """Extant Leaves overrides of the shared datamodule defaults."""

    catalog_dir: str = "/media/data_cifs/projects/prj_fossils/users/jacob/data/leavesdb-v1_1/extant_leaves_family_3_512"
    label_col: str = "family"
    # Variable-length float tuple: the previous `Tuple[float]` annotation
    # described a 1-tuple, which the 3-element default did not satisfy.
    # NOTE(review): presumably (train, val, test) fractions -- confirm.
    splits: Tuple[float, ...] = (0.5, 0.2, 0.3)


@dataclass
class FossilLeavesDataModuleConfig(DataModuleConfig):
    """Fossil Leaves overrides of the shared datamodule defaults."""

    catalog_dir: str = "/media/data_cifs/projects/prj_fossils/users/jacob/data/leavesdb-v1_1/Fossil_family_3_512"
    label_col: str = "family"
    # Variable-length float tuple: the previous `Tuple[float]` annotation
    # described a 1-tuple, which the 3-element default did not satisfy.
    # NOTE(review): presumably (train, val, test) fractions -- confirm.
    splits: Tuple[float, ...] = (0.5, 0.2, 0.3)


@dataclass
class PNASDataModuleConfig(DataModuleConfig):
    """PNAS overrides of the shared datamodule defaults."""

    catalog_dir: str = "/media/data_cifs/projects/prj_fossils/users/jacob/data/leavesdb-v1_1/PNAS_family_100_512"
    label_col: str = "family"
    # Variable-length float tuple: the previous `Tuple[float]` annotation
    # described a 1-tuple, which the 3-element default did not satisfy.
    # NOTE(review): presumably (train, val, test) fractions -- confirm.
    splits: Tuple[float, ...] = (0.5, 0.2, 0.3)



############################
############################

@dataclass
class Herbarium2022DataModuleConfig(DataModuleConfig):
    """Herbarium 2022 overrides of the shared datamodule defaults."""

    # Catalog root directory. The original author's trailing note referred to
    # a "/splits/train_size-0.8" suffix, which is not part of this path.
    catalog_dir: str = "/media/data_cifs/projects/prj_fossils/data/raw_data/herbarium-2022-fgvc9_resize-512/catalogs"
    label_col: str = "scientificName"
    train_size: float = 0.8

In [None]:
# dir(extant_cfg)

# List the names exposed by the standard-library dataclasses module.
dir(dataclasses)

In [None]:
# List the attribute names available on the Extant Leaves config instance.
dir(extant_cfg)

In [None]:
# Keyword arguments collected for the Herbarium 2022 datamodule, kept as a
# plain mapping so they can be compared with the Extant Leaves set.
herb_dm_args = {
    "catalog_dir": None,
    "label_col": "scientificName",
    "train_size": 0.8,
    "smallest_taxon_col": "Species",
    "shuffle": True,
    "seed": 14,
    "batch_size": 128,
    "num_workers": None,
    "pin_memory": True,
    "persistent_workers": False,
    "train_transform": None,
    "val_transform": None,
    "test_transform": None,
    "transform_cfg": None,
    "to_grayscale": False,
    "num_channels": 3,
    "remove_transforms": False,
    "image_reader": "default",
}

In [None]:
# Keyword arguments collected for the Extant Leaves datamodule, kept as a
# plain mapping so they can be compared with the Herbarium set.
extant_dm_args = {
    "catalog_dir": None,
    "label_col": "family",
    "splits": (0.5, 0.2, 0.3),
    "smallest_taxon_col": "Species",
    "shuffle": True,
    "seed": 14,
    "batch_size": 128,
    "num_workers": None,
    "pin_memory": True,
    "persistent_workers": False,
    "train_transform": None,
    "val_transform": None,
    "test_transform": None,
    "transform_cfg": None,
    "to_grayscale": False,
    "num_channels": 3,
    "remove_transforms": False,
    "image_reader": "default",
}

In [None]:
# Argument names present only in the Extant Leaves mapping.
extant_dm_args.keys() - herb_dm_args.keys()

In [None]:
# Argument names present only in the Herbarium mapping.
herb_dm_args.keys() - extant_dm_args.keys()

In [None]:
# Walk the Herbarium arguments in order and print each one next to the
# Extant Leaves value when that mapping has the same key.
divider = "-" * 20
for arg_name in herb_dm_args:
    print(f"Herb -> {arg_name}:{herb_dm_args[arg_name]}")
    if arg_name in extant_dm_args:
        print(f"Extant -> {arg_name}:{extant_dm_args[arg_name]}")
    print(divider)

In [None]:
# Instantiate the Herbarium 2022 datamodule with every argument spelled out.
# The original cell pasted a `def` signature into the call: type annotations
# inside the arguments and a missing closing parenthesis made it a SyntaxError.
# NOTE(review): the values mirror the pasted defaults; confirm they match the
# current Herbarium2022DataModule constructor.
dm = Herbarium2022DataModule(
    catalog_dir=None,
    label_col="scientificName",
    train_size=0.8,
    smallest_taxon_col="Species",
    shuffle=True,
    seed=14,
    batch_size=128,
    num_workers=None,
    pin_memory=True,
    persistent_workers=False,
    train_transform=None,
    val_transform=None,
    test_transform=None,
    transform_cfg=None,
    to_grayscale=False,
    num_channels=3,
    remove_transforms=False,
    image_reader="default",  # original note: Image.open
)

In [None]:
dm = ExtantLeavesDataModule(catalog_dir: Optional[str]=None,
				 label_col="family",
				 splits: Tuple[float]=(0.5,0.2,0.3),
				 smallest_taxon_col: str="Species",
				 shuffle: bool=True,
				 seed=14,
				 batch_size: int = 128,
				 num_workers: int = None,
				 pin_memory: bool=True,
				 persistent_workers: Optional[bool]=False,
				 train_transform=None,
				 val_transform=None,
				 test_transform=None,
				 transform_cfg=None,
				 to_grayscale: bool=False,
				 num_channels: int=3,
				 remove_transforms: bool=False,
				 image_reader: Callable="default", #Image.open,
				 **kwargs