## LMDB Datasets Dev notebook.ipynb

Added on: Sunday April 18th, 2022  
Adapted by: Jacob Alexander Rose  

In [2]:
%load_ext autoreload
%autoreload 2


import lmdb

from imutils.ml.utils.etl_utils import ETL
from omegaconf import DictConfig, OmegaConf
import os
from rich import print as pp
import hydra


import numpy as np
import pyarrow as pa
pa.__version__

import inspect
from tqdm.auto import tqdm
import pandas as pd

from imutils.ml.utils.template_utils import get_logger
import logging

logger = get_logger(name=__name__, level=logging.INFO)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
os.remove

In [4]:

tbl_str = """Time Period	Votes Eligible	Missed Votes	Percent	Percentile
2001 Jan-Mar	63	0	0.0%	0th
2001 Apr-Jun	157	0	0.0%	0th
2001 Jul-Sep	68	3	4.4%	74th
2001 Oct-Dec	92	0	0.0%	0th
2002 Jan-Mar	59	1	1.7%	34th
2002 Apr-Jun	107	0	0.0%	0th
2002 Jul-Sep	61	1	1.6%	46th
2002 Oct-Nov	26	0	0.0%	0th
2003 Jan-Mar	112	0	0.0%	0th
2003 Apr-Jun	150	0	0.0%	0th
2003 Jul-Sep	108	1	0.9%	46th
2003 Oct-Nov	89	3	3.4%	78th
2004 Jan-Mar	64	0	0.0%	0th
2004 Apr-Jun	88	2	2.3%	66th
2004 Jul-Sep	42	10	23.8%	96th
2004 Oct-Dec	22	3	13.6%	81st
2005 Jan-Mar	81	11	13.6%	97th
2005 Apr-Jun	89	0	0.0%	0th
2005 Jul-Sep	76	0	0.0%	0th
2005 Oct-Dec	120	1	0.8%	45th
2006 Jan-Mar	83	2	2.4%	68th
2006 Apr-Jun	107	1	0.9%	34th
2006 Jul-Sep	73	1	1.4%	60th
2006 Nov-Dec	16	0	0.0%	0th
2007 Jan-Mar	126	3	2.4%	63rd
2007 Apr-Jun	112	3	2.7%	74th
2007 Jul-Sep	119	26	21.8%	93rd
2007 Oct-Dec	85	71	83.5%	99th
2008 Jan-Mar	85	38	44.7%	97th
2008 Apr-Jun	77	60	77.9%	98th
2008 Jul-Sep	47	7	14.9%	92nd
2008 Oct-Dec	6	0	0.0%	0th
2009 Jan-Mar	6	1	16.7%	95th"""

tbl = [row.split("\t") for row in tbl_str.split("\n")[1:]]
headers = [row.split("\t") for row in tbl_str.split("\n")[:1]]
# print(tbl)

df = pd.DataFrame(tbl)
print(headers)

[['Time Period', 'Votes Eligible', 'Missed Votes', 'Percent', 'Percentile']]


In [7]:
dir()

['DictConfig',
 'ETL',
 'In',
 'OmegaConf',
 'Out',
 '_',
 '__',
 '___',
 '__builtin__',
 '__builtins__',
 '__doc__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_dh',
 '_i',
 '_i1',
 '_i2',
 '_i3',
 '_i4',
 '_i5',
 '_i6',
 '_i7',
 '_ih',
 '_ii',
 '_iii',
 '_oh',
 'df',
 'exit',
 'get_ipython',
 'get_logger',
 'headers',
 'hydra',
 'inspect',
 'lmdb',
 'logger',
 'logging',
 'np',
 'os',
 'pa',
 'pd',
 'pp',
 'quit',
 'tbl',
 'tbl_str',
 'tqdm']

In [5]:
df = df.astype({1:int, 2: int})

In [18]:
df.loc[:,2].sum()/df.loc[:,1].sum()


0.09518348623853211

In [25]:
import logging

__file__ = os.getcwd()
logger = logging.getLogger(__file__)
logger.info(f"Using HERBARIUM_ROOT_DEFAULT location: {os.environ['HERBARIUM_ROOT_DEFAULT']}")

# dir(logging)

In [2]:
# config_path = os.path.dirname(imutils.ml.BASE_ML_CONF_PATH)
# config_name = os.path.basename(imutils.ml.BASE_ML_CONF_PATH)


dataset_cfg_path = "/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/imutils/ml/conf/data/datasets/extant_leaves_family_10_512_dataset.yaml"

config_path = os.path.dirname(dataset_cfg_path)
config_name = os.path.basename(dataset_cfg_path)

overrides = []

print(config_path, config_name)

cfg = ETL.init_structured_config(config_name = config_name,
                                 config_path = config_path,
                                 job_name = "LMDB Dataset Creation",
                                 dataclass_type= None,
                                 overrides=overrides,
                                 cfg = None)

/media/data_cifs/projects/prj_fossils/users/jacob/github/image-utils/imutils/ml/conf/data/datasets extant_leaves_family_10_512_dataset.yaml


In [3]:
# cfg = OmegaConf.to_container(cfg, resolve=True)

datasets = {}
for subset in ["train", "val", "test"]:
        
    hydra.utils.log.info(f"Instantiating <{cfg.data.datasets[subset]._target_}>")
    datasets[subset] = hydra.utils.instantiate(
        cfg.data.datasets[subset], _recursive_=False, output_image_type = np.ndarray
    )
    datasets[subset].setup()


pp(OmegaConf.to_container(cfg, resolve=True))

In [12]:
# from imutils.ml.data.lmdb.convert_lmdb_tools import *
from imutils.ml.data.lmdb import convert_lmdb_tools
from imutils.ml.data.lmdb.folder2lmdb import ImageLMDBDataset

In [8]:
%%time

lmdb_save_dir = os.path.join(cfg.data.catalog_dir, "lmdb_data")
os.makedirs(lmdb_save_dir, exist_ok=True)

for subset in ["train", "val", "test"]:
    data = datasets[subset]
    # print(f"Starting on subset: {subset} with {len(data)}")

    convert_lmdb_tools.generate_lmdb_dataset(dataset=data,
                                             save_dir=lmdb_save_dir,
                                             name=subset,
                                             collate_fn=lambda x: x,
                                             num_workers=8,
                                             max_size_rate=1.0,
                                             write_frequency=1000,
                                             pbar_position=0)



'/media/data_cifs/projects/prj_fossils/users/jacob/data/leavesdb-v1_1/Extant_Leaves_family_10_512/splits/splits=(0.5,0.2,0.3)/lmdb_data'

In [13]:
lmdb_save_dir = os.path.join(cfg.data.catalog_dir, "lmdb_data")

train_data = ImageLMDBDataset(db_dir=lmdb_save_dir,
                       subset="train",
                       transform=None,
                       target_transform=None, 
                       backend='cv2')

In [14]:
len(train_data)

11797

In [17]:
train_data[0][0].shape

torch.Size([3, 512, 512])

In [18]:
from torch.utils.data import DataLoader
data_loader = DataLoader(train_data, num_workers=4)


In [19]:
%%time

for batch in tqdm(iter(data_loader), total=len(train_data)):
    pass

  0%|          | 0/11797 [00:00<?, ?it/s]

CPU times: user 21.3 s, sys: 11.1 s, total: 32.5 s
Wall time: 9min 56s


In [20]:
%%time

train_data = datasets["train"]
data_loader = DataLoader(train_data, num_workers=4)


for batch in tqdm(iter(data_loader), total=len(train_data)):
    pass

  0%|          | 0/11797 [00:00<?, ?it/s]

CPU times: user 17.2 s, sys: 6.32 s, total: 23.5 s
Wall time: 2min 25s


In [22]:
import torch

ckpt_path = "/media/data_cifs/projects/prj_fossils/users/jacob/experiments/2022/herbarium2022/hydra_experiments/2022-04-01/21-13-25/ckpts/epoch=22-val_loss=1.316-val_macro_F1=0.720/model_weights.ckpt"

ckpt = torch.load(ckpt_path)

ckpt.keys()

dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'hparams_name', 'hyper_parameters', 'hparams_type'])

In [23]:
ckpt["state_dict"].keys()

odict_keys(['net.backbone.conv1.weight', 'net.backbone.bn1.weight', 'net.backbone.bn1.bias', 'net.backbone.bn1.running_mean', 'net.backbone.bn1.running_var', 'net.backbone.bn1.num_batches_tracked', 'net.backbone.layer1.0.conv1.weight', 'net.backbone.layer1.0.bn1.weight', 'net.backbone.layer1.0.bn1.bias', 'net.backbone.layer1.0.bn1.running_mean', 'net.backbone.layer1.0.bn1.running_var', 'net.backbone.layer1.0.bn1.num_batches_tracked', 'net.backbone.layer1.0.conv2.weight', 'net.backbone.layer1.0.bn2.weight', 'net.backbone.layer1.0.bn2.bias', 'net.backbone.layer1.0.bn2.running_mean', 'net.backbone.layer1.0.bn2.running_var', 'net.backbone.layer1.0.bn2.num_batches_tracked', 'net.backbone.layer1.0.conv3.weight', 'net.backbone.layer1.0.bn3.weight', 'net.backbone.layer1.0.bn3.bias', 'net.backbone.layer1.0.bn3.running_mean', 'net.backbone.layer1.0.bn3.running_var', 'net.backbone.layer1.0.bn3.num_batches_tracked', 'net.backbone.layer1.0.se.fc1.weight', 'net.backbone.layer1.0.se.fc1.bias', 'net.b