## EDA_calc_image_stats notebook

Created on: Saturday April 9th, 2022  
Created by: Jacob Alexander Rose  

In [None]:
%load_ext autoreload
%autoreload 2

import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"


from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


import pandas as pd
from pathlib import Path
from icecream import ic
from rich import print as pp
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# from imutils.big.datamodule import Herbarium2022DataModule, Herbarium2022Dataset
from imutils.ml.data.datamodule import Herbarium2022DataModule, Herbarium2022Dataset
from imutils.ml.utils.etl_utils import ETL

import pytorch_lightning as pl
from torchvision import transforms as T
import argparse
import imutils
from hydra.experimental import compose, initialize, initialize_config_dir
import hydra
from omegaconf import DictConfig, OmegaConf
from typing import *
default_reader = None
import torchmetrics

from imutils.ml.models.pl import classifier

from imutils.ml.utils.experiment_utils import configure_callbacks, configure_loggers, configure_trainer

import matplotlib.pyplot as plt
import wandb
import imutils
from imutils.ml.data.datamodule import *
from imutils.ml.utils.etl_utils import ETL
from omegaconf import DictConfig, OmegaConf

from rich import print as pp

## Load test config

In [None]:
# overrides = ["aug@data.datamodule.transform_cfg=auto_image_aug",
#              "data/datamodule@data=extant_leaves_family_10_512_datamodule",
#              "model_cfg.backbone.name=resnext50_32x4d"]

overrides = ["data/datamodule@data=extant_leaves_family_10_512_datamodule",
             "model_cfg.backbone.name=resnext50_32x4d"]

# overrides = ["data/datamodule@data=herbarium2022-res_512_datamodule",
#           "model_cfg.backbone.name=resnext50_32x4d"]
          # "+train.pl_trainer.limit_train_batches=2",
          # "hp.batch_size=16",
          # "hp.resolution=224",
          # "+train.pl_trainer.limit_val_batches=2",
          # "train.pl_trainer.log_every_n_steps=10",
          # "train.pl_trainer.devices=1",
          # "train.pl_trainer.strategy=null",
          # 'model_cfg/loss=class-balanced-ce-loss',
          # "model_cfg.loss.beta=0.99",
          # "data.datamodule.transform_cfg.skip_augmentations=true"],

In [None]:
# Split the base ML config path into the directory and file name expected by the config loader.
config_path = os.path.dirname(imutils.ml.BASE_ML_CONF_PATH)
config_name = os.path.basename(imutils.ml.BASE_ML_CONF_PATH)

print(config_path, config_name)

# Compose the experiment config, applying the `overrides` list defined in the previous cell.
cfg = ETL.init_structured_config(config_name = config_name,
                                 config_path = config_path,
                                 job_name = "demo",
                                 dataclass_type= None,
                                 overrides=overrides,
                                 cfg = None)

# Turn struct mode off so that new top-level keys ("hydra", "run_output_dir") can be added below.
OmegaConf.set_struct(cfg, False)
# Hydra run directory
# NOTE(review): `HydraConfig` is not imported explicitly in any visible cell, so unless a star
# import provides it this `try` always lands in the `except` branch. Either way the value is
# overwritten unconditionally a few lines below, so this block currently has no effect.
try:
    hydra_dir = Path(HydraConfig.get().run.dir)
except Exception as e:
    print(e)
    hydra_dir = os.getcwd()


# Final value of hydra_dir: the experiments root taken from the config.
hydra_dir = cfg.core.experiments_root_dir #"/media/data/jacob/GitHub/image-utils/imutils/ml/hydra_experiments/2022-03-24/07-35-17"


# Only inject a minimal "hydra" section when the composed config does not already carry one.
if not cfg.get("hydra"):
    cfg.update({"hydra":{"run":{"dir":hydra_dir}},
                "run_output_dir":hydra_dir})
    print(cfg.hydra.run.dir)

# print(OmegaConf.to_yaml(cfg.hydra))#, resolve=True, sort_keys=True))
# Pretty-print the fully resolved config for inspection.
pp(OmegaConf.to_container(cfg, resolve=True))

# transform_cfg = OmegaConf.to_container(cfg.data.datamodule.transform_cfg.train, resolve=True)

# import albumentations as A

# transforms = []
# for transform_step in transform_cfg:
#     transforms.append(
#         hydra.utils.instantiate(transform_step)
#     )
    
# pp(transforms)

In [None]:
# Build the datamodule from its config node; `_recursive_=False` hands nested config nodes to the
# datamodule un-instantiated.
hydra.utils.log.info(f"Instantiating <{cfg.data.datamodule._target_}>")
datamodule = hydra.utils.instantiate(
    cfg.data.datamodule, _recursive_=False
)

# NOTE(review): `datamodule.setup()` is not called in this cell, yet later cells read
# `datamodule.train_dataset` — presumably the constructor prepares the datasets; confirm.
pp(datamodule.cfg)

In [None]:
import torch
import cv2

import numpy as np
import numpy.ma as ma
from skimage.util import img_as_ubyte

def threshold_image(img: np.ndarray, threshold: float) -> np.ndarray:
    """Return a masked copy of ``img`` in which every value strictly above ``threshold`` is masked.

    Values equal to or below ``threshold`` stay visible; the underlying data is unchanged.
    """
    above_threshold = ma.greater(img, threshold)
    return ma.masked_where(above_threshold, img)

def unnormalize(img):
    """Min-max rescale ``img`` so that its values span [0, 1].

    Args:
        img: array-like exposing ``min()``/``max()`` and elementwise arithmetic.

    Returns:
        ``(img - min) / (max - min)``. A constant image (max == min) maps to all zeros instead of
        the NaN/inf values a division by zero would produce.
    """
    lo = img.min()
    span = img.max() - lo
    # Guard the degenerate constant-image case: divide by 1 so the result is 0 everywhere.
    if span == 0:
        span = 1
    return (img - lo) / span

def image_stats(img: np.ndarray) -> str:
    """Summarize an image as one line: mean, std, min, max (3 decimals), then dtype and shape."""
    numeric_parts = [
        f"img.{stat}()={getattr(img, stat)():.3f}"
        for stat in ("mean", "std", "min", "max")
    ]
    # dtype and shape are rendered with repr(), matching the `=` f-string specifier.
    meta_parts = [f"img.dtype={img.dtype!r}", f"img.shape={img.shape!r}"]
    return ", ".join(numeric_parts + meta_parts)
    

In [None]:
def plot_with_hist_channel(image):#, channel):
    """Show an image, its min-max rescaled version, and a per-channel pixel histogram side by side.

    Args:
        image: 2-D array (plotted as a single "grayscale" histogram) or 3-D array indexed as
            ``image[:, :, c]``, whose first three channels are labelled red/green/blue.
            Masked arrays are accepted; masked pixels are left out of the histogram.

    Returns:
        None. The figure is created through pyplot and rendered by the notebook.
    """
    # (histogram color, legend label) for each channel that will be plotted.
    if image.ndim == 2:
        channel_styles = [("orange", "grayscale")]
    else:
        channel_styles = [(name, name) for name in ("red", "green", "blue")]

    # Overlapping histograms share the opacity budget so every channel stays visible.
    alpha = 1 / len(channel_styles)

    fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(27, 6))

    ax1.imshow(image)
    ax1.axis("off")
    # The stats go in the title. `Axes.legend(<str>)` treats the string as a sequence of
    # per-artist labels, and an imshow-only axes has no legend handles, so a legend call
    # would not display them.
    ax1.set_title(f"Normalized image\n{image_stats(image)}")

    ax2.imshow(unnormalize(image))
    ax2.axis("off")
    ax2.set_title("Saturation-rescaled image")

    for channel_idx, (color, label) in enumerate(channel_styles):
        extracted_channel = image if image.ndim == 2 else image[:, :, channel_idx]

        # For masked arrays keep only the unmasked pixels; as far as I can tell the histogram
        # would otherwise be computed on the underlying data and count masked values too.
        if hasattr(extracted_channel, "compressed"):
            values = extracted_channel.compressed()
        else:
            values = extracted_channel.ravel()
        print(extracted_channel.shape, values.shape)
        print(f"color: {color}")

        ax3.hist(values, bins=256, color=color, alpha=alpha, label=label)

    ax3.set_title("Pixel value histogram")
    ax3.legend()


In [None]:
# Compare stats of the raw item (`fetch_item`) with the item returned by `__getitem__`.
# NOTE(review): presumably `__getitem__` applies the configured transforms — confirm.
image_stats(datamodule.train_dataset.fetch_item(25)[0])

image_stats(datamodule.train_dataset[25][0])

fig, ax = plt.subplots(1,2, figsize=(18,8), sharex=True, sharey=True)

idx =33

img = datamodule.train_dataset.fetch_item(idx)[0]
# The dataset item is permuted (1,2,0) before plotting — assumes it is a channels-first tensor.
aug_img = datamodule.train_dataset[idx][0].permute(1,2,0).numpy()

# img_tensor = torch.from_numpy(image).permute(2,0,1)
print(img.shape)
# aug_image = augs(img_tensor).permute(1,2,0).numpy()
print(aug_img.shape)

# Left panel: raw image; right panel: item as returned by the dataset.
ax[1].imshow(aug_img)
ax[1].set_title("augmented")
ax[0].imshow(img)
ax[0].set_title("original")



from skimage.color import rgb2gray

# Histogram views of (1) the raw image, (2) its grayscale version and (3) the grayscale image
# with every pixel above the mean intensity masked out.
plot_with_hist_channel(image=img)

img2 = rgb2gray(img)
plot_with_hist_channel(image=img2)

threshold = img2.mean()
img3 = threshold_image(img2, threshold)

plot_with_hist_channel(image=img3)

## Calculate dataset stats

In [None]:
idx =33

img = datamodule.train_dataset.fetch_item(idx)[0]
# aug_img = datamodule.train_dataset[idx][0].permute(1,2,0).numpy()
image_stats(img)

In [None]:
df = datamodule.train_dataset.df
df

In [None]:
import glob
import numpy as np
from tqdm.auto import tqdm
from joblib import Parallel, delayed

def calculate_img_channel_means(img_path):
    """Compute per-channel mean and standard deviation of one image file, scaled by 1/255.

    Args:
        img_path: path of the image file, read with ``cv2.imread``.

    Returns:
        ``(means, std)``: two 3-element lists in R, G, B order.

    Raises:
        OSError: if OpenCV cannot read or decode the file.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports a missing/undecodable file by returning None instead of raising;
        # fail here with the offending path rather than with an opaque cvtColor error.
        raise OSError(f"Could not read image: {img_path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # Reduce over the two spatial axes at once; the remaining axis holds the channels.
    means = (img.mean(axis=(0, 1)) / 255.0).tolist()
    std = (img.std(axis=(0, 1)) / 255.0).tolist()
    return means, std

# images = glob.glob(os.path.join(PATH_DATASET, "train_images", "*", "*", "*.jpg"))
# All image paths of the training split (the `path` column of the dataset dataframe).
image_filepaths = df.path.values.tolist()

# images += glob.glob(os.path.join(PATH_DATASET, "test_images", "*", "*.jpg"))
# Compute per-image channel stats in parallel, one joblib worker per CPU reported by the OS.
# The result is a list of (means, std) tuples in the same order as `image_filepaths`.
clr_mean_std = Parallel(n_jobs=os.cpu_count())(
    delayed(calculate_img_channel_means)(fn) for fn in tqdm(image_filepaths)
)

In [None]:
#######


# Summary statistics (count/mean/std/quantiles) of the per-image channel means and stds.
img_color_mean = pd.DataFrame([c[0] for c in clr_mean_std]).describe()
display(img_color_mean)
img_color_std = pd.DataFrame([c[1] for c in clr_mean_std]).describe()
display(img_color_std)

# Both names are rebound here from the describe() frames to plain 3-element lists.
# Note that the dataset "std" is the average of the per-image stds, not the std pooled over all pixels.
img_color_mean = list(img_color_mean.T["mean"])
img_color_std = list(img_color_std.T["mean"])
print("Mean:", img_color_mean,"\n", "Std:", img_color_std)

In [None]:
# Binary mask of the grayscale image: 1 where a pixel is at or above the threshold, 0 elsewhere.
# `img2` and `threshold` come from the grayscale/histogram cell earlier in the notebook.
mask = np.where(img2>=threshold, 1, 0)
# img3 = img_as_ubyte(unnormalize(img2))[mask]

plt.imshow(mask)

## Continued Training

* Observation:

In [None]:
from imutils.ml.utils.experiment_utils import (configure_callbacks,
                                               configure_loggers,
                                               configure_trainer,
                                               configure_loss_func)
import hydra
# When the config enables the model_fit stage, rebuild and set up the datamodule and build the
# loss function from the training targets.
# NOTE(review): if the flag is false, the code below still runs and reuses whatever `datamodule`
# an earlier cell left behind.
if cfg.execution_list.model_fit:

    hydra.utils.log.info(f"Executing train stage: model_fit")

    hydra.utils.log.info(f"Instantiating <{cfg.data.datamodule._target_}>")
    datamodule = hydra.utils.instantiate(
        cfg.data.datamodule, _recursive_=False
    )
    datamodule.setup()

    loss_func = configure_loss_func(cfg, targets=datamodule.train_dataset.df.y)

# logging.warning("1. Before model, before trainer")
hydra.utils.log.info(f"Instantiating <{cfg.model_cfg._target_}>")
# model: pl.LightningModule = hydra.utils.instantiate(cfg.model, cfg=cfg, _recursive_=False)
# NOTE(review): the model receives the loss *config* (`cfg.model_cfg.loss`), not the `loss_func`
# object built above — confirm which one LitClassifier expects.
model = imutils.ml.models.pl.classifier.LitClassifier(cfg=cfg, #model_cfg=cfg.model_cfg,
                                                      loss_func=cfg.model_cfg.loss)

# logging.warning("2. After model, before trainer")
# ic(torch.cuda.current_device())
# ic(torch.cuda.get_device_name(0))
# Loggers, callbacks and trainer are all derived from the same config object.
wandb_logger = configure_loggers(cfg=cfg, model=model)
callbacks: List[pl.Callback] = configure_callbacks(cfg=cfg.train)	
hydra.utils.log.info(f"Instantiating the Trainer")
pp(OmegaConf.to_container(cfg.train.pl_trainer))
trainer = configure_trainer(cfg,
                            callbacks=callbacks,
                            logger=wandb_logger)
# logging.warning("3. After model, after trainer, before fit")
# ic(torch.cuda.current_device())
# Log the size of the training problem before fitting.
num_samples = len(datamodule.train_dataset)
num_classes = cfg.model_cfg.head.num_classes
batch_size = datamodule.batch_size #["train"]
hydra.utils.log.info("Starting training with {} classes across {} images in batches of {} images each.".format(
    num_classes,
    num_samples,
    batch_size))

In [None]:
pp(OmegaConf.to_container(cfg, resolve=True))
results = trainer.fit(model=model, datamodule=datamodule)



### predict

In [None]:
%load_ext autoreload
%autoreload 2

import os

os.environ["CUDA_VISIBLE_DEVICES"] = "7"


from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


import pandas as pd
from pathlib import Path
from icecream import ic
from rich import print as pp
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# from imutils.big.datamodule import Herbarium2022DataModule, Herbarium2022Dataset
from imutils.ml.data.datamodule import Herbarium2022DataModule, Herbarium2022Dataset
from imutils.ml.utils.etl_utils import ETL

import pytorch_lightning as pl
from torchvision import transforms as T
import argparse
import imutils
from hydra.experimental import compose, initialize, initialize_config_dir
import hydra
from omegaconf import DictConfig, OmegaConf
from typing import *
default_reader = None
import torchmetrics

from imutils.ml.models.pl import classifier

from imutils.ml.utils.experiment_utils import configure_callbacks, configure_loggers, configure_trainer


import wandb
import os
from pathlib import Path
# os.environ["CUDA_VISIBLE_DEVICES"] = "6,7"
# api = wandb.Api()
# run = api.run("jrose/herbarium2022/37r673ke")
import imutils
from imutils.ml.utils.etl_utils import ETL
from omegaconf import DictConfig, OmegaConf

from rich import print as pp

In [None]:


# ckpt_path = "/media/data/jacob/GitHub/image-utils/imutils/ml/hydra_experiments/2022-03-24/07-35-17/hydra_experiments/2022-03-24/07-35-17/ckpts/epoch=10-val_loss=2.834-val_F1=0.384.ckpt"

# ckpt_path = "/media/data/jacob/GitHub/image-utils/imutils/ml/hydra_experiments/2022-03-24/07-35-17/hydra_experiments/2022-03-24/07-35-17/ckpts/epoch=14-val_loss=2.521-val_F1=0.443.ckpt"

# ckpt_dir = "/media/data/jacob/GitHub/image-utils/imutils/ml/hydra_experiments/2022-03-24/07-35-17/hydra_experiments/2022-03-24/07-35-17/ckpts"
# os.listdir(ckpt_dir)

In [None]:
from imutils.ml.utils.experiment_utils import (configure_callbacks,
                                               configure_loggers,
                                               configure_trainer,
                                               configure_loss_func)


In [None]:
# Locate the checkpoint folders of the 2022-03-29 run and keep the three that sort last by name.
run_dir = "/media/data_cifs/projects/prj_fossils/users/jacob/experiments/2022/herbarium2022/hydra_experiments/2022-03-29/05-31-45"
ckpt_dir = os.path.join(run_dir, "ckpts")
ckpt_paths = sorted(os.listdir(ckpt_dir), reverse=True)[:3]

# Each checkpoint entry is a directory that holds a `model_weights.ckpt` file.
ckpt_paths = [
    os.path.join(ckpt_dir, ckpt_name, "model_weights.ckpt")
    for ckpt_name in ckpt_paths
]

In [None]:
# run_main.py
overrides = [
    'core.name=\"Experiment #19 (2022-03-29)\"',
    "optim.optimizer.weight_decay=5e-6",
    "hp.batch_size=24",
    "aug@data.datamodule.transform_cfg=medium_image_aug_conf",
    "hp.preprocess_size=512",
    "hp.resolution=448",
    "model_cfg.backbone.name=resnext50_32x4d",
    "train.pl_trainer.devices=1",
    "train.pl_trainer.accelerator=gpu",
    "data.datamodule.num_workers=4"
]

In [None]:
# Split the base ML config path into the directory and file name expected by the config loader.
config_path = os.path.dirname(imutils.ml.BASE_ML_CONF_PATH)
config_name = os.path.basename(imutils.ml.BASE_ML_CONF_PATH)

print(config_path, config_name)

# Re-compose the config with the experiment overrides from the previous cell, plus
# `strategy=null` for the trainer.
cfg = ETL.init_structured_config(config_name = config_name,
                                 config_path = config_path,
                                 job_name = "demo",
                                 dataclass_type= None,
                                 overrides = ["train.pl_trainer.strategy=null",
                                              *overrides],
                                 cfg = None)

# Turn struct mode off so that new top-level keys can be added below.
OmegaConf.set_struct(cfg, False)
# Hydra run directory
# NOTE(review): `HydraConfig` is not imported explicitly in any visible cell, so this `try`
# presumably always lands in the `except` branch; the value is overwritten right after anyway.
try:
    hydra_dir = Path(HydraConfig.get().run.dir)
except Exception as e:
    print(e)
    hydra_dir = os.getcwd()


# Final value of hydra_dir: the experiments root taken from the config.
hydra_dir = cfg.core.experiments_root_dir #"/media/data/jacob/GitHub/image-utils/imutils/ml/hydra_experiments/2022-03-24/07-35-17"


# Only inject a minimal "hydra" section when the composed config does not already carry one.
if not cfg.get("hydra"):
    cfg.update({"hydra":{"run":{"dir":hydra_dir}},
                "run_output_dir":hydra_dir})
    print(cfg.hydra.run.dir)


In [None]:
ckpt_paths

In [None]:
ckpt_path = ckpt_paths[1]
# os.listdir(ckpt_path)
ckpt_path

In [None]:
datamodule = hydra.utils.instantiate(
    cfg.data.datamodule, _recursive_=False
)
datamodule.setup()

In [None]:
loss_func = configure_loss_func(cfg, targets=datamodule.train_dataset.df.y)

hydra.utils.log.info(f"Instantiating <{cfg.model_cfg._target_}>")

In [None]:
model = imutils.ml.models.pl.classifier.LitClassifier(cfg=cfg,
                                                      loss_func=cfg.model_cfg.loss)

In [None]:
# Replace the freshly built model with the weights stored at `ckpt_path`, then switch it to
# inference mode (eval + frozen parameters). Trailing `;` suppresses the cell output.
# NOTE(review): `load_from_checkpoint` is invoked on an instance here; newer PyTorch Lightning
# releases may require calling it on the class — confirm against the installed version.
model = model.load_from_checkpoint(ckpt_path);
model.eval();
model.freeze();

In [None]:
test_dataloader = datamodule.test_dataloader()

In [None]:
from tqdm import tqdm

device="cuda"
total=len(test_dataloader)

# The inputs are moved to `device` inside the loop, so the model has to live there as well;
# no earlier visible cell moves the checkpoint-loaded model off its load-time device.
model = model.to(device)

# Run the model over the whole test set, keeping each batch's output dict with logits on the CPU.
results = []
for batch_idx, batch in tqdm(enumerate(test_dataloader), total=total):
    # x, y, metadata = batch
    batch[0] = batch[0].to(device)
    # image_ids = metadata['image_id']

    output = model.predict_step(batch, batch_idx)
    # Move logits off the GPU right away so accumulated results do not hold device memory.
    output["y_logit"] = output["y_logit"].cpu().detach()
    results.append(output)


    # results.append({"image_id": image_ids,
    #                 "y_logits": y_logits})

In [None]:
len(results)

In [None]:
import torch

In [None]:
y_logits = torch.cat([torch.argmax(r["y_logit"], -1) for r in results])
image_ids = torch.cat([r["image_id"] for r in results])
len(results[0]["image_id"])

In [None]:
import numpy as np
np.concatenate

In [None]:
# for r in results:



y_preds = np.concatenate([torch.argmax(r["y_logit"], -1).numpy() for r in tqdm(results)])
image_ids = np.concatenate([r["image_id"] for r in results])
len(results[0]["image_id"])

In [None]:
y_pred_labels = datamodule.train_dataset.label_encoder.inv_transform(y_preds)
y_pred_labels

In [None]:
submit = pd.DataFrame({"Id":image_ids,
                      "Predicted":y_preds})
submit.to_csv("2022-04-04_JRose-Exp#19_baseline_herbarium2022_test_predictions_submission.csv",index=False)

In [None]:
import pandas as pd

In [None]:
preds = pd.read_csv("2022-04-04_JRose-Exp#19_baseline_herbarium2022_test_predictions_submission.csv",index_col=None)

In [None]:
preds.shape

In [None]:
label_encoder = datamodule.train_dataset.label_encoder

# out = preds..assign(Predicted_scientificName = preds.apply(lambda x: label_encoder.inv_transform(x.Predicted), axis=1))
out = preds.head(1000).apply(lambda x: label_encoder.inv_transform(x.Predicted)[0], axis=1)

In [None]:
train_df = datamodule.train_dataset.df

test_df = datamodule.test_dataset.df

In [None]:
# One representative row per species. Uses `train_df` from the cell above: the bare name `df`
# is never assigned in the predict section of this notebook.
catalog = train_df.groupby("scientificName").head(1)

In [None]:
from tqdm import tqdm
tqdm.pandas(desc="Converting predictions back to standard category_ids")

# df.progress_apply

In [None]:
%%time
def get_category_id(query):
    """Return the ``category_id`` of the single ``catalog`` row whose encoded label ``y`` equals ``query``.

    Reads the notebook-level ``catalog`` dataframe. ``Series.item`` raises ``ValueError``
    unless exactly one row matches.
    """
    return catalog[catalog.y==query].category_id.item()

# Translate every prediction through one label -> category_id mapping. This replaces a row-wise
# `apply` that filtered the full catalog once per prediction.
y_to_category_id = catalog.set_index("y").category_id
predicted_cat_ids = preds.Predicted.map(y_to_category_id)
if predicted_cat_ids.isna().any():
    # Keep the fail-loud behavior of the per-row lookup for labels missing from the catalog.
    raise ValueError("Some predicted labels have no matching row in `catalog`.")

category_ids = preds.assign(Predicted_cat_id=predicted_cat_ids)
# category_ids =  preds.head(1000).apply(lambda x: get_category_id(query=x.Predicted), axis=1)
category_ids

In [None]:
preds_final = category_ids.copy()

In [None]:
preds_final = preds_final.drop(columns=["Predicted"]
                              ).rename(columns={"Predicted_cat_id":"Predicted"})

preds_final

In [None]:
preds_final.to_csv("2022-04-04_JRose-Exp#19_baseline_herbarium2022_test_predictions_submission--fixed-labels.csv",index=False)

In [None]:
test_df.head(1000)

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
idx = 0

# Show one test image together with its predicted encoded label and the mapped category id.
img = Image.open(test_df.path[idx])
pred_y = preds.Predicted[idx]
# `category_ids` is a DataFrame, so `category_ids[idx]` would be a column lookup (KeyError);
# read the scalar from the mapped-id column instead.
pred_category_id = category_ids.Predicted_cat_id[idx]


ax = plt.imshow(img)
plt.title(f"pred_y: {pred_y}, pred_category_id: {pred_category_id}")

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
idx = 0
samples = train_df[train_df.category_id==pred_category_id]

In [None]:
# i=0
# Show up to four training images of the predicted category.
n_show = 4
fig, ax = plt.subplots(1, n_show, figsize=(30, 50))
for i in range(min(n_show, len(samples))):
    img = Image.open(samples.path.iloc[i])
    ax[i].imshow(img)
    ax[i].set_title(f"image_id: {samples.image_id.iloc[i]}")

# A category with fewer than `n_show` training images leaves panels unused; hide them.
for unused_ax in ax[len(samples):]:
    unused_ax.axis("off")

# img

In [None]:
# NOTE(review): the image comes from `train_df` while the prediction comes from the test-set
# `preds` at the same index — confirm this pairing is intended.
img = Image.open(train_df.path[idx])
pred_y = preds.Predicted[idx]
# `category_ids` is a DataFrame, so `category_ids[idx]` would be a column lookup (KeyError);
# read the scalar from the mapped-id column instead.
pred_category_id = category_ids.Predicted_cat_id[idx]


ax = plt.imshow(img)
plt.title(f"pred_y: {pred_y}, pred_category_id: {pred_category_id}")

In [None]:
# Ids of the first 1000 predictions.
pred_ids = preds.head(1000).Id.values


# Keep the test rows whose image_id is among those ids (vectorized membership test).
out = test_df[test_df.image_id.isin(pred_ids)]
out

In [None]:
# NOTE(review): this cell looks pasted from a different notebook: `test_dl` is not defined in any
# visible cell, the model is called with keyword inputs and read through `.logits`, and the CSV
# path is a relative Kaggle-style input path. It also rebinds `preds`, which earlier cells use as
# a DataFrame, to a plain list.
preds = []
with torch.no_grad():
    for inputs in tqdm(test_dl):
        inputs['pixel_values'] = inputs['pixel_values'].to('cuda')
        outputs = model(**inputs)
        logits = outputs.logits
        # Collect the arg-max class index of every sample in the batch as a Python int.
        preds.extend([x.item() for x in logits.argmax(-1)])
# Fill the sample submission's `Predicted` column and write it out.
submit = pd.read_csv('../input/herbarium-2022-fgvc9/sample_submission.csv')
submit['Predicted'] = preds
submit.to_csv('beit.csv', index=False)

In [None]:
# Print the config stored on the model, then run validation and keep the metrics under "val".
# NOTE(review): `trainer` is not created in the predict section above; this presumably relies on
# the trainer from the training section still being alive in the kernel. `results` is rebound
# from the prediction list to a dict here.
pp(OmegaConf.to_container(model.cfg, resolve=True))
results = {}
results['val'] = trainer.validate(model, datamodule=datamodule)
pp(results['val'])

# ckpt_path = "/media/data/jacob/GitHub/image-utils/imutils/ml/hydra_experiments/2022-03-24/07-35-17/hydra_experiments/2022-03-24/07-35-17/ckpts/epoch=07-val_loss=3.338-val_F1=0.313.ckpt"

# Hard-coded checkpoint from the 2022-03-24 run.
ckpt_path = "/media/data/jacob/GitHub/image-utils/imutils/ml/hydra_experiments/2022-03-24/07-35-17/hydra_experiments/2022-03-24/07-35-17/ckpts/epoch=10-val_loss=2.834-val_F1=0.384.ckpt"

# ckpt = torch.load(ckpt_path)
# print(ckpt.keys())

# Reload weights from that checkpoint and switch to inference mode (eval + frozen parameters).
model = model.load_from_checkpoint(ckpt_path);
model.eval();
model.freeze();

# batch[0] = batch[0].cuda()
# batch[1] = batch[1].cuda()
# x, y, metadata = batch
# x = x.to('cuda')
# y_logits = model(x)
# y_logits_top5 = torch.topk(torch.Tensor(y_logits.cpu()), k=5, dim=1)
# topk = 5
# y_logits_top5_idx = y_logits_top5.indices.numpy()
# labels_k = le.decode_topk(y_logits_top5_idx)
# datamodule.train_dataset.label_encoder.classes_
# y_pred = torch.zeros_like(y_logits_top5.indices)
# topk_labels = np.empty((128,5), dtype="O")
# for k in range(5):
#     labels_k = datamodule.train_dataset.label_encoder.inverse_transform(y_logits_top5.indices[:,k])
#     topk_labels[:,k] = labels_k

## Other

In [None]:
import matplotlib.pyplot as plt

batch = (next(iter(datamodule.train_dataloader())))

x, y = batch[:2]

# import cv2
import torchvision
# read_img = cv2.imread


def plot_imgs(imgs,r=8,c=8,figsize=(20,20)):
    """Plot up to ``r * c`` images from ``imgs`` on an ``r`` x ``c`` grid.

    Each image is min-max rescaled to [0, 1] and converted with torchvision's ``to_pil_image``
    before display.

    Args:
        imgs: indexable batch of images supporting ``len()``, ``min()`` and ``max()``.
        r: number of grid rows.
        c: number of grid columns.
        figsize: matplotlib figure size.
    """
    # squeeze=False always yields a 2-D axes array, so flatten() also works for a 1x1 grid.
    _, axs = plt.subplots(r,c,figsize=figsize, squeeze=False)
    axs=axs.flatten()
    for n, ax in enumerate(axs):
        ax.axis('off')
        # A batch smaller than the grid leaves the remaining panels empty instead of failing.
        if n >= len(imgs):
            continue
        img=imgs[n]

        lo = img.min()
        span = img.max() - lo
        # A constant image has zero range; divide by 1 to avoid NaN/inf pixels.
        if span == 0:
            span = 1
        img = (img - lo) / span

        ax.imshow(torchvision.transforms.functional.to_pil_image(img))

    plt.tight_layout()
    plt.show()
    
    
    
plot_imgs(x)

In [None]:
def plot_layer_stats(self, idx):
    """Plot the three series returned by ``self.layer_stats(idx)`` on adjacent axes.

    Args:
        self: object exposing ``layer_stats(idx)``, which yields the series titled
            'mean', 'std' and '% near zero' in that order.
        idx: layer index forwarded to ``layer_stats``. The commented-out caller passes
            ``-1*layer``, so the panel titles show ``-1*idx``.
    """
    # Bind the figure to its own name so the pyplot module `plt` is not shadowed.
    fig, axs = plt.subplots(1, 3, figsize=(15,3))
    fig.subplots_adjust(wspace=0.5)
    for o,ax,title in zip(self.layer_stats(idx),axs,('mean','std','% near zero')):
        ax.plot(o)
        ax.set_title(f"{-1*idx}th layer {title}")
# for layer in range(1,4):
#     plot_layer_stats(learn.activation_stats,-1*layer)

In [None]:
from imutils.ml.models.pl.classifier import LitClassifier

model = LitClassifier(cfg=cfg,
                      loss_func=loss_func)



In [None]:
# Collect the files that belong to the last `top_k` checkpoint entries (sorted by name).
run_dir = "/media/data_cifs/projects/prj_fossils/users/jacob/experiments/2022/herbarium2022/hydra_experiments/2022-03-29/05-31-45"
ckpt_dir = os.path.join(run_dir, "ckpts")

top_k = 3

ckpt_names = sorted(os.listdir(ckpt_dir))[-top_k:]
ckpt_paths = [os.path.join(ckpt_dir, ckpt_name) for ckpt_name in ckpt_names]

# Directory entries are expanded one level into their (sorted) contents; plain files are kept.
paths = []
for entry in ckpt_paths:
    if not os.path.isdir(entry):
        paths.append(entry)
        continue
    paths.extend(os.path.join(entry, child) for child in sorted(os.listdir(entry)))
paths

In [None]:
import os
import wandb

os.environ["WANDB_PROJECT"]="herbarium2022"
!set | grep WANDB


artifact = wandb.Artifact("model-weights", "checkpoints")
# Add Files and Assets to the artifact using 
# `.add`, `.add_file`, `.add_dir`, and `.add_reference`
artifact.add_dir(ckpt_dir)
# artifact.add_file(ckpt_path)
artifact.save()




# api = wandb.Api()
# run = api.run("herbarium2022/37r673ke")
# run.upload_file(ckpt_path)
# for path in ckpt_paths:
#     print(f"Uploading file to wandb: {path}")
#     run.upload_file(path)
# run = wandb.init(project=PROJECT_NAME, resume=True)
# run.finish

In [None]:
%load_ext filprofiler

import psutil
from rich import print as pp
print(f"RAM memory % used: {psutil.virtual_memory()[2]}")

In [None]:
from imutils.ml.data.datamodule import Herbarium2022DataModule, Herbarium2022Dataset
import sys

In [None]:
%%filprofile

root_dir = "/media/data_cifs/projects/prj_fossils/data/raw_data/herbarium-2022-fgvc9_resize-512/catalogs"

ds = Herbarium2022Dataset(catalog_dir=root_dir, image_reader="PIL")

ds

In [None]:
pp(dict(ds.df.iloc[0]))

path = ds.df.path[0]

In [None]:
from PIL import Image
# %%filprofile
# img_bytes = img.tobytes()

%%filprofile

def display_obj_size(obj):
    """Print ``sys.getsizeof(obj)`` in bytes, kb and Mb, using decimal units (1 kb = 1000 bytes)."""
    size_bytes = sys.getsizeof(obj)
    # img_mem = sys.getsizeof(img_bytes)
    print("img size in memory:")
    sizes_with_units = (
        (size_bytes, "bytes"),
        (size_bytes/1000, "kb"),
        (size_bytes/1000/1000, "Mb"),
    )
    for amount, unit in sizes_with_units:
        print(f"- {amount:,} {unit}")
    
    
display_obj_size(img.tobytes())

In [None]:
import numpy as np
import cv2

In [None]:
%%filprofile

# Decode the first `take_k` dataset images from their raw file bytes.
# Note: this rebinds `paths` and `path`, which earlier cells also use.
take_k = 50
paths = ds.df.path[:take_k]

imgs = []
for i, path in enumerate(paths):
    # img = Image.open(path)
    # img = np.asarray(img)
    # Read the encoded file content as raw bytes.
    with open(path,"rb") as f:
        # imgs.append(f.read())
        img_enc = f.read()
        
    # Decode the bytes with OpenCV, then reverse the last axis (channel order).
    img_buffer = np.frombuffer(img_enc, np.uint8)
    dec_img = cv2.imdecode(img_buffer, cv2.IMREAD_ANYCOLOR)
    img = dec_img[:,:,::-1]
        
    # img = cv2.imread(path)
    # if i < 3:
        # display_obj_size(img.tobytes())
    imgs.append(img)
    # break
    
# NOTE(review): np.stack requires every decoded image to have the same shape — presumably true
# for this resized dataset; confirm.
imgs = np.stack(imgs)
# print(imgs.shape)
# print(imgs.shape)

In [None]:
imgs.shape

display_obj_size(imgs[0])

# img_buffer = imgs[2]
# NOTE(review): `imgs` holds already-decoded pixel arrays (see the cell that builds it), so
# `img_enc` here is not encoded file content and the imdecode call below presumably cannot
# decode it — this looks like a leftover from when `imgs` stored raw bytes.
img_enc = imgs[2]


# img_enc = base64.b64decode(img_b64_enc)
img_buffer = np.frombuffer(img_enc, np.uint8)
dec_img = cv2.imdecode(img_buffer, cv2.IMREAD_ANYCOLOR)
# NOTE(review): this reverses the channels of the previous `img`, not of `dec_img` — confirm intent.
img = img[:,:,::-1]

dec_img.shape

# Display the decoded image with its last axis reversed.
Image.fromarray(dec_img[:,:,::-1])

In [None]:
%memit img_mem = sys.getsizeof(img.tobytes())

print("img size in memory:")
print(f"- {img_mem:,} bytes")
print(f"- {img_mem/1000:,} kb")
print(f"- {img_mem/1000/1000:,} Mb")

In [None]:
from io import BytesIO
# Measure the encoded size of `img` as PNG and as JPEG by saving into in-memory buffers and
# reading the buffer position after the write.
# NOTE(review): `img.save(...)` requires a PIL image, but the preceding cells leave `img` bound to
# a NumPy array — confirm which object is meant here.
img_file = BytesIO()
img.save(img_file, 'png')
img_file_size_png = img_file.tell()
img_file = BytesIO()
img.save(img_file, 'jpeg')
img_file_size_jpeg = img_file.tell()
print("img_file_size png: ", img_file_size_png)
print("img_file_size jpeg: ", img_file_size_jpeg)

In [None]:
# Python 3 has no `StringIO` module; encoded image data is bytes, so use an in-memory BytesIO.
from io import BytesIO

# NOTE(review): `image_output` is not defined in any visible cell — presumably a PIL image; confirm.
output = BytesIO()
image_output.save(output, 'PNG') #a format needs to be provided
contents = output.getvalue()
output.close()

# Size in bytes of the PNG-encoded image.
image_filesize = len(contents)

ram_info = psutil.virtual_memory()

for name, quantity in ram_info._asdict().items():
    if name == "percent":
        print(f"{name}: {quantity/100:.2%}")
    else:
        print(f"{name}: {quantity/1000/1000/1000:2,} GB")
        print(f"{name}: {quantity/1000/1000:2,} MB")
        print(f"{name}: {quantity/1000:.2e} kb")
    print("="*20)

# pp(ram_info)

# total, avail, perc = ram_info[:3]

In [None]:
print(f"{0.000012079806881608064:e}")

In [None]:
print(f"{0.0000027673238836757465:e}")

In [None]:
print(f"{0.000012079806881608064/0.0000027673238836757465:e}")

In [None]:
print(f"{0.0000027673238836757465/0.000012079806881608064}")

In [None]:
!wandb sync --id "37r673ke" -p "herbarium2022" -e "jrose" --include-online "/media/data_cifs/projects/prj_fossils/users/jacob/experiments/2022/herbarium2022/hydra_experiments/2022-03-29/05-31-45/ckpts/*"

In [None]:
(671817 + 167955)

train = 671817 / 24 / 4
val = 167955 / 24 / 4

train
val
train+ val

1/1.10
1/1.3
1/1.6

24/1.10
24/1.3
24/1.6

(24/1.10)**-1
(24/1.3)**-1
(24/1.6)**-1

24/1.62

In [None]:
config_path = os.path.dirname(imutils.ml.BASE_ML_CONF_PATH)
config_name = os.path.basename(imutils.ml.BASE_ML_CONF_PATH)

cfg = ETL.init_structured_config(config_name = config_name,
                                 config_path = config_path,
                                 job_name = "demo",
                                 dataclass_type= None,
                                 overrides = ["data.datamodule.num_workers=4",
                                              "data/datamodule@data=herbarium2022-res_512_datamodule",
                                              # "train.pl_trainer.gpus=4",
                                              "train.pl_trainer.accelerator=gpu",
                                              "model_cfg.backbone.name=resnext50_32x4d"],
                                              # "model_cfg.backbone.name=resnet_50"],
                                              # "model_cfg.backbone.name=xcit_large_24_p16_224"],
                                              # "model_cfg.backbone.name=resnetv2_101x1_bitm"],
                                             # "model_cfg.backbone.name=resnetv2_50"], 
                                 cfg = None)




In [None]:
OmegaConf.set_struct(cfg, False)
# Hydra run directory
try:
    hydra_dir = Path(HydraConfig.get().run.dir)
except Exception as e:
    print(e)
    hydra_dir = os.getcwd()


hydra_dir = "/media/data/jacob/GitHub/image-utils/imutils/ml/hydra_experiments/2022-03-24/07-35-17"


if not cfg.get("hydra"):
    cfg.update({"hydra":{"run":{"dir":hydra_dir}},
                "run_output_dir":hydra_dir})
    print(cfg.hydra.run.dir)

print(OmegaConf.to_yaml(cfg.hydra))#, resolve=True, sort_keys=True))

## Instantiate experiment ingredients with config

In [None]:
from icecream import ic

# OmegaConf.register_new_resolver("int", int)

# Seed all RNGs when the config asks for a deterministic run.
if cfg.train.deterministic:
    pl.seed_everything(cfg.train.random_seed)

# In fast_dev_run (debug) mode, force zero GPUs and zero dataloader workers.
if cfg.train.pl_trainer.fast_dev_run:
    hydra.utils.log.info(
        f"Debug mode <{cfg.train.pl_trainer.fast_dev_run}>. "
        f"Forcing debugger friendly configuration!"
    )
    cfg.train.pl_trainer.gpus = 0
    cfg.data.datamodule.num_workers = 0

# NOTE(review): `HydraConfig` is not imported explicitly in any visible cell, so this presumably
# always falls back to the current working directory (the exception is swallowed silently).
try:
    hydra_dir = Path(HydraConfig.get().run.dir)
except Exception as e:
    hydra_dir = os.getcwd()

# Build the datamodule from its config node and prepare its datasets.
hydra.utils.log.info(f"Instantiating <{cfg.data.datamodule._target_}>")
datamodule: pl.LightningDataModule = hydra.utils.instantiate(
    cfg.data.datamodule, _recursive_=False
)
datamodule.setup()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from imutils.ml.utils.toolbox.nn.functional import sequence2np

from imutils.ml.utils.toolbox.nn.loss import CBCrossEntropyLoss



# Encoded training labels, used to derive per-class weights.
targets = datamodule.train_dataset.df.y

# Sanity check: with beta=0.0 every class weight is expected to equal 1.0.
loss_func = CBCrossEntropyLoss(targets=targets,
                               beta=0.0,
                               reduction="mean")

assert np.all(loss_func.weights.numpy()==1.0)

targets = datamodule.train_dataset.df.y

# The loss actually kept for the rest of the notebook: beta=0.99.
loss_func = CBCrossEntropyLoss(targets=targets,
                               beta=0.99,
                               reduction="mean")

# np.all(loss_func.weights.numpy()==1.0)

# Range, sum and count of the class weights.
w_max = loss_func.weights.numpy().max()
w_min = loss_func.weights.numpy().min()

w_sum = loss_func.weights.numpy().sum()
w_count = len(loss_func.weights)

print(f"w_max: {w_max:.5f}","\n",
      f"w_min: {w_min:.5f}","\n",
      f"w_sum: {w_sum:.5f}","\n",
      f"w_count: {w_count}")

# Min-max rescale the weights to [0, 1] for comparison with the raw weights.
w = loss_func.weights
w_max = loss_func.weights.numpy().max()
w_min = loss_func.weights.numpy().min()
w_normalized = (w - w_min) / (w_max - w_min)



# Print the same summary for the rescaled and the raw weights.
# Note: the loop rebinds `w`, `w_max`, `w_min`, `w_sum` and `w_count` on every iteration.
for name, w in [("normalized weights", w_normalized), ("weights", loss_func.weights)]:

    w_max = w.numpy().max()
    w_min = w.numpy().min()

    w_sum = w.numpy().sum()
    w_count = len(w)

    print(f"{name}:", "\n", "="*10)
    print(f"w_max: {w_max:.5f}","\n",
          f"w_min: {w_min:.5f}","\n",
          f"w_sum: {w_sum:.5f}","\n",
          f"w_count: {w_count}", "\n")

In [None]:
loss_func.classes
reindex = np.argsort(loss_func.class_counts)[::-1]
reindex

In [None]:
ordered_class_counts = loss_func.class_counts[reindex]
ordered_class_weights = loss_func.weights.numpy()[reindex]


In [None]:
fig, ax = plt.subplots(2,1, figsize=(10,5), sharex=True, sharey=False)
ax[0].plot(ordered_class_counts/np.sum(ordered_class_counts))
ax[1].plot(np.exp(ordered_class_weights))

dir(loss_func)

In [None]:
# Toy example: turn class shares into inverse-frequency weights that sum to 1.
class_shares = torch.tensor([9.8, 68.0, 5.3, 3.5, 10.8, 1.1, 1.4], dtype=torch.float32)
class_shares = class_shares / class_shares.sum()
print(class_shares)

# Invert the normalized shares, then renormalize so the weights sum to 1.
inverse_shares = 1.0 / class_shares
weights = inverse_shares / inverse_shares.sum()
print(weights)

In [None]:
y = datamodule.train_dataset.df.y

counts_df = y.value_counts()

In [None]:
# y = sequence2np(y)
# classes, class_counts = np.unique(y, return_counts=True)
# print(type(classes), type(class_counts))
# for label in range(15000):
#     assert class_counts[label] == counts_df[label]
# y = y.values.to_numpy()
# y.shape
# import numpy as np

In [None]:
# logging.warning("1. Before model, before trainer")
hydra.utils.log.info(f"Instantiating <{cfg.model_cfg._target_}>")
# model: pl.LightningModule = hydra.utils.instantiate(cfg.model, cfg=cfg, _recursive_=False)
# model = imutils.ml.models.pl.classifier.LitClassifier(cfg=cfg, #model_cfg=cfg.model_cfg,
#                                                       loss_func=cfg.model_cfg.loss)
# NOTE(review): the model construction above is commented out, so despite the log message this
# cell reuses whatever `model` an earlier cell left in the kernel.

# logging.warning("2. After model, before trainer")
# ic(torch.cuda.current_device())
# ic(torch.cuda.get_device_name(0))
# Loggers, callbacks and trainer are all derived from the same config object.
wandb_logger = configure_loggers(cfg=cfg, model=model)
callbacks: List[pl.Callback] = configure_callbacks(cfg=cfg.train)	
hydra.utils.log.info(f"Instantiating the Trainer")
pp(OmegaConf.to_container(cfg.train.pl_trainer))
trainer = configure_trainer(cfg,
                            callbacks=callbacks,
                            logger=wandb_logger)
# logging.warning("3. After model, after trainer, before fit")
# ic(torch.cuda.current_device())
# Log the size of the training problem.
num_samples = len(datamodule.train_dataset)
num_classes = cfg.model_cfg.head.num_classes
batch_size = datamodule.batch_size #["train"]
hydra.utils.log.info("Starting training with {} classes across {} images in batches of {} images each.".format(
    num_classes,
    num_samples,
    batch_size))



In [None]:
# Clear the `test_transform` attribute on the test dataset.
# NOTE(review): presumably so that samples are returned untransformed — confirm
# that the dataset actually reads this attribute.
datamodule.test_dataset.test_transform = None

In [None]:
# Shape of the first element of the first test sample.
datamodule.test_dataset[0][0].shape

In [None]:
from tqdm import tqdm

# Smoke-test the test dataloader: pull batches (with a progress bar) and stop
# as soon as the batch index exceeds 50. The batches themselves are discarded.
loader = datamodule.test_dataloader()
for i, batch in tqdm(enumerate(iter(loader)), total = len(loader)):
    if i > 50:
        break

In [None]:
# Scratch arithmetic.
# NOTE(review): presumably a throughput estimate (batch size x iterations/sec) —
# confirm, then either label it in markdown or delete the cell.
128*2.21

In [None]:
# FIXME(review): this cell held the incomplete expression `64*`, which raises a
# SyntaxError when run. The missing right-hand operand is unknown, so the
# expression is disabled here; fill in the operand or delete the cell.
# 64*

In [None]:
from PIL import Image
# Open the file referenced by the first row of the test dataframe's `path` column.
img = Image.open(datamodule.test_dataset.df.path.iloc[0])
# PIL reports size as (width, height).
img.size

# Render the image inline.
img

In [None]:
import meerkat as mk
# from meerkat.contrib.imagenette import download_imagenette


# CSV describing the split to load; the test split is selected here
# (the train split alternative is kept commented out).
# dp_csv_path = datamodule.train_dataset.split_file_path
dp_csv_path = datamodule.test_dataset.split_file_path

# download_imagenette(".")
# Load the CSV as a DataPanel and add an image column built from the `path`
# column, applying the datamodule's test transform to each image.
dp = mk.DataPanel.from_csv(dp_csv_path)
dp["img"] = mk.ImageColumn.from_filepaths(dp["path"],
                                          transform=datamodule.test_transform)
# dp[["scientificName", "image_id", "img"]].lz[:3]
# Preview the first three rows of the id and image columns.
dp[["image_id", "img"]].lz[:3]

In [None]:
# Shape of the transformed image at row 3.
dp["img"][3].shape

In [None]:
import numpy as np
# Per-row mean of the slice at index 2 along the first axis of the image array.
# NOTE(review): this assumes channel-first (C, H, W) images with blue at index 2;
# the commented-out variant below is the channel-last equivalent — confirm layout.
blue_col = dp.map(
    lambda x: np.array(x["img"])[2, :, :].mean(), 
    # lambda x: np.array(x["img"])[:, :, 2].mean(), 
    pbar=True, 
    num_workers=2
)
dp["avg_blue"] = blue_col  # add the intensities as a new column in the `DataPanel` 

dp

In [None]:
# Inspect what a single image cell and the column's backing data look like.
type(dp["img"][0])

# dir(dp["img"].data)
dp["img"].data

In [None]:
# 1. Define the forward hook
class ActivationExtractor:
    """Holds the most recent output of a module it is hooked onto.

    Pass `forward_hook` to a module's `register_forward_hook`; after each
    forward pass of that module, `activation` refers to the module's output.
    """

    def __init__(self) -> None:
        # Nothing has been captured until the hooked module runs a forward pass.
        self.activation = None

    def forward_hook(self, module, input, output) -> None:
        """Forward-hook callback: keep `output`; `module` and `input` are unused."""
        self.activation = output

# model.setup()
# 2. Register the forward hook
# After every forward pass, `extractor.activation` holds the output of the
# backbone's `layer4` module. The trailing `;` suppresses the returned handle.
extractor = ActivationExtractor()
model.net.backbone.layer4.register_forward_hook(extractor.forward_hook);

In [None]:
import torch

import meerkat.ml as mkml
# 1. Move the model to GPU
# Device index 0 is hard-coded; `.eval()` switches the model to evaluation mode.
model.to(0).eval()

# 2. Define a function that runs a forward pass over a batch 
@torch.no_grad()
def predict(batch: mk.DataPanel):
    """Run `model` on the "img" column of `batch` and return new per-row columns.

    The column's underlying tensor is moved to device 0 and passed through the
    global `model`. The forward pass also refreshes `extractor.activation`
    through the registered forward hook.

    Returns:
        dict with one entry per new column, each as long as the batch:
        - "pred": argmax of the model output over the last axis (numpy array).
        - "probs": softmax of the model output over the last axis (CPU tensor).
        - "activation": `extractor.activation` averaged over its last two
          dimensions, wrapped in an `EmbeddingColumn`.
    """
    images: torch.Tensor = batch["img"].data.to(0)
    logits: torch.Tensor = model(images)

    # Average the hooked activation over its last two dimensions.
    pooled = extractor.activation.mean(dim=[-1, -2]).cpu()

    return {
        "pred": logits.cpu().numpy().argmax(axis=-1),
        "probs": torch.softmax(logits, axis=-1).cpu(),
        "activation": mkml.EmbeddingColumn(pooled),
    }



# Work on the first 100 rows only.
valid_dp = dp.lz[:100]
# 3. Apply the update. Note that the `predict` function operates on batches, so we set
# `is_batched_fn=True`. Also, the `predict` function only accesses the "img" column; by
# specifying that here we instruct update to only load that one column and skip others.
valid_dp = valid_dp.update(
    function=predict,
    is_batched_fn=True,
    batch_size=32,
    input_columns=["img"], 
    pbar=True
)



In [None]:
# List the packages of the active conda environment (IPython shell escape).
!conda list

# dp["img"]
# NOTE(review): `updated_dp` is not assigned in any visible cell; the `update`
# call above stores its result in `valid_dp` — confirm which name is intended.
updated_dp

# Pretty-print the model's config with interpolations resolved.
pp(OmegaConf.to_container(model.cfg, resolve=True))

In [None]:
# ckpt_path = "/media/data/jacob/GitHub/image-utils/imutils/ml/hydra_experiments/2022-03-24/07-35-17/hydra_experiments/2022-03-24/07-35-17/ckpts/epoch=10-val_loss=2.834-val_F1=0.384.ckpt"

# ckpt_path = "/media/data/jacob/GitHub/image-utils/imutils/ml/hydra_experiments/2022-03-24/07-35-17/hydra_experiments/2022-03-24/07-35-17/ckpts/epoch=14-val_loss=2.521-val_F1=0.443.ckpt"

# Directory holding the checkpoints of the 2022-03-24 run.
# NOTE(review): `ckpt_dir` is not used below, and both `ckpt_path` assignments
# above are commented out, so `ckpt_path` must already exist in the kernel for
# the load to succeed — confirm which checkpoint is intended.
ckpt_dir = "/media/data/jacob/GitHub/image-utils/imutils/ml/hydra_experiments/2022-03-24/07-35-17/hydra_experiments/2022-03-24/07-35-17/ckpts"


# Restore weights, then switch to eval mode and freeze for inference.
model = model.load_from_checkpoint(ckpt_path);
model.eval();
model.freeze();


# Run the model over the test dataloader and collect, for each batch, the image
# ids together with the logits predicted for them.
results = []

for batch in iter(test_dataloader):
    x, y, metadata = batch
    # `Tensor.to` returns a new tensor instead of moving in place, so the result
    # has to be rebound; otherwise the model receives the original tensor.
    x = x.to(model.device)
    image_ids = metadata['image_id']

    y_logits = model.predict_step(x)

    # The original literal had the closing quote misplaced ("y_logits: y_logits"),
    # which is a syntax error; store the logits under a proper "y_logits" key.
    results.append({"image_id": image_ids,
                    "y_logits": y_logits})

# Pretty-print the model's config with interpolations resolved.
pp(OmegaConf.to_container(model.cfg, resolve=True))

# dir()

# `results` is rebound to a dict here, discarding any earlier value of that name.
results = {}

# Run the trainer's validation pass and keep its output under the "val" key.
results['val'] = trainer.validate(model, datamodule=datamodule)

pp(results['val'])

# datamodule.test_dataset.df
# datamodule.train_dataset.df
import numpy as np
import torch

# ckpt_path = "/media/data/jacob/GitHub/image-utils/imutils/ml/hydra_experiments/2022-03-24/07-35-17/hydra_experiments/2022-03-24/07-35-17/ckpts/epoch=07-val_loss=3.338-val_F1=0.313.ckpt"

# Checkpoint to restore: epoch 10 of the 2022-03-24 run (absolute, machine-specific path).
ckpt_path = "/media/data/jacob/GitHub/image-utils/imutils/ml/hydra_experiments/2022-03-24/07-35-17/hydra_experiments/2022-03-24/07-35-17/ckpts/epoch=10-val_loss=2.834-val_F1=0.384.ckpt"

# ckpt = torch.load(ckpt_path)
# print(ckpt.keys())

# Rebind `model` to the restored instance, then put it in eval mode and freeze it.
model = model.load_from_checkpoint(ckpt_path);
model.eval();
model.freeze();
# model.to("cpu")

# test_loader = datamodule.test_dataloader()
# batch = next(iter(test_loader))

# batch[1]#.cpu()

# batch[:2] = (b.cuda() for b in batch[:2])
# [b.cuda() for b in batch[:2]]

# batch[0] = batch[0].cuda()
# batch[1] = batch[1].cuda()
# x, y, metadata = batch
# x = x.to('cuda')
# y_logits = model(x)
# y_logits_top5 = torch.topk(torch.Tensor(y_logits.cpu()), k=5, dim=1)
# topk = 5
# y_logits_top5_idx = y_logits_top5.indices.numpy()
# labels_k = le.decode_topk(y_logits_top5_idx)
# datamodule.train_dataset.label_encoder.classes_
# y_pred = torch.zeros_like(y_logits_top5.indices)
# topk_labels = np.empty((128,5), dtype="O")
# for k in range(5):
#     labels_k = datamodule.train_dataset.label_encoder.inverse_transform(y_logits_top5.indices[:,k])
#     topk_labels[:,k] = labels_k

In [None]:
# Same logger / callbacks / trainer setup as the earlier cell in this notebook,
# repeated here before fitting. `model` is not created in this cell (the
# instantiation lines are commented out) and must already exist in the kernel.
# logging.warning("1. Before model, before trainer")
hydra.utils.log.info(f"Instantiating <{cfg.model_cfg._target_}>")
# model: pl.LightningModule = hydra.utils.instantiate(cfg.model, cfg=cfg, _recursive_=False)
# model = imutils.ml.models.pl.classifier.LitClassifier(cfg=cfg, #model_cfg=cfg.model_cfg,
#                                                       loss_func=cfg.model_cfg.loss)

# logging.warning("2. After model, before trainer")
# ic(torch.cuda.current_device())	
# ic(torch.cuda.get_device_name(0))
wandb_logger = configure_loggers(cfg=cfg, model=model)
callbacks: List[pl.Callback] = configure_callbacks(cfg=cfg.train)	
hydra.utils.log.info(f"Instantiating the Trainer")
# Show the trainer keyword arguments that will be used.
pp(OmegaConf.to_container(cfg.train.pl_trainer))
trainer = configure_trainer(cfg,
                            callbacks=callbacks,
                            logger=wandb_logger)
# logging.warning("3. After model, after trainer, before fit")
# ic(torch.cuda.current_device())
# Summary numbers for the log message below.
num_samples = len(datamodule.train_dataset)
num_classes = cfg.model_cfg.head.num_classes
batch_size = datamodule.batch_size #["train"]
hydra.utils.log.info("Starting training with {} classes across {} images in batches of {} images each.".format(
    num_classes,
    num_samples,
    batch_size))



In [None]:
# Start training with the trainer, model and datamodule configured above.
trainer.fit(model=model, datamodule=datamodule)

In [None]:
from imutils.ml.utils.model_utils import log_model_summary

# Build the classifier from the full config, passing the loss sub-config explicitly.
model = imutils.ml.models.pl.classifier.LitClassifier(cfg=cfg, #model_cfg=cfg.model_cfg,
                                              loss=cfg.model_cfg.loss)

# Dump the resolved model config as YAML with sorted keys.
print(OmegaConf.to_yaml(cfg.model_cfg, resolve=True, sort_keys=True))

# print(OmegaConf.to_yaml(cfg, resolve=True, sort_keys=True))
# print(OmegaConf.to_yaml(cfg, resolve=True, sort_keys=True))
# print(cfg['data']['datamodule']['transform_cfg'])

# Compare the type of the `input_shape` config node with the type of its first
# element after conversion to a plain Python container.
type(cfg.model_cfg.input_shape)
type(OmegaConf.to_container(cfg.model_cfg.input_shape, resolve=True)[0])

In [None]:
# Instantiate model
# hydra.utils.log.info(f"Instantiating <{cfg.model_cfg._target_}>")
# model: pl.LightningModule = hydra.utils.instantiate(model_cfg=cfg, _recursive_=False)


from imutils.ml.utils.model_utils import log_model_summary

# Build the classifier from the full config, passing the loss sub-config explicitly.
model = imutils.ml.models.pl.classifier.LitClassifier(cfg=cfg, #model_cfg=cfg.model_cfg,
                                              loss=cfg.model_cfg.loss)

# Dump the resolved model config as YAML with sorted keys.
print(OmegaConf.to_yaml(cfg.model_cfg, resolve=True, sort_keys=True))

# input_size = (1, 3,224,224)
# summary = log_model_summary(model=model,
#                             input_size=input_size,
#                             full_summary=True,
#                             working_dir=".",
#                             model_name=cfg.model_cfg.backbone.name,
#                             verbose=1)

from imutils.ml.utils.experiment_utils import configure_callbacks, configure_loggers, configure_trainer

# Instantiate the callbacks
callbacks: List[pl.Callback] = configure_callbacks(cfg=cfg.train) #OmegaConf.to_container(cfg.train, resolve=True))

# Freeze backbone layers via the model's own helper.
# NOTE(review): the exact meaning of `layer=-1` depends on `freeze_up_to`, which
# is not visible here — the listing printed below shows the actual result.
model.freeze_up_to(layer=-1,
                  submodule="backbone")

# ([(n, p.requires_grad_(False)) for n, p in model.net.backbone.named_parameters()])
# List every backbone parameter name with its current `requires_grad` flag.
pp([(n, p.requires_grad) for n, p in model.net.backbone.named_parameters()])

In [None]:
# datamodule.test_dataset.df
# datamodule.train_dataset.df
import numpy as np
import torch

# ckpt_path = "/media/data/jacob/GitHub/image-utils/imutils/ml/hydra_experiments/2022-03-24/07-35-17/hydra_experiments/2022-03-24/07-35-17/ckpts/epoch=07-val_loss=3.338-val_F1=0.313.ckpt"

# Checkpoint to restore: epoch 10 of the 2022-03-24 run (absolute, machine-specific path).
ckpt_path = "/media/data/jacob/GitHub/image-utils/imutils/ml/hydra_experiments/2022-03-24/07-35-17/hydra_experiments/2022-03-24/07-35-17/ckpts/epoch=10-val_loss=2.834-val_F1=0.384.ckpt"

# ckpt = torch.load(ckpt_path)
# print(ckpt.keys())

# Rebind `model` to the restored instance, then put it in eval mode and freeze it.
model = model.load_from_checkpoint(ckpt_path);
model.eval();
model.freeze();
# model.to("cpu")

# test_loader = datamodule.test_dataloader()
# batch = next(iter(test_loader))

# batch[1]#.cpu()

In [None]:
# batch[:2] = (b.cuda() for b in batch[:2])
# [b.cuda() for b in batch[:2]]

# batch[0] = batch[0].cuda()
# batch[1] = batch[1].cuda()
# x, y, metadata = batch
# x = x.to('cuda')
# y_logits = model(x)
# y_logits_top5 = torch.topk(torch.Tensor(y_logits.cpu()), k=5, dim=1)
# topk = 5
# y_logits_top5_idx = y_logits_top5.indices.numpy()
# labels_k = le.decode_topk(y_logits_top5_idx)
# datamodule.train_dataset.label_encoder.classes_
# y_pred = torch.zeros_like(y_logits_top5.indices)
# topk_labels = np.empty((128,5), dtype="O")
# for k in range(5):
#     labels_k = datamodule.train_dataset.label_encoder.inverse_transform(y_logits_top5.indices[:,k])
#     topk_labels[:,k] = labels_k

#### Dev topk predictions

In [None]:
# import numpy as np
# topk_labels = [] #np.empty((128,5), dtype="string")

# topk = 5
# y_logits_top5_idx = y_logits_top5.indices.numpy()

# y = y_logits_top5_idx#.shape[1]

# if isinstance(y, np.ndarray):
#     if y.ndim == 2:
#         topk = y.shape[1]
#     else:
#         topk = 1
# if isinstance(y, list):
#     if isinstance(y[0], np.ndarray):
#         topk = y[0].shape[0]
#     elif isinstance(y[0], list):
#         topk = len(y[0])

# for k in range(topk):
#     # labels_k = datamodule.train_dataset.label_encoder.inverse_transform(y_logits_top5.indices[:,k])
#     labels_k = le.decode(y[:,k])
#     topk_labels.append(labels_k)

In [None]:
# topk_labels = np.vstack(topk_labels).T
# topk_labels.shape

# topk_labels[:2,:]
# true_labels = le.decode(y.numpy())
# topk_labels[0]

# from sklearn import preprocessing
# encoder = preprocessing.LabelEncoder()
# encoder.fit([0,4,-2,6])
# encoder.classes_

# class_list = getattr(encoder, "classes_", [])
# class2idx = {label: idx for idx, label in enumerate(class_list)}
# print(class2idx)

In [None]:
from imutils.ml.utils.label_utils import LabelEncoder

# Wrap the training dataset's label encoder in the project's own LabelEncoder
# (built through its `from_sklearn` constructor), then display it.
le = LabelEncoder.from_sklearn(datamodule.train_dataset.label_encoder)
le

In [None]:
from typing import *
from imutils.ml.utils.model_utils import log_model_summary

# Summarize the model for an input of size (2, 3, 224, 224).
log_model_summary(model, input_size=(2,3,224,224))

# Inspect metadata stored in a loaded checkpoint dict.
# NOTE(review): `ckpt` is only assigned in commented-out lines
# (`ckpt = torch.load(ckpt_path)`) in the visible cells — it must already exist
# in the kernel for these lookups to work.
ckpt['epoch']
ckpt['global_step']
ckpt['pytorch-lightning_version']
ckpt['hparams_name']
ckpt['hyper_parameters']

In [None]:

# Inspect the test split's label column name and whether its dataframe has a
# `y` column at all (-1 is shown when it does not).
datamodule.test_dataset.y_col
getattr(datamodule.test_dataset.df, "y", -1)

# Class statistics are computed over train + val combined.
# df = datamodule.train_dataset.df
df = pd.concat([datamodule.train_dataset.df,
                datamodule.val_dataset.df])

# Name the counts column explicitly. Renaming column `0` after `reset_index`
# only works on pandas versions where `value_counts` returns an unnamed Series;
# newer versions name it "count", which left no `counts` column at all.
class_counts = df.value_counts("y").rename("counts").reset_index()
num_classes = class_counts.shape[0]

# Classes with at least N samples ...
above_25 = class_counts[class_counts["counts"] >= 25]
above_10 = class_counts[class_counts["counts"] >= 10]
above_5 = class_counts[class_counts["counts"] >= 5]

# ... and the complementary groups with fewer than N samples.
below_5 = class_counts[class_counts["counts"] < 5]
below_10 = class_counts[class_counts["counts"] < 10]
below_25 = class_counts[class_counts["counts"] < 25]

print("Min: ", min(class_counts["counts"]), "Max:", max(class_counts["counts"]))

# Report each group as "<name>: <number of classes>, <share of all classes>".
for group_name, group in (("above_25", above_25),
                          ("above_10", above_10),
                          ("above_5", above_5),
                          ("below_5", below_5),
                          ("below_10", below_10),
                          ("below_25", below_25)):
    pp(f"{group_name}: {group.shape[0]}, {group.shape[0] / num_classes:.4%}")

In [None]:
def filter_catalog(catalog: pd.DataFrame,
                   column: str,
                   include=None,
                   exclude=None) -> pd.DataFrame:
    """Keep or drop rows of `catalog` according to the values in `column`.

    Args:
        catalog: table to filter; it is not modified.
        column: name of the column whose values are tested.
        include: optional sequence of values to keep. Ignored unless it is a
            `Sequence`. A bare string is treated as one value, not as a
            sequence of characters.
        exclude: optional sequence of values to drop, applied after `include`.
            Same conventions as `include`.

    Returns:
        The filtered rows of `catalog`. A summary of the input and output row
        counts is printed through `pp`.
    """
    def _as_values(values):
        # A str is itself a Sequence; wrap it so it is matched as a whole label.
        return [values] if isinstance(values, str) else values

    num_rows = catalog.shape[0]
    if isinstance(include, Sequence):
        include = _as_values(include)
        pp(f"Including {len(include)}")
        # Vectorized membership test instead of a Python-level lookup per row.
        catalog = catalog[catalog[column].isin(include)]
    if isinstance(exclude, Sequence):
        exclude = _as_values(exclude)
        pp(f"Excluding {len(exclude)}")
        catalog = catalog[~catalog[column].isin(exclude)]

    num_kept = catalog.shape[0]
    # An empty input would otherwise raise ZeroDivisionError in the summary.
    kept_fraction = num_kept / num_rows if num_rows else 0.0

    pp(f"Input num_rows: {num_rows}")
    pp(f"Filtered num_rows: {num_kept}, {kept_fraction:.3%}")

    return catalog

# Drop every sample whose class is rare in train + val combined.
df = pd.concat([datamodule.train_dataset.df,
                datamodule.val_dataset.df])
                # datamodule.test_dataset.df])

# Name the counts column explicitly rather than renaming column `0`, which only
# exists on pandas versions where `value_counts` returns an unnamed Series.
class_counts = df.value_counts("y").rename("counts").reset_index()
num_classes = class_counts.shape[0]
# above_20 = class_counts[class_counts.counts>=20].counts
# above_20 = above_20.values.tolist()
above_20 = None

# Collect the *labels* (column `y`) of the rare classes. The previous version
# collected their count values instead, so the filter below compared labels
# against counts and excluded the wrong rows.
# NOTE: despite the `_20` names, the threshold in use is 40 samples.
below_20 = class_counts.loc[class_counts["counts"] < 40, "y"]
below_20 = below_20.values.tolist()
above_20_catalog = filter_catalog(catalog=df,
                                  column="y",
                                  include=above_20,
                                  exclude=below_20)
above_20_catalog

In [None]:
# Summary statistics of the per-class count table.
class_counts.describe()

# above_20 = class_counts[class_counts.counts>=20].counts
# above_20 = above_20.values.tolist()
# above_20 = None
# Rows of the count table for classes with fewer than 40 samples. This rebinds
# `below_20` to a table (earlier it was a plain list); the threshold is 40
# despite the `_20` name.
below_20 = class_counts[class_counts.counts<40] # .counts

below_20.describe()

# Per-class sample counts in the validation split alone.
datamodule.val_dataset.df.value_counts("y")

# List the attributes available on the groupby object (exploration only).
dir(datamodule.train_dataset.df.groupby("y"))



train --> (num_samples: 587,840), (num_batches: 9,185)
train --> (num_samples: 587,840), (num_batches: 9,185)
val --> (num_samples: 251,932), (num_batches: 3,937)
val --> (num_samples: 251,932), (num_batches: 3,937)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]


In [None]:
# val_loader = datamodule.val_dataloader()

# batch = next(iter(val_loader))

# len(batch)

# import torch
# from torch import nn
# # Single-label categorical
# x = torch.randn(10, 5)
# y = torch.randint(5, (10,))
# loss = nn.CrossEntropyLoss()(x, y)


# print(x.shape, y.shape, loss.shape)
# print(x.dtype, y.dtype, loss.dtype)

# # model

# num_samples = len(datamodule.train_dataset)
# num_batches = len(datamodule.train_dataloader())

# ic(num_samples, num_batches, num_samples/ num_batches)

# import numpy as np

# print(f"{2264842/2:,}")

# print(f"{2264842/3:,}")

# ic(num_samples * np.arange(5))

In [None]:
# datamodule.setup_transforms(datamodule.transform_cfg)
# from imutils.ml.data.datamodule import get_default_transforms
# a = get_default_transforms(mode="train", config=datamodule.transform_cfg)

In [None]:
import wandb
def configure_loggers(cfg, model=None):
    """Create a W&B logger from `cfg.logging.wandb` and start watching `model`.

    Note: this definition shadows the `configure_loggers` imported from
    `imutils.ml.utils.experiment_utils` at the top of the notebook.

    Args:
        cfg: experiment config. Reads `cfg.logging`, `cfg.data.datamodule`,
            `cfg.model_cfg.name` and `cfg.core.tags`.
        model: module to pass to `wandb_logger.watch`. Optional for backward
            compatibility: the previous version took no `model` argument and
            read a notebook-global `model`, so that global is still used as a
            fallback when nothing is passed.

    Returns:
        The `WandbLogger`, or None when `cfg.logging` has no "wandb" entry.
    """
    wandb_logger = None
    if "wandb" in cfg.logging:
        hydra.utils.log.info(f"Instantiating <WandbLogger>")
        wandb_config = cfg.logging.wandb
        # Run name used when the wandb config does not provide one.
        default_name = cfg.data.datamodule.get("name") + "__" + cfg.model_cfg.name
        wandb_logger = pl.loggers.WandbLogger(
            name=wandb_config.get("name", default_name),
            project=wandb_config.project,
            entity=wandb_config.entity,
            tags=cfg.core.tags,
            log_model=True,
        )

        if model is None:
            # Fallback for callers that relied on the old implicit global.
            model = globals().get("model")

        if model is not None:
            hydra.utils.log.info(f"W&B is now watching <{wandb_config.watch.log}>!")
            wandb_logger.watch(
                model, log=wandb_config.watch.log, log_freq=wandb_config.watch.log_freq
            )
        else:
            hydra.utils.log.warning("No model available; skipping wandb_logger.watch().")

    return wandb_logger

In [None]:
# wandb_logger = configure_loggers(cfg=cfg, model=model)
# print(wandb_logger)

# # The Lightning core, the Trainer
# trainer = pl.Trainer(
#     default_root_dir=cfg.hydra.run.dir,
#     logger=wandb_logger,
#     callbacks=callbacks,
#     deterministic=cfg.train.deterministic,
#     val_check_interval=cfg.logging.val_check_interval,
#     log_every_n_steps=10,
#     #auto_select_gpus=True,
#     # benchmark=True,
#     # accelerator=None,  # 'dp', "ddp" if args.gpus > 1 else None,
#     #plugins=[DDPPlugin(find_unused_parameters=True)],
#     **cfg.train.pl_trainer,
# )
# # num_samples = len(datamodule.train_dataset)
# num_classes = cfg.model_cfg.head.num_classes
# batch_size = datamodule.batch_size #["train"]
# hydra.utils.log.info("Starting training with {} classes and batches of {} images".format(
#     num_classes,
#     batch_size))
# # pp(OmegaConf.to_container(cfg.train.callbacks, resolve=True))
# trainer.fit(model=model, datamodule=datamodule)
# #%debug
# hydra.utils.log.info(f"Starting testing!")
# trainer.test(model=model, datamodule=datamodule)
# shutil.copytree(".hydra", Path(wandb_logger.experiment.dir) / "hydra")

## Measure time and function wrapping

In [None]:
def measure_time(func):
    """Decorator that prints how long each call to `func` takes.

    The elapsed wall-clock time is printed in seconds. When the wrapped
    function returns an int (interpreted as a number of processed samples),
    the throughput in samples per second is printed as well.

    Args:
        func: callable to time.

    Returns:
        A wrapper with the same signature, name and docstring as `func`; it
        returns whatever `func` returns.
    """
    import functools
    from time import perf_counter

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        start = perf_counter()
        result = func(*args, **kwargs)
        total_time = perf_counter() - start
        # The difference of two clock readings is in seconds, not milliseconds.
        print(f'Elapsed time is {total_time:.3f} s')

        # bool is a subclass of int, so exclude it explicitly; also avoid a
        # division by zero if the clock did not advance.
        is_count = isinstance(result, int) and not isinstance(result, bool)
        if is_count and total_time > 0:
            print(f"{result}/{total_time:.3f} = {result / total_time:.3f} samples/sec")
        return result
    return wrapper

In [None]:
# from tqdm import tqdm

@measure_time
def loop_through_dataloader(dataloader, num_batches):
    """Iterate over at most `num_batches` batches and count the samples seen.

    Relies on `tqdm` having been imported in an earlier cell.

    Args:
        dataloader: iterable of batches; the first element of each batch must
            support `len()` and its length is taken as the batch size.
        num_batches: maximum number of batches to draw (non-negative int).

    Returns:
        Total number of samples in the batches actually drawn. This is 0 for an
        empty dataloader, and stays correct when the dataloader is shorter than
        `num_batches` or when the last batch is smaller than the others.
    """
    from itertools import islice

    num_samples = 0
    # islice stops after `num_batches` items without fetching an extra batch.
    for batch in tqdm(islice(dataloader, num_batches), total=num_batches):
        num_samples += len(batch[0])
    return num_samples



# train_iter = iter(datamodule.train_dataloader())
# datamodule.setup()
# dataloader = datamodule.train_dataloader()
# num_batches = 40
# loop_through_dataloader(dataloader, num_batches)
# bb = next(iter(dataloader))
# pp(datamodule.cfg)
# pp(OmegaConf.to_container(datamodule.cfg))

In [None]:
# Show which type the datamodule's config object has.
type(datamodule.cfg)

In [None]:

# default_cfg = DictConfig(dict(
#     catalog_dir=None,
#     label_col="scientificName",
#     train_size=0.7,
#     shuffle=True,
#     seed=14,
#     batch_size = 128,
#     num_workers = None,
#     pin_memory=True,
#     transform_cfg=None,
#     remove_transforms=False,
# ))

# from rich import print as pp

# pp(OmegaConf.to_yaml(default_cfg))

### Mock config yaml

In [None]:
# CATALOG_DIR = "/media/data_cifs/projects/prj_fossils/users/jacob/data/herbarium_2022/catalog"

# datamodule = Herbarium2022DataModule(catalog_dir=CATALOG_DIR,
#                                      num_workers=4,
#                                      # image_reader=read_file_binary,
#                                      remove_transforms=True)
# datamodule.setup()

# subset = "train"
# dataset = datamodule.get_dataset(subset=subset)

# dataset.num_classes

# trainer.fit(model, datamodule)

## End

In [None]:
# from torchvision import transforms as T
# import argparse
# from rich import print as pp

# args = argparse.Namespace(
#     preprocess={
#         "train":{
#             "resize":512
#         },
#         "val":{
#             "resize":256
#         },
#     },
#     batch_transform={
#         "train":{
#             "random_resize_crop":224
#         },
#         "val":{
#             "center_crop":224
#         },
#     normalize=(
#        [0.485, 0.456, 0.406],
#        [0.229, 0.224, 0.225]
#     )
#     }
# )
# pp(args)

# kornia_transform = nn.Sequential(
#     K.RandomHorizontalFlip(),
#     K.RandomVerticalFlip(),
#     K.RandomMotionBlur(3, 35., 0.5),
#     K.RandomRotation(degrees=45.0),
#     K.Normalize(mean=mean_std,std=mean_std)
# )

# import numpy as np
# from torch import nn
# import torch
# from albumentations.augmentations import transforms as AT

# to_tensor = T.ToTensor()

# class Preprocess(nn.Module):

#     def __init__(self, mode="train", resize=None):
#         super().__init__()
#         self.mode = mode
#         self.resize = resize        
#         self.resize_func = T.Resize(self.resize)
    
#     @torch.no_grad()  # disable gradients for effiency
#     def forward(self, x) -> torch.Tensor:
#         # x_tmp: np.ndarray = np.array(x)  # HxWxC
#         # x_out: Tensor = to_tensor(x_tmp, keepdim=True)  # CxHxW
#         if self.resize:
#             x = self.resize_func(x)

#         return x #_out.float()# / 255.0




# class BatchTransforms(nn.Module):
#     """Module to perform data augmentation using Kornia on torch tensors."""

#     def __init__(self,
#                  mode: str="train",
#                  random_resize_crop=None,
#                  center_crop=None,
#                  apply_color_jitter: bool = False,
#                  normalize = (
#                      [0,0,0],
#                      [1,1,1]
#                  )
#                 ) -> None:
#         super().__init__()
#         self.mode = mode
#         self.random_resize_crop = random_resize_crop
#         self.center_crop = center_crop
#         self._apply_color_jitter = apply_color_jitter
#         self.normalize = normalize
        
#         self.build_transforms(mode=mode)

        
#     def add_train_transforms(self, transforms=None):
        
#         transforms = transforms or []
#         # if mode == "train":
#         transforms.append(T.RandomPerspective())
#         if type(self.random_resize_crop) == int:
#             transforms.append(T.RandomResizedCrop(self.random_resize_crop))
#         transforms.extend([
#             T.RandomHorizontalFlip(),
#             T.RandomVerticalFlip()
#         ])
#         return transforms

#     def add_test_transforms(self, transforms=None):
        
#         transforms = transforms or []
#         if type(self.center_crop) == int:
#             transforms.append(T.CenterCrop(self.center_crop))
#         return transforms


#     def build_transforms(self,
#                          mode: str = "train"):
#         transforms = []
#         if mode == "train":
#             transforms = self.add_train_transforms(transforms=transforms)
#         elif mode in ["val", "test"]:
#             transforms = self.add_test_transforms(transforms=transforms)

#         transforms.extend([
# 			# T.ToTensor(),
# 			T.Normalize(*self.normalize)
#         ])

#         self.transforms = nn.Sequential(*transforms)
#         self.jitter = AT.ColorJitter(brightness=0.2,
#                                      contrast=0.2,
#                                      saturation=0.2,
#                                      hue=0.2,
#                                      always_apply=False,
#                                      p=0.5)

#     @torch.no_grad()  # disable gradients for effiency
#     def forward(self, x: torch.Tensor) -> torch.Tensor:
#         x_out = self.transforms(x)  # BxCxHxW
#         if self._apply_color_jitter:
#             x_out = self.jitter(x_out)
#         return x_out







# for subset in ["train","val", "test"]:
#     data = Herbarium2022Dataset(subset=subset,
#                                 label_col="scientificName",
#                                 train_size=0.7,
#                                 shuffle=(subset != "test"),
#                                 seed=14,
#                                 transform=None)

#########################################

# Absolute, machine-specific data locations for the Herbarium 2022 dataset.
# In the visible part of the notebook they are referenced only by commented-out code.
# NOTE(review): consider deriving these from a configurable base directory.
CATALOG_DIR = "/media/data_cifs/projects/prj_fossils/users/jacob/data/herbarium_2022/catalog"
SHARD_DIR = "/media/data_cifs/projects/prj_fossils/users/jacob/data/herbarium_2022/webdataset"

# datamodule = Herbarium2022DataModule(catalog_dir=CATALOG_DIR,
#                                      batch_size=64,
#                                      num_workers=4,
#                                      image_reader=read_file_binary,
#                                      remove_transforms=True)
# datamodule.setup()


# train_dataloader = datamodule.train_dataloader()
# train_batch = next(iter(train_dataloader))
# datamodule.train_dataset.encoder.inverse_transform(train_batch[1])

# checkpoint_callback = [c for c in callbacks if isinstance(c, pl.callbacks.ModelCheckpoint)][0]
# logging.info(f"checkpoint_callback.best_model_path: {str(checkpoint_callback.best_model_path)}")
# config.system.tasks[f"task_{task_id}"].ckpt_path = checkpoint_callback.best_model_path
# checkpoint_callback.best_model_score = checkpoint_callback.best_model_score or 0.0
# logging.info(f"checkpoint_callback.best_model_score: {checkpoint_callback.best_model_score:.3f}")
# logging.info(f"[Initiating TESTING on task_{task_id}]")


# test_results = run_multitask_test(trainer=trainer,
#                               model=model,
#                               datamodule=datamodule,
#                               config=config,
#                               tasks="all")#,