# Test

## Import packages

In [None]:
import tensorflow_datasets as tfds

from tensorflow.lite.python.schema_py_generated import Model

import timm
import torch
import os
import shutil
import pandas as pd
from timm.data import create_dataset


In [None]:
os.getcwd()

### Inspecting timm

In [None]:
len(timm.list_models())

In [None]:
len(timm.list_models(pretrained=True))

#### Load data

In [None]:
# Construct a tf.data.Dataset
ds = tfds.load('mnist', split='train', as_supervised=True, shuffle_files=True)

In [None]:
os.chdir(r'C:\\Users\\marti\\Desktop\\Škola\\Diplomova prace\\Imagenette\\Imagenette2')

In [None]:

noise_level = '5' # '5' or '50'
outdir = 'noisy' + noise_level
df = pd.read_csv('noisy_imagenette.csv', sep=',')
for inpath, label, is_val in zip(df['path'], df['noisy_labels_' + noise_level], df['is_valid']):
    if is_val:
        subdir = 'val'
    else:
        subdir = 'train'
    outpath = os.path.join(outdir, subdir, label)
    os.makedirs(outpath, exist_ok=True)
    shutil.copy2(inpath, outpath)

In [None]:
from datasets import load_dataset

mnist = load_dataset("ylecun/mnist")

In [None]:
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

In [None]:
os.chdir(r'C:\\Users\\marti\\Desktop\\Škola\\Diplomova prace\\CIFAR')

In [None]:
cifar = create_dataset('torch/cifar10', 'cifar10', download=True, split='train')

# Model

In [None]:
timm.list_models('efficientnet*', pretrained=True)

In [None]:
model = timm.create_model('resnet50d', pretrained=True, num_classes=10)

In [None]:
model

In [None]:
# %%writefile train.py

import argparse
from pathlib import Path

import timm
import timm.data
import timm.loss
import timm.optim
import timm.utils
import torch
import torchmetrics
from timm.scheduler import CosineLRScheduler

from pytorch_accelerated.callbacks import SaveBestModelCallback
from pytorch_accelerated.trainer import Trainer, DEFAULT_CALLBACKS


def create_datasets(image_size, data_mean, data_std, train_path, val_path):
    train_transforms = timm.data.create_transform(
        input_size=image_size,
        is_training=True,
        mean=data_mean,
        std=data_std,
        auto_augment="rand-m7-mstd0.5-inc1",
    )

    eval_transforms = timm.data.create_transform(
        input_size=image_size, mean=data_mean, std=data_std
    )

    train_dataset = timm.data.dataset.ImageDataset(
        train_path, transform=train_transforms
    )
    eval_dataset = timm.data.dataset.ImageDataset(val_path, transform=eval_transforms)

    return train_dataset, eval_dataset


class TimmMixupTrainer(Trainer):
    def __init__(self, eval_loss_fn, mixup_args, num_classes, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.eval_loss_fn = eval_loss_fn
        self.num_updates = None
        self.mixup_fn = timm.data.Mixup(**mixup_args)

        self.accuracy = torchmetrics.Accuracy(num_classes=num_classes)
        self.ema_accuracy = torchmetrics.Accuracy(num_classes=num_classes)
        self.ema_model = None

    def create_scheduler(self):
        return timm.scheduler.CosineLRScheduler(
            self.optimizer,
            t_initial=self.run_config.num_epochs,
            cycle_decay=0.5,
            lr_min=1e-6,
            t_in_epochs=True,
            warmup_t=3,
            warmup_lr_init=1e-4,
            cycle_limit=1,
        )

    def training_run_start(self):
        # Model EMA requires the model without a DDP wrapper and before sync batchnorm conversion
        self.ema_model = timm.utils.ModelEmaV2(
            self._accelerator.unwrap_model(self.model), decay=0.9
        )
        if self.run_config.is_distributed:
            self.model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(self.model)

    def train_epoch_start(self):
        super().train_epoch_start()
        self.num_updates = self.run_history.current_epoch * len(self._train_dataloader)

    def calculate_train_batch_loss(self, batch):
        xb, yb = batch
        mixup_xb, mixup_yb = self.mixup_fn(xb, yb)
        return super().calculate_train_batch_loss((mixup_xb, mixup_yb))

    def train_epoch_end(
        self,
    ):
        self.ema_model.update(self.model)
        self.ema_model.eval()

        if hasattr(self.optimizer, "sync_lookahead"):
            self.optimizer.sync_lookahead()

    def scheduler_step(self):
        self.num_updates += 1
        if self.scheduler is not None:
            self.scheduler.step_update(num_updates=self.num_updates)

    def calculate_eval_batch_loss(self, batch):
        with torch.no_grad():
            xb, yb = batch
            outputs = self.model(xb)
            val_loss = self.eval_loss_fn(outputs, yb)
            self.accuracy.update(outputs.argmax(-1), yb)

            ema_model_preds = self.ema_model.module(xb).argmax(-1)
            self.ema_accuracy.update(ema_model_preds, yb)

        return {"loss": val_loss, "model_outputs": outputs, "batch_size": xb.size(0)}

    def eval_epoch_end(self):
        super().eval_epoch_end()

        if self.scheduler is not None:
            self.scheduler.step(self.run_history.current_epoch + 1)

        self.run_history.update_metric("accuracy", self.accuracy.compute().cpu())
        self.run_history.update_metric(
            "ema_model_accuracy", self.ema_accuracy.compute().cpu()
        )
        self.accuracy.reset()
        self.ema_accuracy.reset()


def main(data_path):

    # Set training arguments, hardcoded here for clarity
    image_size = (224, 224)
    lr = 5e-3
    smoothing = 0.1
    mixup = 0.2
    cutmix = 1.0
    batch_size = 32
    bce_target_thresh = 0.2
    num_epochs = 40

    data_path = Path(data_path)
    train_path = data_path / "train"
    val_path = data_path / "val"
    num_classes = len(list(train_path.iterdir()))

    mixup_args = dict(
        mixup_alpha=mixup,
        cutmix_alpha=cutmix,
        label_smoothing=smoothing,
        num_classes=num_classes,
    )

    # Create model using timm
    model = timm.create_model(
        "resnet50d", pretrained=False, num_classes=num_classes, drop_path_rate=0.05
    )

    # Load data config associated with the model to use in data augmentation pipeline
    data_config = timm.data.resolve_data_config({}, model=model, verbose=True)
    data_mean = data_config["mean"]
    data_std = data_config["std"]

    # Create training and validation datasets
    train_dataset, eval_dataset = create_datasets(
        train_path=train_path,
        val_path=val_path,
        image_size=image_size,
        data_mean=data_mean,
        data_std=data_std,
    )

    # Create optimizer
    optimizer = timm.optim.create_optimizer_v2(
        model, opt="lookahead_AdamW", lr=lr, weight_decay=0.01
    )

    # As we are using Mixup, we can use BCE during training and CE for evaluation
    train_loss_fn = timm.loss.BinaryCrossEntropy(
        target_threshold=bce_target_thresh, smoothing=smoothing
    )
    validate_loss_fn = torch.nn.CrossEntropyLoss()

    # Create trainer and start training
    trainer = TimmMixupTrainer(
        model=model,
        optimizer=optimizer,
        loss_func=train_loss_fn,
        eval_loss_fn=validate_loss_fn,
        mixup_args=mixup_args,
        num_classes=num_classes,
        callbacks=[
            *DEFAULT_CALLBACKS,
            SaveBestModelCallback(watch_metric="accuracy", greater_is_better=True),
        ],
    )

    trainer.train(
        per_device_batch_size=batch_size,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        num_epochs=num_epochs,
        create_scheduler_fn=trainer.create_scheduler,
    )


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Simple example of training script using timm.")
    parser.add_argument("--data_dir", required=True, help="The data folder on disk.")
    args = parser.parse_args()
    main(args.data_dir)

In [None]:
print(torch.cuda.is_available())

In [None]:
print(torch.__version__)

In [None]:
import os
print(os.path.exists('C:\\Users\\marti\\Desktop\\Škola\\Diplomova prace\\Imagenette\imagenette2'))

In [None]:
import wandb
import random

In [None]:
wandb.require("core")

In [None]:
wandb.login()

In [None]:
# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="my-awesome-project",

    # track hyperparameters and run metadata
    config={
    "learning_rate": 0.02,
    "architecture": "CNN",
    "dataset": "CIFAR-100",
    "epochs": 10,
    }
)

# simulate training
epochs = 10
offset = random.random() / 5
for epoch in range(2, epochs):
    acc = 1 - 2 ** -epoch - random.random() / epoch - offset
    loss = 2 ** -epoch + random.random() / epoch + offset

    # log metrics to wandb
    wandb.log({"acc": acc, "loss": loss})

# [optional] finish the wandb run, necessary in notebooks
wandb.finish()

In [None]:
import torch
torch.cuda.empty_cache()

In [None]:
print(torch.cuda.memory_summary())

In [None]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [None]:
print(torch.cuda.list_gpu_processes())

In [None]:
print(torch.cuda.mem_get_info())

In [None]:
print(torch.cuda.memory_stats())

In [None]:
print(torch.cuda.memory_snapshot())

In [None]:
print(torch.cuda.memory_allocated())

In [None]:
print(torch.cuda.max_memory_allocated())

In [None]:
print(torch.cuda.memory_reserved())

In [None]:
torch.cuda.reset_peak_memory_stats()

In [None]:
import wandb
import yaml
from train import main

# Load the sweeper.yaml configuration from the YAML file
with open('sweep_config.yaml', 'r') as file:
    sweep_config = yaml.safe_load(file)

# Initialize the sweeper.yaml by passing the loaded config and project name
sweep_id = wandb.sweep(sweep=sweep_config)

# Start the sweeper.yaml agent, passing the main function
# The count parameter specifies how many runs you want
wandb.agent(sweep_id)

In [None]:
wandb.sweep(sweep=sweep_config)

In [None]:
wandb.agent(sweep_id="rgcjpcod", count=1)

In [8]:
import wandb
import pandas as pd
import numpy as np

# Initialize the WandB API
api = wandb.Api()

# Specify the project
project_path = "martinhruska_2000/Comparison200"

# Get all runs in the project
runs = api.runs(project_path)

<Runs martinhruska_2000/Comparison200>


In [6]:
# Initialize empty list to store data
data = []

# Loop over each run to retrieve metrics
for run in runs:
    # Get initialization strategy
    initialization = run.config.get("initialization", "unknown")

    # Extract relevant metrics, ensuring numeric values only
    eval_loss = pd.to_numeric(run.history(keys=["eval_loss"])["eval_loss"], errors="coerce").dropna()
    eval_top1 = pd.to_numeric(run.history(keys=["eval_top1"])["eval_top1"], errors="coerce").dropna()
    eval_top5 = pd.to_numeric(run.history(keys=["eval_top5"])["eval_top5"], errors="coerce").dropna()
    # grad_first1 = pd.to_numeric(run.history(keys=["grad_first1_l2"])["grad_first1_l2"], errors="coerce").dropna()
    # grad_first2 = pd.to_numeric(run.history(keys=["grad_first2_l2"])["grad_first2_l2"], errors="coerce").dropna()
    # grad_first3 = pd.to_numeric(run.history(keys=["grad_first3_l2"])["grad_first3_l2"], errors="coerce").dropna()
    # grad_first4 = pd.to_numeric(run.history(keys=["grad_first4_l2"])["grad_first4_l2"], errors="coerce").dropna()
    # grad_first5 = pd.to_numeric(run.history(keys=["grad_first5_l2"])["grad_first5_l2"], errors="coerce").dropna()
    # grad_last1 = pd.to_numeric(run.history(keys=["grad_last1_l2"])["grad_last1_l2"], errors="coerce").dropna()
    # grad_last2 = pd.to_numeric(run.history(keys=["grad_last2_l2"])["grad_last2_l2"], errors="coerce").dropna()
    # grad_last3 = pd.to_numeric(run.history(keys=["grad_last3_l2"])["grad_last3_l2"], errors="coerce").dropna()
    # grad_last4 = pd.to_numeric(run.history(keys=["grad_last4_l2"])["grad_last4_l2"], errors="coerce").dropna()
    # grad_last5 = pd.to_numeric(run.history(keys=["grad_last5_l2"])["grad_last5_l2"], errors="coerce").dropna()

    # Append metrics to data if all lists have values
    for loss, top1, top5, g_first1, g_first2, g_first3, g_first4, g_first5, g_last1, g_last2, g_last3, g_last4, g_last5 in zip(
        eval_loss, eval_top1, eval_top5, grad_first1, grad_first2, grad_first3, grad_first4, grad_first5, grad_last1, grad_last2, grad_last3, grad_last4, grad_last5
    ):
        data.append({
            "initialization": initialization,
            "eval_loss": loss,
            "eval_top1": top1,
            "eval_top5": top5,
            "grad_first1_l2": g_first1,
            "grad_first2_l2": g_first2,
            "grad_first3_l2": g_first3,
            "grad_first4_l2": g_first4,
            "grad_first5_l2": g_first5,
            "grad_last1_l2": g_last1,
            "grad_last2_l2": g_last2,
            "grad_last3_l2": g_last3,
            "grad_last4_l2": g_last4,
            "grad_last5_l2": g_last5,
        })

# Convert data to a DataFrame
df = pd.DataFrame(data)

# Group by initialization and calculate summary statistics for each metric
summary_stats = df.groupby("initialization").agg(["mean", "median", "std"])

# Display the summary statistics
print(summary_stats)

KeyError: 'grad_first1_l2'

In [19]:
# Check the list of logged metrics for the first run
if runs:
    run = runs[0]
    # Get all keys (metrics) logged to this run
    history = run.history()  # Returns a DataFrame with all metrics logged
    metrics = history.columns  # List of all metric names

    # Display the metric names
    print("Logged Metrics:")
    for metric in metrics:
        print(metric)

    # Optional: Display sample data for each metric
    print("\nSample Data:")
    print(history.head())  # Show the first few rows of data
else:
    print("No runs found in the project.")

Logged Metrics:
train_grad_classifier_last_l2
lr
eval_top1
grad_bn1_first_l2
eval_top5
epoch
train_loss
train_grad_bn1_first_l2
_runtime
_step
eval_loss
grad_classifier_last_l2
train_grad_conv_stem_first_l2
_timestamp
grad_conv_stem_first_l2

Sample Data:
   train_grad_classifier_last_l2  lr  eval_top1  grad_bn1_first_l2  eval_top5  \
0                            NaN NaN        NaN                NaN        NaN   
1                            NaN NaN        NaN        4485.817871        NaN   
2                            NaN NaN        NaN                NaN        NaN   
3                            NaN NaN        NaN                NaN        NaN   
4                            NaN NaN        NaN                NaN        NaN   

   epoch  train_loss  train_grad_bn1_first_l2    _runtime  _step eval_loss  \
0      0         NaN                      NaN  144.823756      0      None   
1      0         NaN                      NaN  144.824754      1      None   
2      0         NaN   

In [20]:
# Initialize empty list to store data
data = []

# Loop over each run to retrieve metrics
for run in runs:
    # Get initialization strategy
    initialization = run.config.get("initialization", "unknown")

    # Extract relevant metrics, ensuring numeric values only
    eval_loss = pd.to_numeric(run.history(keys=["eval_loss"])["eval_loss"], errors="coerce").dropna()
    eval_top1 = pd.to_numeric(run.history(keys=["eval_top1"])["eval_top1"], errors="coerce").dropna()
    eval_top5 = pd.to_numeric(run.history(keys=["eval_top5"])["eval_top5"], errors="coerce").dropna()
    grad_classifier_last_l2 = pd.to_numeric(run.history(keys=["grad_classifier_last_l2"])["grad_classifier_last_l2"], errors="coerce").dropna()
    grad_bn1_first_l2 = pd.to_numeric(run.history(keys=["grad_bn1_first_l2"])["grad_bn1_first_l2"], errors="coerce").dropna()
    grad_conv_stem_first_l2 = pd.to_numeric(run.history(keys=["grad_conv_stem_first_l2"])["grad_conv_stem_first_l2"], errors="coerce").dropna()
    # grad_first4 = pd.to_numeric(run.history(keys=["train_grad_first4_l2"])["train_grad_first4_l2"], errors="coerce").dropna()
    # grad_first5 = pd.to_numeric(run.history(keys=["train_grad_first5_l2"])["train_grad_first5_l2"], errors="coerce").dropna()
    # grad_last1 = pd.to_numeric(run.history(keys=["train_grad_last1_l2"])["train_grad_last1_l2"], errors="coerce").dropna()
    # grad_last2 = pd.to_numeric(run.history(keys=["train_grad_last2_l2"])["train_grad_last2_l2"], errors="coerce").dropna()
    # grad_last3 = pd.to_numeric(run.history(keys=["train_grad_last3_l2"])["train_grad_last3_l2"], errors="coerce").dropna()
    # grad_last4 = pd.to_numeric(run.history(keys=["train_grad_last4_l2"])["train_grad_last4_l2"], errors="coerce").dropna()
    # grad_last5 = pd.to_numeric(run.history(keys=["train_grad_last5_l2"])["train_grad_last5_l2"], errors="coerce").dropna()

    # Append metrics to data if all lists have values
    for loss, top1, top5, g_classifier_last_l2, g_bn1_first_l2, g_conv_stem_first_l2 in zip(
        eval_loss, eval_top1, eval_top5, grad_classifier_last_l2, grad_bn1_first_l2, grad_conv_stem_first_l2
    ):
        data.append({
            "initialization": initialization,
            "eval_loss": loss,
            "eval_top1": top1,
            "eval_top5": top5,
            "grad_classifier_last_l2": g_classifier_last_l2,
            "grad_bn1_first_l2": g_bn1_first_l2,
            "grad_conv_stem_first_l2": g_conv_stem_first_l2,
        })

# Convert data to a DataFrame
df = pd.DataFrame(data)

# Group by initialization and calculate summary statistics for each metric
summary_stats = df.groupby("initialization").agg(["mean", "median", "std"])

# Display the summary statistics
print(summary_stats)

               eval_loss                      eval_top1                    \
                    mean    median       std       mean median        std   
initialization                                                              
goog            0.676778  0.551158  0.404998  80.405718  84.26  12.726949   
he              0.845768  0.680184  0.541014  74.031098  79.58  15.589236   
normal          0.693216  0.552274  0.419367  78.517342  82.32  13.596558   
uniform         0.744234  0.601943  0.390350  78.482061  82.55  12.518836   
xavier          0.759582  0.605189  0.471669  76.499155  81.32  15.041674   

                eval_top5                  grad_classifier_last_l2  \
                     mean median       std                    mean   
initialization                                                       
goog            97.969295  99.02  4.397228              552.024513   
he              96.646253  98.47  6.308088              827.003985   
normal          97.691718  98.94 

Vidíme, že ve všech základních metrikách je nejlepší inicializace goog (neboli ta základní z knihovny timm). He inicializace je naopak nejhorší v těch základních metrikách. Gradienty jsou ještě špatně naprogramované (počet vrstev a názvy), ale jinak by měli fungovat. Zajímavé je, že druhá vrstva u uniform inicialiźace má oproti ostatním inicializacím velmi vysoké std a zároveň u poslední vrstvy dokonce má NaN jako mean a std. Očividně metrika top5 má nízké variace mezi inicializacemi. Normal inicializace je výrazně nejlepší v gradientech a jejich variaci. Další čeho si můžeme všimnout je fakt, že gradienty obecně, neboli jak mean, meadian, tak i std, tak se výrazně liší svojí magnitudou mezi vrstvami. Zatímco v první vrstvě jsou čísla velmi vysoká, tak v druhé jsou zase naopak velmi nízká.