In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import torch
from torch import nn, Tensor
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from torchvision.transforms import v2
from torch.backends import cudnn
from torch import GradScaler
from torch import optim
from tqdm import tqdm
import numpy as np
import pickle
import time

from torchvision.datasets import CIFAR10, CIFAR100, MNIST, OxfordIIITPet
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau
import timm
import wandb
wandb.login(key=os.getenv('WANDB_API_KEY'))
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter


# Environment probe: report whether this torch build ships the Muon optimizer
# (create_optimizer references torch.optim.Muon) and which torch version runs.
print("Muon" in dir(torch.optim))
print(torch.__version__)
# Use the current accelerator when one is available, otherwise fall back to CPU.
# `device` is read as a module-level global by train_model.
device = torch.accelerator.current_accelerator() if torch.accelerator.is_available() else torch.device("cpu")
# True on any non-CPU device. NOTE(review): in the code visible here this flag is
# only printed; train_model builds its GradScaler from config["use_amp"] instead.
enable_half = device.type != "cpu"
# scaler = torch.amp.GradScaler('cuda', enabled=enable_half)

print("Grad scaler is enabled:", enable_half)
print("Device:", device)

# Per-architecture batch size. train_model uses it as the training batch size
# (or as the upper bound when the batch-size scheduler is enabled) and embeds
# it in log and checkpoint names.
batch_size_map = dict(
    resnet18=128,
    resnet50=64,
    resnest14d=32,
    resnest26d=16,
    mlp=256,
)


class BatchSizeScheduler:
    """Grow the batch size by 1.5x every ``step_size`` epochs, capped at a maximum.

    Attributes set by the constructor:
        initial_batch_size: batch size the schedule started from.
        current_batch_size: batch size currently in effect.
        max_batch_size: upper bound the schedule never exceeds.
        step_size: number of ``step()`` calls between growth attempts.
        epoch: number of ``step()`` calls made so far.
    """

    def __init__(self, initial_batch_size, max_batch_size, step_size=30):
        self.initial_batch_size = initial_batch_size
        self.current_batch_size = initial_batch_size
        self.max_batch_size = max_batch_size
        self.step_size = step_size
        self.epoch = 0

    def step(self):
        """Advance one epoch; return True only if the batch size actually changed."""
        self.epoch += 1

        at_boundary = self.epoch % self.step_size == 0
        if not at_boundary:
            return False
        if not (self.current_batch_size < self.max_batch_size):
            return False

        previous = self.current_batch_size
        # int() truncates, so very small sizes (e.g. 1) may not grow at all.
        self.current_batch_size = min(int(previous * 1.5), self.max_batch_size)
        if self.current_batch_size == previous:
            return False

        print(f"Batch size increased: {previous} → {self.current_batch_size}")
        return True

    def get_batch_size(self):
        """Return the batch size currently in effect."""
        return self.current_batch_size


def get_transforms(dataset, image_size, is_train=True, use_heavy_aug=False):
    """Build the torchvision v2 preprocessing pipeline for one dataset split.

    Args:
        dataset: dataset name. "MNIST" gets a resize-only pipeline, "CIFAR10" and
            "CIFAR100" share one set of normalisation statistics, and any other
            name uses the statistics in the final branch (OxfordIIITPet in this file).
        image_size: target spatial size; 32 selects the small-image training recipe.
        is_train: build the augmenting pipeline when True, the deterministic
            resize + centre-crop pipeline otherwise.
        use_heavy_aug: add RandomErasing; only honoured for 32-pixel training.

    Returns:
        A ``v2.Compose`` transform.
    """
    to_float = v2.ToDtype(torch.float32, scale=True)

    # MNIST: no augmentation and no normalisation, for either split.
    if dataset == "MNIST":
        return v2.Compose([v2.ToImage(), v2.Resize(image_size), to_float])

    if dataset in ["CIFAR10", "CIFAR100"]:
        mean, std = (0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761)
    else:  # OxfordIIITPet
        mean, std = (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)

    # Every non-MNIST pipeline ends with float conversion + normalisation.
    tail = [to_float, v2.Normalize(mean, std)]

    if not is_train:
        head = [
            v2.ToImage(),
            v2.Resize(image_size),
            v2.CenterCrop(image_size),
        ]
    elif image_size == 32:
        head = [
            v2.ToImage(),
            v2.RandomCrop(32, padding=4),
            v2.RandomHorizontalFlip(),
        ]
        if use_heavy_aug:
            # Applied before the float conversion, exactly where the pipeline had it.
            head.append(v2.RandomErasing(p=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3)))
    else:
        head = [
            v2.ToImage(),
            v2.Resize(256),
            v2.RandomCrop(image_size),
            v2.RandomHorizontalFlip(),
        ]

    return v2.Compose(head + tail)


# DATA_ROOT = "/kaggle/working/data"
# os.makedirs(DATA_ROOT, exist_ok=True)

def get_data_loaders(dataset_name, image_size, batch_size, num_workers=2, pin_memory=True, use_heavy_aug=False):
    """Create the train and test DataLoaders for a supported dataset.

    Datasets are stored under ``./data`` and downloaded on demand.

    Args:
        dataset_name: one of "CIFAR100", "CIFAR10", "MNIST", "OxfordIIITPet".
        image_size: forwarded to ``get_transforms``.
        batch_size: batch size used by both loaders.
        num_workers: DataLoader worker count.
        pin_memory: DataLoader ``pin_memory`` flag.
        use_heavy_aug: forwarded to ``get_transforms`` for the training split only.

    Returns:
        ``(train_loader, test_loader)``; only the training loader shuffles.

    Raises:
        ValueError: if ``dataset_name`` is not supported.
    """
    train_tf = get_transforms(dataset_name, image_size, is_train=True, use_heavy_aug=use_heavy_aug)
    eval_tf = get_transforms(dataset_name, image_size, is_train=False)

    # (name, dataset class, extra kwargs for the train split, extra kwargs for the test split)
    specs = (
        ("CIFAR100", CIFAR100, {"train": True}, {"train": False}),
        ("CIFAR10", CIFAR10, {"train": True}, {"train": False}),
        ("MNIST", MNIST, {"train": True}, {"train": False}),
        # The training split relies on the dataset class's default `split`.
        ("OxfordIIITPet", OxfordIIITPet, {}, {"split": "test"}),
    )

    for name, dataset_cls, train_kwargs, test_kwargs in specs:
        if dataset_name == name:
            train_set = dataset_cls(root='./data', download=True, transform=train_tf, **train_kwargs)
            test_set = dataset_cls(root='./data', download=True, transform=eval_tf, **test_kwargs)
            break
    else:
        raise ValueError(f"Unsupported dataset: {dataset_name}")

    loader_kwargs = dict(batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory)
    train_loader = DataLoader(train_set, shuffle=True, **loader_kwargs)
    test_loader = DataLoader(test_set, shuffle=False, **loader_kwargs)
    return train_loader, test_loader



def get_num_classes(dataset_name):
    """Return the number of target classes for a supported dataset.

    Raises:
        ValueError: if ``dataset_name`` is not one of the known datasets.
    """
    class_counts = (
        ("CIFAR100", 100),
        ("CIFAR10", 10),
        ("MNIST", 10),
        ("OxfordIIITPet", 37),
    )
    for known_name, count in class_counts:
        if dataset_name == known_name:
            return count
    raise ValueError(f"Unsupported dataset: {dataset_name}")


# Architectures create_model accepts: four timm backbones plus a small MLP.
ALLOWED_MODELS = ["resnet18", "resnet50", "resnest14d", "resnest26d", "mlp"]


def create_model(model_name, dataset_name, pretrained=False, image_size=32):
    """Instantiate a classifier sized for the given dataset.

    Args:
        model_name: one of ``ALLOWED_MODELS``.
        dataset_name: used to look up the number of output classes (and, for the
            MLP, whether inputs have one channel (MNIST) or three).
        pretrained: forwarded to ``timm.create_model``.
        image_size: input side length; sizes the MLP's first layer and, when
            equal to 32 without pretraining, triggers the small-image stem below.

    Returns:
        An ``nn.Module``.

    Raises:
        ValueError: if the model or dataset name is not supported.
    """
    if model_name not in ALLOWED_MODELS:
        raise ValueError(f"Unsupported model: {model_name}")

    num_classes = get_num_classes(dataset_name)

    if model_name != "mlp":
        net = timm.create_model(model_name, pretrained=pretrained, num_classes=num_classes)
        train_small_from_scratch = (not pretrained) and image_size == 32
        if train_small_from_scratch and model_name.startswith(("resnet", "resnest")):
            # Swap the stem for a 3x3 stride-1 convolution and drop the max-pool
            # so the 32x32 input is not downsampled right at the entrance.
            net.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
            net.maxpool = nn.Identity()
        return net

    channels = 1 if dataset_name == "MNIST" else 3
    flat_features = channels * image_size * image_size
    layers = [
        nn.Flatten(),
        nn.Linear(flat_features, 512),
        nn.ReLU(),
        nn.Dropout(0.2),
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Dropout(0.2),
        nn.Linear(256, num_classes),
    ]
    return nn.Sequential(*layers)


class SAM(torch.optim.Optimizer):
    """Sharpness-Aware Minimization wrapper around a base optimizer.

    One update needs two forward/backward passes:

    1. ``first_step`` moves every parameter that has a gradient to ``w + e``,
       where ``e = rho * g / ||g||`` (or the adaptive variant), and remembers
       ``e`` in ``self.state[p]["e"]``.
    2. ``second_step`` moves the parameters back to ``w`` and lets the base
       optimizer apply the gradient that was computed at ``w + e``.

    Args:
        params: parameters or parameter groups to optimise.
        base_optimizer: optimizer class (e.g. ``torch.optim.SGD``); it is
            instantiated on this optimizer's parameter groups.
        rho: radius of the perturbation; must be non-negative.
        adaptive: scale the perturbation element-wise by the weight magnitude.
        **kwargs: forwarded to ``base_optimizer`` and stored in the group defaults.
    """

    def __init__(self, params, base_optimizer, rho=0.05, adaptive=False, **kwargs):
        assert rho >= 0.0, f"Invalid rho, should be non-negative: {rho}"
        self.rho = rho
        self.adaptive = adaptive

        defaults = dict(rho=rho, adaptive=adaptive, **kwargs)
        super().__init__(params, defaults)

        # The base optimizer is built on the very same group dicts, so learning
        # rate changes made through either optimizer are seen by both.
        self.base_optimizer = base_optimizer(self.param_groups, **kwargs)

    @torch.no_grad()
    def first_step(self, zero_grad=False):
        """Perturb the weights towards the locally worst-case direction.

        Parameters without a gradient are left untouched. If no parameter has a
        gradient at all, this is a no-op instead of an error.
        """
        grad_norm = self._grad_norm()

        for group in self.param_groups:
            scale = group["rho"] / (grad_norm + 1e-12)

            for p in group["params"]:
                if p.grad is None:
                    continue
                weight = torch.pow(p, 2) if group["adaptive"] else 1.0
                # .to(p) keeps the arithmetic on the parameter's device and dtype.
                e = weight * p.grad * scale.to(p)
                p.add_(e)
                self.state[p]["e"] = e

        if zero_grad:
            self.zero_grad()

    @torch.no_grad()
    def second_step(self, zero_grad=False):
        """Undo the perturbation, then take the base optimizer step."""
        for group in self.param_groups:
            for p in group["params"]:
                param_state = self.state.get(p)
                if not param_state:
                    continue
                # Restore every perturbed parameter, whether or not it received
                # a gradient in the second pass; popping makes sure a stale
                # perturbation can never be subtracted twice.
                e = param_state.pop("e", None)
                if e is not None:
                    p.sub_(e)

        self.base_optimizer.step()

        if zero_grad:
            self.zero_grad()

    def zero_grad(self, set_to_none=True):
        """Clear gradients through the base optimizer (same signature as Optimizer.zero_grad)."""
        self.base_optimizer.zero_grad(set_to_none=set_to_none)

    def step(self, closure=None):
        """Always raises: SAM must be driven via first_step() and second_step()."""
        raise RuntimeError("Use first_step() and second_step() with SAM.")

    def _grad_norm(self):
        """Return the global L2 norm of the (optionally weight-scaled) gradients.

        Returns a zero scalar tensor when no parameter has a gradient.
        """
        with_grad = [
            (group, p)
            for group in self.param_groups
            for p in group["params"]
            if p.grad is not None
        ]
        if not with_grad:
            return torch.zeros(())

        # Gather the per-parameter norms on one device so that models spread
        # over several devices can still be stacked.
        shared_device = with_grad[0][1].device
        per_param = [
            # Adaptive SAM normalises by ||T_w g|| with T_w = |w|, while the
            # perturbation itself uses T_w^2 = w^2 (see first_step).
            ((torch.abs(p) if group["adaptive"] else 1.0) * p.grad).norm(p=2).to(shared_device)
            for group, p in with_grad
        ]
        return torch.norm(torch.stack(per_param), p=2)


def create_optimizer(optimizer_name, model_parameters, lr, weight_decay):
    """Build the optimizer selected by name.

    Args:
        optimizer_name: "SGD", "AdamW", "Adam", "SAM" or "Muon".
        model_parameters: parameters (or parameter groups) to optimise.
        lr: learning rate.
        weight_decay: weight-decay coefficient.

    Returns:
        The optimizer instance. SGD, and the SGD wrapped by SAM, use momentum 0.9.

    Raises:
        ValueError: if ``optimizer_name`` is not supported.
    """
    # Factories are lazy so only the requested optimizer class is ever touched.
    factories = (
        ("SGD", lambda: optim.SGD(model_parameters, lr=lr, momentum=0.9, weight_decay=weight_decay)),
        ("AdamW", lambda: optim.AdamW(model_parameters, lr=lr, weight_decay=weight_decay)),
        ("Adam", lambda: optim.Adam(model_parameters, lr=lr, weight_decay=weight_decay)),
        ("SAM", lambda: SAM(model_parameters, base_optimizer=optim.SGD, lr=lr, momentum=0.9,
                            weight_decay=weight_decay)),
        ("Muon", lambda: torch.optim.Muon(model_parameters, lr=lr, weight_decay=weight_decay)),
    )

    for name, build in factories:
        if optimizer_name == name:
            return build()

    raise ValueError(f"Unsupported optimizer: {optimizer_name}")


def create_scheduler(scheduler_name, optimizer, step_size=30):
    """Build the learning-rate scheduler selected by name.

    For a SAM optimizer the scheduler is attached to the wrapped base optimizer.

    Args:
        scheduler_name: "StepLR" or "ReduceLROnPlateau".
        optimizer: the optimizer whose learning rate is scheduled.
        step_size: StepLR period in epochs (ignored by ReduceLROnPlateau).

    Raises:
        ValueError: if ``scheduler_name`` is not supported.
    """
    if isinstance(optimizer, SAM):
        target = optimizer.base_optimizer
    else:
        target = optimizer

    if scheduler_name == "StepLR":
        # Multiply the learning rate by 0.1 every `step_size` epochs.
        return StepLR(target, step_size=step_size, gamma=0.1)
    if scheduler_name == "ReduceLROnPlateau":
        return ReduceLROnPlateau(target, patience=5)
    raise ValueError(f"Unsupported scheduler: {scheduler_name}")



def _restore_sam_weights(optimizer):
    """Undo the perturbation that SAM.first_step() applied to the weights.

    first_step() stores the applied offset in ``optimizer.state[p]["e"]``. The
    offset is popped so a stale value can never be subtracted a second time
    (e.g. after an iteration in which first_step() was skipped).
    """
    with torch.no_grad():
        for group in optimizer.param_groups:
            for p in group["params"]:
                param_state = optimizer.state.get(p)
                if not param_state:
                    continue
                e = param_state.pop("e", None)
                if e is not None:
                    p.sub_(e)


def train_epoch(model, train_loader, criterion, optimizer, scaler, device, use_amp, 
                mixup_cutmix_transform=None):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: network to train (switched to train mode).
        train_loader: iterable of ``(inputs, targets)`` batches.
        criterion: loss function called as ``criterion(outputs, targets)``.
        optimizer: a regular optimizer, or a ``SAM`` instance (two-pass update).
        scaler: ``torch.amp.GradScaler`` used for every backward/step.
        device: device the batches are moved to; also selects the autocast type.
        use_amp: enable autocast for the forward passes.
        mixup_cutmix_transform: optional callable ``(inputs, targets) ->
            (inputs, targets)`` applied to each batch on the device.

    Returns:
        ``(mean loss per batch, accuracy in percent)``. With mixed (2-D) targets
        the accuracy is measured against the arg-max of the target rows.
    """
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0
    
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        if mixup_cutmix_transform is not None:
            inputs, targets = mixup_cutmix_transform(inputs, targets)

        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            outputs = model(inputs)
            loss = criterion(outputs, targets)

        if isinstance(optimizer, SAM):
            # Pass 1: gradient at the current weights w.
            scaler.scale(loss).backward()
            # The perturbation is rho * g / ||g||, i.e. invariant to any uniform
            # rescaling of g, so the loss-scaled gradients can be used as they
            # are. Calling scaler.unscale_() here would make the unscale_() of
            # pass 2 raise, because GradScaler permits one unscale_() per
            # optimizer between update() calls.
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            if torch.isfinite(grad_norm):
                optimizer.first_step(zero_grad=True)
            else:
                # Overflowed gradients would turn the weights into NaN; skip the
                # perturbation and let the scaler deal with the overflow below.
                optimizer.zero_grad()

            # Pass 2: gradient at the perturbed weights w + e.
            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                outputs = model(inputs)
                loss = criterion(outputs, targets)

            scaler.scale(loss).backward()
            scaler.unscale_(optimizer.base_optimizer)
            # Move back to w before the update; otherwise every iteration would
            # permanently shift the weights by e.
            _restore_sam_weights(optimizer)
            scaler.step(optimizer.base_optimizer)
            scaler.update()
            optimizer.zero_grad()
        else: 
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update() 
            optimizer.zero_grad()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        if targets.dim() > 1:  # Mixed labels (one-hot/ soft labels)
            correct += predicted.eq(targets.argmax(1)).sum().item()
        else:  # Regular 
            correct += predicted.eq(targets).sum().item()

    return train_loss / len(train_loader), 100.0 * correct/total


def test(model, test_loader, criterion, device, use_amp):
    """Evaluate ``model`` on ``test_loader`` without tracking gradients.

    Args:
        model: network to evaluate (switched to eval mode).
        test_loader: iterable of ``(inputs, targets)`` batches.
        criterion: loss function called as ``criterion(outputs, targets)``.
        device: device the batches are moved to; also selects the autocast type.
        use_amp: enable autocast for the forward pass.

    Returns:
        ``(mean loss per batch, accuracy in percent)``.
    """
    model.eval()
    loss_sum, hits, seen = 0.0, 0, 0

    with torch.no_grad():
        for batch_inputs, batch_targets in test_loader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                logits = model(batch_inputs)
                batch_loss = criterion(logits, batch_targets)

            loss_sum += batch_loss.item()
            top1 = logits.max(1)[1]  # index of the highest-scoring class
            seen += batch_targets.size(0)
            hits += top1.eq(batch_targets).sum().item()

    mean_loss = loss_sum / len(test_loader)
    accuracy = 100.0 * hits / seen
    return mean_loss, accuracy


def _build_mixup_cutmix(config):
    """Return the batch-level MixUp/CutMix transform requested by ``config``, or None.

    When both are enabled, one of the two is picked at random for each batch.
    """
    use_mixup = config.get("use_mixup", False)
    use_cutmix = config.get("use_cutmix", False)
    if not (use_mixup or use_cutmix):
        return None

    num_classes = get_num_classes(config["dataset"])
    batch_transforms = []
    if use_mixup:
        mixup_alpha = config.get("mixup_alpha", 0.2)
        batch_transforms.append(v2.MixUp(alpha=mixup_alpha, num_classes=num_classes))
    if use_cutmix:
        cutmix_alpha = config.get("cutmix_alpha", 0.2)
        batch_transforms.append(v2.CutMix(alpha=cutmix_alpha, num_classes=num_classes))

    if len(batch_transforms) > 1:
        return v2.RandomChoice(batch_transforms)
    return batch_transforms[0]


def train_model(config):
    """Run one full training job described by ``config``.

    Required keys: "dataset", "model", "pretrained", "lr", "optimizer",
    "scheduler", "epochs", "weight_decay", "use_amp", "use_wandb".
    Optional keys (default): "label_smoothing" (0.0), "num_workers" (2),
    "pin_memory" (True), "use_batch_size_scheduler" (False), "use_heavy_aug"
    (False), "step_size" (30), "patience" (10), "use_mixup" / "use_cutmix"
    (False), "mixup_alpha" / "cutmix_alpha" (0.2).

    Side effects: writes TensorBoard logs under ``./logs``, saves the best
    checkpoint under ``./checkpoints``, prints progress and, when
    ``config["use_wandb"]`` is truthy, logs metrics to the active wandb run.

    Returns:
        dict with "best_acc", "total_time" (seconds), "final_train_acc" and
        "final_test_acc" (both 0.0 if no epoch was run).
    """
    label_smoothing = config.get("label_smoothing", 0.0)
    criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing)
    scaler = torch.amp.GradScaler(device.type, enabled=config["use_amp"])

    # Pretrained backbones are fed 224-pixel inputs, everything else 32-pixel.
    image_size = 224 if config["pretrained"] else 32
    initial_batch_size = batch_size_map[config["model"]]

    # Shared by the TensorBoard log directory and the checkpoint file name.
    run_name = (
        f'{config["optimizer"]}_{config["model"]}_bs{initial_batch_size}'
        f'_lr{config["lr"]}_sched{config["scheduler"]}'
    )

    # Read once, with the same defaults for the initial and the rebuilt loaders.
    num_workers = config.get("num_workers", 2)
    pin_memory = config.get("pin_memory", True)
    use_heavy_aug = config.get("use_heavy_aug", False)

    use_bs_scheduler = config.get("use_batch_size_scheduler", False)
    if use_bs_scheduler:
        # Start at half the mapped batch size and grow back up to it.
        bs_scheduler = BatchSizeScheduler(
            initial_batch_size=initial_batch_size // 2, 
            max_batch_size=initial_batch_size,
            step_size=30
        )
        batch_size = bs_scheduler.get_batch_size()
        print(f"Batch size scheduler enabled: {batch_size} → {initial_batch_size}")
    else:
        batch_size = initial_batch_size

    epochs = config["epochs"]
    patience = config.get("patience", 10)
    best_acc = 0.0
    best_acc_epoch = None
    counter = 0
    # Defined up front so the summary below is valid even when epochs == 0.
    train_acc = 0.0
    test_acc = 0.0

    writer = SummaryWriter(log_dir=f'./logs/{config["dataset"]}/{run_name}')
    try:
        train_loader, test_loader = get_data_loaders(
            config["dataset"], image_size, batch_size, num_workers, pin_memory,
            use_heavy_aug=use_heavy_aug
        )
        model = create_model(config["model"], config["dataset"], config["pretrained"], image_size)
        model.to(device)

        optimizer = create_optimizer(config["optimizer"], model.parameters(), config["lr"], config["weight_decay"])

        step_size = config.get("step_size", 30)
        scheduler = create_scheduler(config["scheduler"], optimizer, step_size=step_size)

        mixup_cutmix_transform = _build_mixup_cutmix(config)

        start_time = time.time()
        print("Start training")

        for epoch in range(epochs):
            epoch_start = time.time()

            # A changed batch size requires new loaders.
            if use_bs_scheduler and bs_scheduler.step():
                batch_size = bs_scheduler.get_batch_size()
                train_loader, test_loader = get_data_loaders(
                    config["dataset"], image_size, batch_size, num_workers, pin_memory,
                    use_heavy_aug=use_heavy_aug
                )

            train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, scaler, device, config["use_amp"],
                                               mixup_cutmix_transform=mixup_cutmix_transform)
            test_loss, test_acc = test(model, test_loader, criterion, device, config["use_amp"])

            if isinstance(scheduler, StepLR):
                scheduler.step()
            elif isinstance(scheduler, ReduceLROnPlateau):
                scheduler.step(test_loss)

            if isinstance(optimizer, SAM):
                current_lr = optimizer.base_optimizer.param_groups[0]['lr']
            else:
                current_lr = optimizer.param_groups[0]['lr']

            epoch_time = time.time() - epoch_start

            print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%, LR: {current_lr:.6f}, Time: {epoch_time:.2f}s")

            if config["use_wandb"]:
                wandb.log({
                    "epoch": epoch + 1,
                    "train_loss": train_loss,
                    "train_acc": train_acc,
                    "val_loss": test_loss,
                    "val_acc": test_acc,
                    "lr": current_lr,
                    "epoch_time": epoch_time,
                    "batch_size": batch_size
                })

            writer.add_scalar('Loss/Train', train_loss, epoch)
            writer.add_scalar('Loss/Test', test_loss, epoch)
            writer.add_scalar('Accuracy/Train', train_acc, epoch)
            writer.add_scalar('Accuracy/Test', test_acc, epoch)
            writer.add_scalar('Learning Rate', current_lr, epoch)
            if use_bs_scheduler:
                writer.add_scalar('Batch_Size', batch_size, epoch)

            if test_acc > best_acc:
                best_acc = test_acc
                os.makedirs('./checkpoints', exist_ok=True)

                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'best_acc': best_acc,
                    'config': config
                }, f'./checkpoints/{run_name}.pth')
                print(f"Best model saved with accuracy: {best_acc:.2f}%")

            # Early stopping: count the epochs that fall below the best test
            # accuracy seen so far; matching or beating it resets the counter.
            if best_acc_epoch is None:
                best_acc_epoch = test_acc
            elif test_acc < best_acc_epoch:
                counter += 1
                if counter >= patience:
                    print(f"Early stopping triggered at epoch {epoch+1}")
                    break
            else:
                best_acc_epoch = test_acc
                counter = 0

        total_time = time.time() - start_time
        print(f"Training completed in {total_time/60:.2f} minutes")
        print(f"Best Test Accuracy: {best_acc:.2f}%")
    finally:
        # Flush and release the TensorBoard writer even if training fails.
        writer.close()

    if config["use_wandb"]:
        wandb.log({
            "best_acc": best_acc,
            "total_time": total_time
        })

    return {
        'best_acc': best_acc,
        'total_time': total_time,
        'final_train_acc': train_acc,
        'final_test_acc': test_acc
    }


def create_sweep_config(pretrained=False):
    """Return the wandb grid-sweep configuration.

    Args:
        pretrained: when True, sweep fine-tuning of pretrained ResNet-18/50 with
            AdamW over two learning rates; otherwise describe a single
            from-scratch ResNet-50 run with SGD and the augmentation options on.

    Returns:
        A dict with "method", "metric" and "parameters" entries.
    """
    def fixed(value):
        # A parameter pinned to a single value.
        return {"value": value}

    def choices(*options):
        # A parameter the grid iterates over.
        return {"values": list(options)}

    if pretrained:
        parameters = {
            "dataset": fixed("CIFAR100"),
            "model": choices("resnet18", "resnet50"),
            "pretrained": fixed(True),
            "lr": choices(3e-4, 1e-4),
            "optimizer": fixed("AdamW"),
            "scheduler": fixed("StepLR"),
            "epochs": fixed(100),
            "weight_decay": fixed(1e-2),
            "num_workers": fixed(2),
            "pin_memory": fixed(True),
            "use_amp": fixed(True),
            "use_wandb": fixed(True),
            "use_batch_size_scheduler": fixed(False),
        }
    else:
        parameters = {
            "dataset": fixed("CIFAR100"),
            "model": choices("resnet50"),
            "pretrained": fixed(False),
            "lr": choices(0.1),
            "optimizer": choices("SGD"),
            "scheduler": fixed("StepLR"),
            "step_size": fixed(50),
            "epochs": fixed(200),
            "weight_decay": fixed(5e-4),
            "num_workers": fixed(2),
            "pin_memory": fixed(True),
            "use_amp": fixed(True),
            "use_wandb": fixed(True),
            "use_batch_size_scheduler": fixed(False),
            "use_heavy_aug": fixed(True),
            "use_mixup": fixed(True),
            "mixup_alpha": fixed(0.2),
            "use_cutmix": fixed(True),
            "cutmix_alpha": fixed(0.2),
            "label_smoothing": fixed(0.1),
            "patience": fixed(20),
        }

    return {
        "method": "grid",
        "metric": {"name": "val_acc", "goal": "maximize"},
        "parameters": parameters,
    }


def sweep_train():
    """Entry point for one sweep trial.

    Opens a wandb run in the "training_pipeline" project, trains with the
    hyper-parameters wandb assigned to this trial, and closes the run.
    """
    active_run = wandb.init(project="training_pipeline")
    trial_config = dict(wandb.config)
    train_model(trial_config)
    active_run.finish()
    

# Launch: register the from-scratch sweep in the "training_pipeline" project and
# start an agent in this process that runs sweep_train for each trial.
sweep_config = create_sweep_config(pretrained=False)
sweep_id = wandb.sweep(sweep_config, project="training_pipeline")
wandb.agent(sweep_id, function=sweep_train)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /teamspace/studios/this_studio/.netrc


True
2.9.1+cu126
Grad scaler is enabled: True
Device: cuda
Create sweep with ID: wkum883d
Sweep URL: https://wandb.ai/stefan-dorian-gavril-universitatea-alexandru-ioan-cuza-d/training_pipeline/sweeps/wkum883d


[34m[1mwandb[0m: Agent Starting Run: c5qttuhi with config:
[34m[1mwandb[0m: 	cutmix_alpha: 0.2
[34m[1mwandb[0m: 	dataset: CIFAR100
[34m[1mwandb[0m: 	epochs: 200
[34m[1mwandb[0m: 	label_smoothing: 0.1
[34m[1mwandb[0m: 	lr: 0.1
[34m[1mwandb[0m: 	mixup_alpha: 0.2
[34m[1mwandb[0m: 	model: resnet50
[34m[1mwandb[0m: 	num_workers: 2
[34m[1mwandb[0m: 	optimizer: SGD
[34m[1mwandb[0m: 	patience: 20
[34m[1mwandb[0m: 	pin_memory: True
[34m[1mwandb[0m: 	pretrained: False
[34m[1mwandb[0m: 	scheduler: StepLR
[34m[1mwandb[0m: 	step_size: 50
[34m[1mwandb[0m: 	use_amp: True
[34m[1mwandb[0m: 	use_batch_size_scheduler: False
[34m[1mwandb[0m: 	use_cutmix: True
[34m[1mwandb[0m: 	use_heavy_aug: True
[34m[1mwandb[0m: 	use_mixup: True
[34m[1mwandb[0m: 	use_wandb: True
[34m[1mwandb[0m: 	weight_decay: 0.0005


Start training
Epoch 1/200, Train Loss: 4.3629, Train Acc: 4.45%, Test Loss: 3.9985, Test Acc: 10.63%, LR: 0.100000, Time: 64.14s
Best model saved with accuracy: 10.63%
Epoch 2/200, Train Loss: 4.0911, Train Acc: 7.20%, Test Loss: 3.7444, Test Acc: 14.93%, LR: 0.100000, Time: 63.51s
Best model saved with accuracy: 14.93%
Epoch 3/200, Train Loss: 3.8897, Train Acc: 10.44%, Test Loss: 3.4718, Test Acc: 21.08%, LR: 0.100000, Time: 63.58s
Best model saved with accuracy: 21.08%
Epoch 4/200, Train Loss: 3.6674, Train Acc: 13.96%, Test Loss: 3.2511, Test Acc: 26.06%, LR: 0.100000, Time: 63.67s
Best model saved with accuracy: 26.06%
Epoch 5/200, Train Loss: 3.5237, Train Acc: 17.16%, Test Loss: 3.2395, Test Acc: 26.91%, LR: 0.100000, Time: 63.55s
Best model saved with accuracy: 26.91%
Epoch 6/200, Train Loss: 3.4358, Train Acc: 18.65%, Test Loss: 3.0435, Test Acc: 32.83%, LR: 0.100000, Time: 63.57s
Best model saved with accuracy: 32.83%
Epoch 7/200, Train Loss: 3.3761, Train Acc: 19.06%, Test 

0,1
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
best_acc,▁
epoch,▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▃▃▃▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇█████
epoch_time,█▇▄▃▂▃▃▂▂▃▂▂▃▃▃▂▂▂▂▂▄▂▂▃▂▃▆▆▄▄▂▂▂▂▁▁▂▃▅▅
lr,██████████▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
total_time,▁
train_acc,▁▁▁▂▂▂▂▂▂▂▂▂▄▄▄▅▅▅▅▅▅▅▆▇▇▇▇▇▇▇▇▇█▇▇▇▇█▇▇
train_loss,█▆▆▆▆▆▅▅▅▅▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▁▂▂▂▃▃▃▃▆▆▆▇▆▇▆▇▇▇▇████████████████████
val_loss,█▇▅▄▄▄▅▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
batch_size,64.0
best_acc,81.3
epoch,192.0
epoch_time,63.29941
lr,0.0001
total_time,12146.76687
train_acc,59.26
train_loss,1.61122
val_acc,80.71
val_loss,1.4235


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.
