In [1]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
from transformers import ViTFeatureExtractor

model_ckpt = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTFeatureExtractor.from_pretrained(model_ckpt)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
images_root = '../Affectnet/Manually_Annotated/Manually_Annotated_Images'

In [4]:
import pandas as pd
from torch.utils.data import Dataset
from PIL import Image
import os

def pil_loader(path):
    with open(path, 'rb') as f:
        img = Image.open(f)
        return img.convert('RGB')


class AffectNetDataset(Dataset):
    def __init__(self,
                 csv_file,
                 root,
                 mode='classification',
                 crop=False,
                 transform=None,
                 invalid_files=None):
        assert mode in ('valence', 'arousal', 'valence-arousal', 'classification')
        self.df = pd.read_csv(csv_file)
        self.root = root
        self.mode = mode
        self.crop = crop
        self.transform = transform
        self.invalid_files = invalid_files
        
        if self.invalid_files:
            self.df = self.df[~self.df['subDirectory_filePath'].isin(invalid_files)]
        
        self.df = self.df[~((self.df['expression'] == 9) | (self.df['expression'] == 10))].reset_index(drop=True)
    
    def __getitem__(self, idx):
        try:
            img = pil_loader(os.path.join(self.root, self.df['subDirectory_filePath'][idx]))
        except KeyError:
            raise IndexError
        if self.crop:
            img = img.crop((self.df['face_x'][idx],
                            self.df['face_y'][idx],
                            self.df['face_x'][idx]+self.df['face_width'][idx],
                            self.df['face_y'][idx]+self.df['face_height'][idx],))
        if self.transform:
            img = self.transform(img)
        if self.mode == 'classification':
            target = torch.tensor(self.df['expression'][idx])
        elif self.mode == 'valence':
            target = torch.tensor([self.df['valence'][idx]])
        elif self.mode == 'arousal':
            target = torch.tensor([self.df['arousal'][idx]])
        else:
            target = torch.tensor([self.df['valence'][idx],
                                   self.df['arousal'][idx]])
        return img.float(), target.float()

    def __len__(self):
        return len(self.df)

In [5]:
def collate_fn(examples):
    imgs, targets = zip(*examples)
    pixel_values = torch.stack(imgs)
    targets = torch.stack(targets)
    return {'pixel_values': pixel_values, 'labels': targets}

In [6]:
train_invalid_files = ['103/29a31ebf1567693f4644c8ba3476ca9a72ee07fe67a5860d98707a0a.jpg']
val_invalid_files = []

In [7]:
mode = 'valence-arousal'

In [8]:
from torchvision.transforms import (Compose,
                                    Normalize,
                                    Resize,
                                    ToTensor)

normalize = Normalize(mean=feature_extractor.image_mean,
                      std=feature_extractor.image_std)

transform = Compose([Resize(tuple(feature_extractor.size.values())),
                     ToTensor()])

train_dataset = AffectNetDataset('../Affectnet/training.csv',
                                 images_root,
                                 mode,
                                 transform=transform,
                                 invalid_files=train_invalid_files)
val_dataset = AffectNetDataset('../Affectnet/validation.csv',
                               images_root,
                               mode,
                               transform=transform,
                               invalid_files=val_invalid_files)

print('train:', len(train_dataset))
print('validation:', len(val_dataset))

train: 320739
validation: 4500


In [9]:
from transformers import Trainer
from KDEweightedMSE.losses import KDEWeightedMSESc

class CustomTrainer(Trainer):
    def __init__(self,
                 band_width = None,
                 model = None,
                 args = None,
                 data_collator = None,
                 train_dataset = None,
                 eval_dataset = None,
                 tokenizer = None,
                 model_init = None,
                 compute_metrics = None,
                 callbacks = None,
                 optimizers = (None, None),
                 preprocess_logits_for_metrics = None):
        super().__init__(model, args, data_collator, train_dataset, eval_dataset, tokenizer, model_init, compute_metrics, callbacks, optimizers, preprocess_logits_for_metrics)
        
        data = train_dataset.df[['valence', 'arousal']]
        self.loss_fct = KDEWeightedMSESc(data=data, band_width=band_width, device=self.args.device, mode='divide', standardize=False)

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get('labels')
        outputs = model(**inputs)
        logits = outputs.get('logits')
        loss = self.loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [10]:
from sklearn.metrics import mean_squared_error
import numpy as np

def compute_metrics(eval_pred):
    preds, targets = eval_pred
    mse = mean_squared_error(targets, preds, squared=True)
    rmse = mean_squared_error(targets, preds, squared=False)
    return {'mse': mse, 'rmse': rmse}

In [11]:
import optuna
import wandb
from transformers import ViTForImageClassification, EarlyStoppingCallback, TrainingArguments

def objective(trial: optuna.Trial):
    band_width = trial.suggest_float('band_width', low=0.01, high=0.5)
    print('-'*20)
    print('bw=', band_width)
    print('-'*20)
    
    model = ViTForImageClassification.from_pretrained(
        model_ckpt,
        num_labels=2,
        problem_type='regression'
    )
    args = TrainingArguments(
        f"nonstd-divide-bw={band_width}",
        save_strategy="epoch",
        evaluation_strategy="epoch",
        learning_rate=1e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=30,
        weight_decay=1e-3,
        load_best_model_at_end=True,
        logging_dir='logs',
        logging_strategy='steps',
        logging_steps=1000,
        remove_unused_columns=False,
    )
    trainer = CustomTrainer(
        band_width=band_width,
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=collate_fn,
        compute_metrics=compute_metrics,
        tokenizer=feature_extractor,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.0005)],
    )
    
    result = trainer.train()
    trainer.save_state()
    trainer.save_model()
    val_result = trainer.predict(val_dataset)
    print(val_result.metrics)
    return val_result.metrics['test_rmse']

In [12]:
n_trials = 10

In [13]:
study = optuna.create_study(study_name='nonstd-divide', direction='minimize')
study.optimize(func=objective, n_trials=n_trials)

[32m[I 2023-05-03 20:07:01,439][0m A new study created in memory with name: nonstd-divide[0m


--------------------
bw= 0.43870230743308963
--------------------


Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to detect the name of thi

  0%|          | 1000/300720 [12:52<61:28:49,  1.35it/s]

{'loss': 37.6362, 'learning_rate': 9.966746475126364e-06, 'epoch': 0.1}


  1%|          | 2000/300720 [25:47<64:37:18,  1.28it/s]

{'loss': 32.5391, 'learning_rate': 9.933492950252728e-06, 'epoch': 0.2}


  1%|          | 3000/300720 [38:43<66:01:58,  1.25it/s]

{'loss': 31.4947, 'learning_rate': 9.900239425379091e-06, 'epoch': 0.3}


  1%|▏         | 4000/300720 [51:40<63:21:04,  1.30it/s]

{'loss': 29.3652, 'learning_rate': 9.866985900505454e-06, 'epoch': 0.4}


  2%|▏         | 5000/300720 [1:04:36<62:29:20,  1.31it/s]

{'loss': 28.9396, 'learning_rate': 9.833732375631819e-06, 'epoch': 0.5}


  2%|▏         | 6000/300720 [1:17:31<62:00:33,  1.32it/s]

{'loss': 27.8862, 'learning_rate': 9.800478850758181e-06, 'epoch': 0.6}


  2%|▏         | 7000/300720 [1:30:26<63:24:36,  1.29it/s]

{'loss': 28.5446, 'learning_rate': 9.767225325884544e-06, 'epoch': 0.7}


  3%|▎         | 8000/300720 [1:43:21<62:25:13,  1.30it/s]

{'loss': 28.0238, 'learning_rate': 9.733971801010907e-06, 'epoch': 0.8}


  3%|▎         | 9000/300720 [1:56:16<61:31:41,  1.32it/s]

{'loss': 27.004, 'learning_rate': 9.70071827613727e-06, 'epoch': 0.9}


  3%|▎         | 10000/300720 [2:09:10<63:46:00,  1.27it/s]

{'loss': 27.5103, 'learning_rate': 9.667464751263635e-06, 'epoch': 1.0}


                                                           
  3%|▎         | 10024/300720 [2:10:37<53:23:32,  1.51it/s]

{'eval_loss': 53.941856384277344, 'eval_mse': 0.15116076171398163, 'eval_rmse': 0.38862133026123047, 'eval_runtime': 68.8542, 'eval_samples_per_second': 65.355, 'eval_steps_per_second': 2.048, 'epoch': 1.0}


  4%|▎         | 11000/300720 [2:23:00<61:03:31,  1.32it/s]  

{'loss': 25.713, 'learning_rate': 9.634211226389998e-06, 'epoch': 1.1}


  4%|▍         | 12000/300720 [2:35:40<61:56:55,  1.29it/s]

{'loss': 25.3824, 'learning_rate': 9.600957701516362e-06, 'epoch': 1.2}


  4%|▍         | 13000/300720 [2:48:21<60:51:05,  1.31it/s]

{'loss': 24.8425, 'learning_rate': 9.567704176642725e-06, 'epoch': 1.3}


  5%|▍         | 14000/300720 [3:01:04<59:31:37,  1.34it/s]

{'loss': 24.5381, 'learning_rate': 9.534450651769088e-06, 'epoch': 1.4}


  5%|▍         | 15000/300720 [3:13:46<62:37:32,  1.27it/s]

{'loss': 25.1342, 'learning_rate': 9.501197126895452e-06, 'epoch': 1.5}


  5%|▌         | 16000/300720 [3:26:31<59:06:15,  1.34it/s]

{'loss': 25.1406, 'learning_rate': 9.467943602021815e-06, 'epoch': 1.6}


  6%|▌         | 17000/300720 [3:39:15<59:05:05,  1.33it/s]

{'loss': 24.5323, 'learning_rate': 9.434690077148178e-06, 'epoch': 1.7}


  6%|▌         | 18000/300720 [3:52:02<61:16:26,  1.28it/s]

{'loss': 24.112, 'learning_rate': 9.401436552274543e-06, 'epoch': 1.8}


  6%|▋         | 19000/300720 [4:04:53<61:52:17,  1.26it/s]

{'loss': 25.2125, 'learning_rate': 9.368183027400906e-06, 'epoch': 1.9}


  7%|▋         | 20000/300720 [4:17:48<57:49:30,  1.35it/s]

{'loss': 24.7747, 'learning_rate': 9.334929502527269e-06, 'epoch': 2.0}


                                                           
  7%|▋         | 20048/300720 [4:19:38<50:42:12,  1.54it/s]

{'eval_loss': 53.04206085205078, 'eval_mse': 0.15132611989974976, 'eval_rmse': 0.38895219564437866, 'eval_runtime': 72.6237, 'eval_samples_per_second': 61.963, 'eval_steps_per_second': 1.942, 'epoch': 2.0}


  7%|▋         | 21000/300720 [4:31:45<57:53:11,  1.34it/s]  

{'loss': 21.3888, 'learning_rate': 9.301675977653633e-06, 'epoch': 2.09}


  7%|▋         | 22000/300720 [4:44:29<57:52:13,  1.34it/s]

{'loss': 21.6604, 'learning_rate': 9.268422452779996e-06, 'epoch': 2.19}


  8%|▊         | 23000/300720 [4:57:14<58:19:32,  1.32it/s]

{'loss': 21.0545, 'learning_rate': 9.235168927906359e-06, 'epoch': 2.29}


  8%|▊         | 24000/300720 [5:10:04<56:33:53,  1.36it/s]

{'loss': 21.2815, 'learning_rate': 9.201915403032722e-06, 'epoch': 2.39}


  8%|▊         | 25000/300720 [5:22:52<59:44:26,  1.28it/s]

{'loss': 22.6259, 'learning_rate': 9.168661878159085e-06, 'epoch': 2.49}


  9%|▊         | 26000/300720 [5:35:38<58:01:47,  1.32it/s]

{'loss': 20.9402, 'learning_rate': 9.13540835328545e-06, 'epoch': 2.59}


  9%|▉         | 27000/300720 [5:48:25<57:15:59,  1.33it/s]

{'loss': 22.1822, 'learning_rate': 9.102154828411812e-06, 'epoch': 2.69}


  9%|▉         | 28000/300720 [6:01:13<57:15:01,  1.32it/s]

{'loss': 21.6284, 'learning_rate': 9.068901303538175e-06, 'epoch': 2.79}


 10%|▉         | 29000/300720 [6:14:02<61:35:32,  1.23it/s]

{'loss': 21.867, 'learning_rate': 9.03564777866454e-06, 'epoch': 2.89}


 10%|▉         | 30000/300720 [6:26:55<58:37:45,  1.28it/s]

{'loss': 22.1533, 'learning_rate': 9.002394253790902e-06, 'epoch': 2.99}


                                                           
 10%|█         | 30072/300720 [6:29:03<50:43:12,  1.48it/s]

{'eval_loss': 55.586307525634766, 'eval_mse': 0.16179999709129333, 'eval_rmse': 0.40223556756973267, 'eval_runtime': 72.6267, 'eval_samples_per_second': 61.961, 'eval_steps_per_second': 1.941, 'epoch': 3.0}


 10%|█         | 31000/300720 [6:40:52<57:56:48,  1.29it/s]  

{'loss': 18.3795, 'learning_rate': 8.969140728917265e-06, 'epoch': 3.09}


 11%|█         | 32000/300720 [6:53:33<58:34:13,  1.27it/s]

{'loss': 17.6571, 'learning_rate': 8.93588720404363e-06, 'epoch': 3.19}


 11%|█         | 33000/300720 [7:06:16<55:50:36,  1.33it/s]

{'loss': 17.5966, 'learning_rate': 8.902633679169993e-06, 'epoch': 3.29}


 11%|█▏        | 34000/300720 [7:18:58<59:11:40,  1.25it/s]

{'loss': 18.2902, 'learning_rate': 8.869380154296357e-06, 'epoch': 3.39}


 12%|█▏        | 35000/300720 [7:31:41<56:03:33,  1.32it/s]

{'loss': 17.9136, 'learning_rate': 8.83612662942272e-06, 'epoch': 3.49}


 12%|█▏        | 36000/300720 [7:44:27<55:21:22,  1.33it/s]

{'loss': 18.0896, 'learning_rate': 8.802873104549083e-06, 'epoch': 3.59}


 12%|█▏        | 37000/300720 [7:57:18<56:10:52,  1.30it/s]

{'loss': 18.3976, 'learning_rate': 8.769619579675448e-06, 'epoch': 3.69}


 13%|█▎        | 38000/300720 [8:10:11<56:32:20,  1.29it/s]

{'loss': 18.0914, 'learning_rate': 8.736366054801809e-06, 'epoch': 3.79}


 13%|█▎        | 39000/300720 [8:23:05<56:18:32,  1.29it/s]

{'loss': 18.1686, 'learning_rate': 8.703112529928173e-06, 'epoch': 3.89}


 13%|█▎        | 40000/300720 [8:36:01<54:26:16,  1.33it/s]

{'loss': 17.8202, 'learning_rate': 8.669859005054536e-06, 'epoch': 3.99}


                                                           
 13%|█▎        | 40096/300720 [8:38:25<49:01:48,  1.48it/s]

{'eval_loss': 57.717533111572266, 'eval_mse': 0.166337251663208, 'eval_rmse': 0.4078369140625, 'eval_runtime': 68.8913, 'eval_samples_per_second': 65.32, 'eval_steps_per_second': 2.047, 'epoch': 4.0}


 13%|█▎        | 40096/300720 [8:38:26<56:09:54,  1.29it/s]


{'train_runtime': 31108.928, 'train_samples_per_second': 309.306, 'train_steps_per_second': 9.667, 'train_loss': 23.620907566591157, 'epoch': 4.0}


100%|██████████| 141/141 [01:08<00:00,  2.06it/s]
[32m[I 2023-05-04 04:46:41,939][0m Trial 0 finished with value: 0.38895219564437866 and parameters: {'band_width': 0.43870230743308963}. Best is trial 0 with value: 0.38895219564437866.[0m


{'test_loss': 53.04206085205078, 'test_mse': 0.15132611989974976, 'test_rmse': 0.38895219564437866, 'test_runtime': 68.788, 'test_samples_per_second': 65.418, 'test_steps_per_second': 2.05}
--------------------
bw= 0.3361686465382323
--------------------


Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 1000/300720 [12

{'loss': 36.2717, 'learning_rate': 9.966746475126364e-06, 'epoch': 0.1}


  1%|          | 2000/300720 [25:30<64:03:20,  1.30it/s]

{'loss': 31.2402, 'learning_rate': 9.933492950252728e-06, 'epoch': 0.2}


  1%|          | 3000/300720 [38:15<63:24:11,  1.30it/s]

{'loss': 30.0947, 'learning_rate': 9.900239425379091e-06, 'epoch': 0.3}


  1%|▏         | 4000/300720 [51:01<63:13:04,  1.30it/s]

{'loss': 27.9569, 'learning_rate': 9.866985900505454e-06, 'epoch': 0.4}


  2%|▏         | 5000/300720 [1:03:46<62:07:05,  1.32it/s]

{'loss': 27.7709, 'learning_rate': 9.833732375631819e-06, 'epoch': 0.5}


  2%|▏         | 6000/300720 [1:16:31<60:32:48,  1.35it/s]

{'loss': 26.6412, 'learning_rate': 9.800478850758181e-06, 'epoch': 0.6}


  2%|▏         | 7000/300720 [1:29:21<63:08:25,  1.29it/s]

{'loss': 27.3243, 'learning_rate': 9.767225325884544e-06, 'epoch': 0.7}


  3%|▎         | 8000/300720 [1:42:13<62:39:43,  1.30it/s]

{'loss': 26.638, 'learning_rate': 9.733971801010907e-06, 'epoch': 0.8}


  3%|▎         | 9000/300720 [1:55:06<60:25:44,  1.34it/s]

{'loss': 25.5899, 'learning_rate': 9.70071827613727e-06, 'epoch': 0.9}


  3%|▎         | 10000/300720 [2:08:00<63:25:54,  1.27it/s]

{'loss': 26.3053, 'learning_rate': 9.667464751263635e-06, 'epoch': 1.0}


                                                           
  3%|▎         | 10024/300720 [2:09:29<52:45:19,  1.53it/s]

{'eval_loss': 50.9239387512207, 'eval_mse': 0.15348197519779205, 'eval_rmse': 0.3916822671890259, 'eval_runtime': 70.2034, 'eval_samples_per_second': 64.099, 'eval_steps_per_second': 2.008, 'epoch': 1.0}


  4%|▎         | 11000/300720 [2:21:54<61:23:32,  1.31it/s]  

{'loss': 24.692, 'learning_rate': 9.634211226389998e-06, 'epoch': 1.1}


  4%|▍         | 12000/300720 [2:34:37<62:02:55,  1.29it/s]

{'loss': 24.1806, 'learning_rate': 9.600957701516362e-06, 'epoch': 1.2}


  4%|▍         | 13000/300720 [2:47:21<60:26:11,  1.32it/s]

{'loss': 23.8244, 'learning_rate': 9.567704176642725e-06, 'epoch': 1.3}


  5%|▍         | 14000/300720 [3:00:07<60:29:24,  1.32it/s]

{'loss': 23.4858, 'learning_rate': 9.534450651769088e-06, 'epoch': 1.4}


  5%|▍         | 15000/300720 [3:12:52<62:31:06,  1.27it/s]

{'loss': 23.8517, 'learning_rate': 9.501197126895452e-06, 'epoch': 1.5}


  5%|▌         | 16000/300720 [3:25:39<58:55:44,  1.34it/s]

{'loss': 23.8757, 'learning_rate': 9.467943602021815e-06, 'epoch': 1.6}


  6%|▌         | 17000/300720 [3:38:28<59:21:30,  1.33it/s]

{'loss': 23.4421, 'learning_rate': 9.434690077148178e-06, 'epoch': 1.7}


  6%|▌         | 18000/300720 [3:51:19<61:05:43,  1.29it/s]

{'loss': 23.2187, 'learning_rate': 9.401436552274543e-06, 'epoch': 1.8}


  6%|▋         | 19000/300720 [4:04:12<61:22:17,  1.28it/s]

{'loss': 24.2297, 'learning_rate': 9.368183027400906e-06, 'epoch': 1.9}


  7%|▋         | 20000/300720 [4:17:08<58:45:08,  1.33it/s]

{'loss': 23.6131, 'learning_rate': 9.334929502527269e-06, 'epoch': 2.0}


                                                           
  7%|▋         | 20048/300720 [4:18:56<50:38:07,  1.54it/s]

{'eval_loss': 49.934326171875, 'eval_mse': 0.1517716646194458, 'eval_rmse': 0.38955289125442505, 'eval_runtime': 71.3261, 'eval_samples_per_second': 63.091, 'eval_steps_per_second': 1.977, 'epoch': 2.0}


  7%|▋         | 21000/300720 [4:31:04<57:47:08,  1.34it/s]  

{'loss': 20.6355, 'learning_rate': 9.301675977653633e-06, 'epoch': 2.09}


  7%|▋         | 22000/300720 [4:43:50<58:07:01,  1.33it/s]

{'loss': 20.9196, 'learning_rate': 9.268422452779996e-06, 'epoch': 2.19}


  8%|▊         | 23000/300720 [4:56:34<58:41:19,  1.31it/s]

{'loss': 20.2952, 'learning_rate': 9.235168927906359e-06, 'epoch': 2.29}


  8%|▊         | 24000/300720 [5:09:21<56:49:55,  1.35it/s]

{'loss': 20.4116, 'learning_rate': 9.201915403032722e-06, 'epoch': 2.39}


  8%|▊         | 25000/300720 [5:22:09<60:11:20,  1.27it/s]

{'loss': 21.7666, 'learning_rate': 9.168661878159085e-06, 'epoch': 2.49}


  9%|▊         | 26000/300720 [5:34:55<58:07:21,  1.31it/s]

{'loss': 20.1562, 'learning_rate': 9.13540835328545e-06, 'epoch': 2.59}


  9%|▉         | 27000/300720 [5:47:45<57:06:42,  1.33it/s]

{'loss': 21.1367, 'learning_rate': 9.102154828411812e-06, 'epoch': 2.69}


  9%|▉         | 28000/300720 [6:00:35<57:57:18,  1.31it/s]

{'loss': 20.8308, 'learning_rate': 9.068901303538175e-06, 'epoch': 2.79}


 10%|▉         | 29000/300720 [6:13:29<60:43:20,  1.24it/s]

{'loss': 20.8765, 'learning_rate': 9.03564777866454e-06, 'epoch': 2.89}


 10%|▉         | 30000/300720 [6:26:23<59:18:41,  1.27it/s]

{'loss': 21.2931, 'learning_rate': 9.002394253790902e-06, 'epoch': 2.99}


                                                           
 10%|█         | 30072/300720 [6:28:31<50:32:52,  1.49it/s]

{'eval_loss': 52.43470764160156, 'eval_mse': 0.16176244616508484, 'eval_rmse': 0.4021814465522766, 'eval_runtime': 71.6525, 'eval_samples_per_second': 62.803, 'eval_steps_per_second': 1.968, 'epoch': 3.0}


 10%|█         | 31000/300720 [6:40:22<58:16:06,  1.29it/s]  

{'loss': 17.9836, 'learning_rate': 8.969140728917265e-06, 'epoch': 3.09}


 11%|█         | 32000/300720 [6:53:04<58:38:06,  1.27it/s]

{'loss': 16.9804, 'learning_rate': 8.93588720404363e-06, 'epoch': 3.19}


 11%|█         | 33000/300720 [7:05:50<56:39:24,  1.31it/s]

{'loss': 17.1259, 'learning_rate': 8.902633679169993e-06, 'epoch': 3.29}


 11%|█▏        | 34000/300720 [7:18:33<58:55:01,  1.26it/s]

{'loss': 17.6942, 'learning_rate': 8.869380154296357e-06, 'epoch': 3.39}


 12%|█▏        | 35000/300720 [7:31:13<55:52:30,  1.32it/s]

{'loss': 17.3611, 'learning_rate': 8.83612662942272e-06, 'epoch': 3.49}


 12%|█▏        | 36000/300720 [7:43:55<55:22:42,  1.33it/s]

{'loss': 17.529, 'learning_rate': 8.802873104549083e-06, 'epoch': 3.59}


 12%|█▏        | 37000/300720 [7:56:39<55:22:24,  1.32it/s]

{'loss': 17.5814, 'learning_rate': 8.769619579675448e-06, 'epoch': 3.69}


 13%|█▎        | 38000/300720 [8:09:24<56:07:55,  1.30it/s]

{'loss': 17.456, 'learning_rate': 8.736366054801809e-06, 'epoch': 3.79}


 13%|█▎        | 39000/300720 [8:22:14<55:51:09,  1.30it/s]

{'loss': 17.3698, 'learning_rate': 8.703112529928173e-06, 'epoch': 3.89}


 13%|█▎        | 40000/300720 [8:35:09<54:44:22,  1.32it/s]

{'loss': 17.2443, 'learning_rate': 8.669859005054536e-06, 'epoch': 3.99}


                                                           
 13%|█▎        | 40096/300720 [8:37:36<48:58:57,  1.48it/s]

{'eval_loss': 52.65519332885742, 'eval_mse': 0.16297416388988495, 'eval_rmse': 0.4036777913570404, 'eval_runtime': 71.6338, 'eval_samples_per_second': 62.82, 'eval_steps_per_second': 1.968, 'epoch': 4.0}


 13%|█▎        | 40096/300720 [8:37:37<56:04:35,  1.29it/s]


{'train_runtime': 31057.7882, 'train_samples_per_second': 309.815, 'train_steps_per_second': 9.683, 'train_loss': 22.65717652390123, 'epoch': 4.0}


100%|██████████| 141/141 [01:08<00:00,  2.07it/s]
[32m[I 2023-05-04 13:25:30,850][0m Trial 1 finished with value: 0.38955289125442505 and parameters: {'band_width': 0.3361686465382323}. Best is trial 0 with value: 0.38895219564437866.[0m


{'test_loss': 49.934326171875, 'test_mse': 0.1517716646194458, 'test_rmse': 0.38955289125442505, 'test_runtime': 68.6053, 'test_samples_per_second': 65.593, 'test_steps_per_second': 2.055}
--------------------
bw= 0.3970538793787303
--------------------


Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 1000/300720 [12

{'loss': 37.6171, 'learning_rate': 9.966746475126364e-06, 'epoch': 0.1}


  1%|          | 2000/300720 [25:33<64:09:45,  1.29it/s]

{'loss': 32.2363, 'learning_rate': 9.933492950252728e-06, 'epoch': 0.2}


  1%|          | 3000/300720 [38:20<63:39:37,  1.30it/s]

{'loss': 31.1318, 'learning_rate': 9.900239425379091e-06, 'epoch': 0.3}


  1%|▏         | 4000/300720 [51:07<62:16:38,  1.32it/s]

{'loss': 28.9168, 'learning_rate': 9.866985900505454e-06, 'epoch': 0.4}


  2%|▏         | 5000/300720 [1:03:54<61:59:26,  1.33it/s]

{'loss': 28.6688, 'learning_rate': 9.833732375631819e-06, 'epoch': 0.5}


  2%|▏         | 6000/300720 [1:16:41<61:21:11,  1.33it/s]

{'loss': 27.3919, 'learning_rate': 9.800478850758181e-06, 'epoch': 0.6}


  2%|▏         | 7000/300720 [1:29:32<63:31:46,  1.28it/s]

{'loss': 28.1131, 'learning_rate': 9.767225325884544e-06, 'epoch': 0.7}


  3%|▎         | 8000/300720 [1:42:24<62:45:05,  1.30it/s]

{'loss': 27.5211, 'learning_rate': 9.733971801010907e-06, 'epoch': 0.8}


  3%|▎         | 9000/300720 [1:55:19<60:46:31,  1.33it/s]

{'loss': 26.4832, 'learning_rate': 9.70071827613727e-06, 'epoch': 0.9}


  3%|▎         | 10000/300720 [2:08:14<62:49:33,  1.29it/s]

{'loss': 26.9886, 'learning_rate': 9.667464751263635e-06, 'epoch': 1.0}


  3%|▎         | 10024/300720 [2:08:32<53:00:09,  1.52it/s]
  3%|▎         | 10024/300720 [2:09:42<53:00:09,  1.52it/s]

{'eval_loss': 52.371002197265625, 'eval_mse': 0.15125057101249695, 'eval_rmse': 0.388793408870697, 'eval_runtime': 70.395, 'eval_samples_per_second': 63.925, 'eval_steps_per_second': 2.003, 'epoch': 1.0}


  4%|▎         | 11000/300720 [2:22:10<61:20:38,  1.31it/s]  

{'loss': 25.3348, 'learning_rate': 9.634211226389998e-06, 'epoch': 1.1}


  4%|▍         | 12000/300720 [2:34:55<62:23:16,  1.29it/s]

{'loss': 25.0192, 'learning_rate': 9.600957701516362e-06, 'epoch': 1.2}


  4%|▍         | 13000/300720 [2:47:41<60:37:55,  1.32it/s]

{'loss': 24.4393, 'learning_rate': 9.567704176642725e-06, 'epoch': 1.3}


  5%|▍         | 14000/300720 [3:00:29<60:12:56,  1.32it/s]

{'loss': 24.2265, 'learning_rate': 9.534450651769088e-06, 'epoch': 1.4}


  5%|▍         | 15000/300720 [3:13:15<63:18:46,  1.25it/s]

{'loss': 24.644, 'learning_rate': 9.501197126895452e-06, 'epoch': 1.5}


  5%|▌         | 16000/300720 [3:26:03<59:06:49,  1.34it/s]

{'loss': 24.6143, 'learning_rate': 9.467943602021815e-06, 'epoch': 1.6}


  6%|▌         | 17000/300720 [3:38:54<59:55:54,  1.32it/s]

{'loss': 24.2114, 'learning_rate': 9.434690077148178e-06, 'epoch': 1.7}


  6%|▌         | 18000/300720 [3:51:47<60:52:54,  1.29it/s]

{'loss': 23.8836, 'learning_rate': 9.401436552274543e-06, 'epoch': 1.8}


  6%|▋         | 19000/300720 [4:04:41<61:50:37,  1.27it/s]

{'loss': 24.9647, 'learning_rate': 9.368183027400906e-06, 'epoch': 1.9}


  7%|▋         | 20000/300720 [4:17:38<57:56:53,  1.35it/s]

{'loss': 24.4519, 'learning_rate': 9.334929502527269e-06, 'epoch': 2.0}


  7%|▋         | 20048/300720 [4:18:15<51:06:01,  1.53it/s]
  7%|▋         | 20048/300720 [4:19:27<51:06:01,  1.53it/s]

{'eval_loss': 52.18012619018555, 'eval_mse': 0.15158593654632568, 'eval_rmse': 0.38933631777763367, 'eval_runtime': 72.1138, 'eval_samples_per_second': 62.401, 'eval_steps_per_second': 1.955, 'epoch': 2.0}


  7%|▋         | 21000/300720 [4:31:33<57:53:26,  1.34it/s]  

{'loss': 21.1839, 'learning_rate': 9.301675977653633e-06, 'epoch': 2.09}


  7%|▋         | 22000/300720 [4:44:17<57:46:40,  1.34it/s]

{'loss': 21.4192, 'learning_rate': 9.268422452779996e-06, 'epoch': 2.19}


  8%|▊         | 23000/300720 [4:57:00<58:42:53,  1.31it/s]

{'loss': 20.7959, 'learning_rate': 9.235168927906359e-06, 'epoch': 2.29}


  8%|▊         | 24000/300720 [5:09:46<56:42:59,  1.36it/s]

{'loss': 20.9733, 'learning_rate': 9.201915403032722e-06, 'epoch': 2.39}


  8%|▊         | 25000/300720 [5:22:33<59:00:30,  1.30it/s]

{'loss': 22.4309, 'learning_rate': 9.168661878159085e-06, 'epoch': 2.49}


  9%|▊         | 26000/300720 [5:35:18<58:04:56,  1.31it/s]

{'loss': 20.6738, 'learning_rate': 9.13540835328545e-06, 'epoch': 2.59}


  9%|▉         | 27000/300720 [5:48:09<57:44:46,  1.32it/s]

{'loss': 21.6938, 'learning_rate': 9.102154828411812e-06, 'epoch': 2.69}


  9%|▉         | 28000/300720 [6:00:58<57:42:12,  1.31it/s]

{'loss': 21.4486, 'learning_rate': 9.068901303538175e-06, 'epoch': 2.79}


 10%|▉         | 29000/300720 [6:13:52<60:32:23,  1.25it/s]

{'loss': 21.6542, 'learning_rate': 9.03564777866454e-06, 'epoch': 2.89}


 10%|▉         | 30000/300720 [6:26:46<59:11:04,  1.27it/s]

{'loss': 21.9124, 'learning_rate': 9.002394253790902e-06, 'epoch': 2.99}


 10%|█         | 30072/300720 [6:27:42<50:43:43,  1.48it/s]
 10%|█         | 30072/300720 [6:28:54<50:43:43,  1.48it/s]

{'eval_loss': 54.39723587036133, 'eval_mse': 0.16134241223335266, 'eval_rmse': 0.4016731083393097, 'eval_runtime': 72.0183, 'eval_samples_per_second': 62.484, 'eval_steps_per_second': 1.958, 'epoch': 3.0}


 10%|█         | 31000/300720 [6:40:48<59:00:24,  1.27it/s]  

{'loss': 18.3998, 'learning_rate': 8.969140728917265e-06, 'epoch': 3.09}


 11%|█         | 32000/300720 [6:53:31<59:03:42,  1.26it/s]

{'loss': 17.5221, 'learning_rate': 8.93588720404363e-06, 'epoch': 3.19}


 11%|█         | 33000/300720 [7:06:18<56:29:22,  1.32it/s]

{'loss': 17.6102, 'learning_rate': 8.902633679169993e-06, 'epoch': 3.29}


 11%|█▏        | 34000/300720 [7:19:03<58:54:16,  1.26it/s]

{'loss': 18.1403, 'learning_rate': 8.869380154296357e-06, 'epoch': 3.39}


 12%|█▏        | 35000/300720 [7:31:49<55:51:11,  1.32it/s]

{'loss': 17.7948, 'learning_rate': 8.83612662942272e-06, 'epoch': 3.49}


 12%|█▏        | 36000/300720 [7:44:36<55:27:59,  1.33it/s]

{'loss': 17.8504, 'learning_rate': 8.802873104549083e-06, 'epoch': 3.59}


 12%|█▏        | 37000/300720 [7:57:26<56:51:10,  1.29it/s]

{'loss': 18.1596, 'learning_rate': 8.769619579675448e-06, 'epoch': 3.69}


 13%|█▎        | 38000/300720 [8:10:17<55:56:40,  1.30it/s]

{'loss': 17.9744, 'learning_rate': 8.736366054801809e-06, 'epoch': 3.79}


 13%|█▎        | 39000/300720 [8:23:09<56:36:47,  1.28it/s]

{'loss': 17.8924, 'learning_rate': 8.703112529928173e-06, 'epoch': 3.89}


 13%|█▎        | 40000/300720 [8:36:06<54:12:15,  1.34it/s]

{'loss': 17.7186, 'learning_rate': 8.669859005054536e-06, 'epoch': 3.99}


 13%|█▎        | 40096/300720 [8:37:21<48:50:23,  1.48it/s]
 13%|█▎        | 40096/300720 [8:38:33<48:50:23,  1.48it/s]

{'eval_loss': 55.67026901245117, 'eval_mse': 0.16443559527397156, 'eval_rmse': 0.4054912328720093, 'eval_runtime': 72.1626, 'eval_samples_per_second': 62.359, 'eval_steps_per_second': 1.954, 'epoch': 4.0}


 13%|█▎        | 40096/300720 [8:38:35<56:10:47,  1.29it/s]


{'train_runtime': 31115.029, 'train_samples_per_second': 309.245, 'train_steps_per_second': 9.665, 'train_loss': 23.337310121236378, 'epoch': 4.0}


100%|██████████| 141/141 [01:08<00:00,  2.07it/s]
[32m[I 2023-05-04 22:05:16,844][0m Trial 2 finished with value: 0.38933631777763367 and parameters: {'band_width': 0.3970538793787303}. Best is trial 0 with value: 0.38895219564437866.[0m


{'test_loss': 52.18012619018555, 'test_mse': 0.15158593654632568, 'test_rmse': 0.38933631777763367, 'test_runtime': 68.5796, 'test_samples_per_second': 65.617, 'test_steps_per_second': 2.056}
--------------------
bw= 0.47220745499961636
--------------------


Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 1000/300720 [12

{'loss': 39.1858, 'learning_rate': 9.966746475126364e-06, 'epoch': 0.1}


  1%|          | 2000/300720 [25:31<64:13:25,  1.29it/s]

{'loss': 33.298, 'learning_rate': 9.933492950252728e-06, 'epoch': 0.2}


  1%|          | 3000/300720 [38:16<64:03:46,  1.29it/s]

{'loss': 32.032, 'learning_rate': 9.900239425379091e-06, 'epoch': 0.3}


  1%|▏         | 4000/300720 [51:03<62:22:48,  1.32it/s]

{'loss': 29.9031, 'learning_rate': 9.866985900505454e-06, 'epoch': 0.4}


  1%|▏         | 4394/300720 [56:07<61:31:04,  1.34it/s]wandb: Network error (ConnectTimeout), entering retry loop.
  2%|▏         | 5000/300720 [1:03:52<61:34:53,  1.33it/s]

{'loss': 29.5055, 'learning_rate': 9.833732375631819e-06, 'epoch': 0.5}


  2%|▏         | 6000/300720 [1:16:40<61:46:55,  1.33it/s]

{'loss': 28.2637, 'learning_rate': 9.800478850758181e-06, 'epoch': 0.6}


  2%|▏         | 7000/300720 [1:29:32<63:23:25,  1.29it/s]

{'loss': 28.9193, 'learning_rate': 9.767225325884544e-06, 'epoch': 0.7}


  3%|▎         | 8000/300720 [1:42:26<62:29:39,  1.30it/s]

{'loss': 28.3536, 'learning_rate': 9.733971801010907e-06, 'epoch': 0.8}


  3%|▎         | 9000/300720 [1:55:22<61:31:40,  1.32it/s]

{'loss': 27.4053, 'learning_rate': 9.70071827613727e-06, 'epoch': 0.9}


  3%|▎         | 10000/300720 [2:08:17<62:52:51,  1.28it/s]

{'loss': 27.808, 'learning_rate': 9.667464751263635e-06, 'epoch': 1.0}


  3%|▎         | 10024/300720 [2:08:35<53:13:46,  1.52it/s]
  3%|▎         | 10024/300720 [2:09:47<53:13:46,  1.52it/s]

{'eval_loss': 54.44356918334961, 'eval_mse': 0.15015056729316711, 'eval_rmse': 0.38740551471710205, 'eval_runtime': 71.3935, 'eval_samples_per_second': 63.031, 'eval_steps_per_second': 1.975, 'epoch': 1.0}


  4%|▎         | 11000/300720 [2:22:14<61:29:19,  1.31it/s]  

{'loss': 26.1309, 'learning_rate': 9.634211226389998e-06, 'epoch': 1.1}


  4%|▍         | 12000/300720 [2:34:59<61:50:39,  1.30it/s]

{'loss': 25.7498, 'learning_rate': 9.600957701516362e-06, 'epoch': 1.2}


  4%|▍         | 13000/300720 [2:47:45<60:18:39,  1.33it/s]

{'loss': 25.2267, 'learning_rate': 9.567704176642725e-06, 'epoch': 1.3}


  5%|▍         | 14000/300720 [3:00:34<61:07:45,  1.30it/s]

{'loss': 24.938, 'learning_rate': 9.534450651769088e-06, 'epoch': 1.4}


  5%|▍         | 15000/300720 [3:13:19<63:16:17,  1.25it/s]

{'loss': 25.3493, 'learning_rate': 9.501197126895452e-06, 'epoch': 1.5}


  5%|▌         | 16000/300720 [3:26:07<59:16:50,  1.33it/s]

{'loss': 25.4752, 'learning_rate': 9.467943602021815e-06, 'epoch': 1.6}


  6%|▌         | 17000/300720 [3:38:57<59:27:59,  1.33it/s]

{'loss': 24.9843, 'learning_rate': 9.434690077148178e-06, 'epoch': 1.7}


  6%|▌         | 18000/300720 [3:51:49<61:08:32,  1.28it/s]

{'loss': 24.5781, 'learning_rate': 9.401436552274543e-06, 'epoch': 1.8}


  6%|▋         | 19000/300720 [4:04:43<61:47:57,  1.27it/s]

{'loss': 25.6925, 'learning_rate': 9.368183027400906e-06, 'epoch': 1.9}


  7%|▋         | 20000/300720 [4:17:41<59:02:17,  1.32it/s]

{'loss': 25.2771, 'learning_rate': 9.334929502527269e-06, 'epoch': 2.0}


  7%|▋         | 20048/300720 [4:18:18<51:01:36,  1.53it/s]
  7%|▋         | 20048/300720 [4:19:30<51:01:36,  1.53it/s]

{'eval_loss': 54.47431182861328, 'eval_mse': 0.1515289843082428, 'eval_rmse': 0.38923683762550354, 'eval_runtime': 72.0512, 'eval_samples_per_second': 62.456, 'eval_steps_per_second': 1.957, 'epoch': 2.0}


  7%|▋         | 21000/300720 [4:31:40<57:51:28,  1.34it/s]  

{'loss': 21.8208, 'learning_rate': 9.301675977653633e-06, 'epoch': 2.09}


  7%|▋         | 22000/300720 [4:44:28<58:21:11,  1.33it/s]

{'loss': 22.2068, 'learning_rate': 9.268422452779996e-06, 'epoch': 2.19}


  8%|▊         | 23000/300720 [4:57:14<58:46:42,  1.31it/s]

{'loss': 21.5086, 'learning_rate': 9.235168927906359e-06, 'epoch': 2.29}


  8%|▊         | 24000/300720 [5:10:01<56:18:53,  1.36it/s]

{'loss': 21.7116, 'learning_rate': 9.201915403032722e-06, 'epoch': 2.39}


  8%|▊         | 25000/300720 [5:22:49<59:03:45,  1.30it/s]

{'loss': 23.0261, 'learning_rate': 9.168661878159085e-06, 'epoch': 2.49}


  9%|▊         | 26000/300720 [5:35:35<58:06:24,  1.31it/s]

{'loss': 21.3445, 'learning_rate': 9.13540835328545e-06, 'epoch': 2.59}


  9%|▉         | 27000/300720 [5:48:27<58:22:17,  1.30it/s]

{'loss': 22.427, 'learning_rate': 9.102154828411812e-06, 'epoch': 2.69}


  9%|▉         | 28000/300720 [6:01:18<58:37:03,  1.29it/s]

{'loss': 22.0883, 'learning_rate': 9.068901303538175e-06, 'epoch': 2.79}


 10%|▉         | 29000/300720 [6:14:12<60:58:04,  1.24it/s]

{'loss': 22.392, 'learning_rate': 9.03564777866454e-06, 'epoch': 2.89}


 10%|▉         | 30000/300720 [6:27:07<59:02:31,  1.27it/s]

{'loss': 22.6734, 'learning_rate': 9.002394253790902e-06, 'epoch': 2.99}


 10%|█         | 30072/300720 [6:28:03<51:01:59,  1.47it/s]
 10%|█         | 30072/300720 [6:29:14<51:01:59,  1.47it/s]

{'eval_loss': 56.950645446777344, 'eval_mse': 0.16156449913978577, 'eval_rmse': 0.40194398164749146, 'eval_runtime': 71.7533, 'eval_samples_per_second': 62.715, 'eval_steps_per_second': 1.965, 'epoch': 3.0}


 10%|█         | 30072/300720 [6:29:16<58:23:28,  1.29it/s]


{'train_runtime': 23356.4651, 'train_samples_per_second': 411.97, 'train_steps_per_second': 12.875, 'train_loss': 25.971493285905492, 'epoch': 3.0}


100%|██████████| 141/141 [01:07<00:00,  2.07it/s]
[32m[I 2023-05-05 04:35:43,759][0m Trial 3 finished with value: 0.38740551471710205 and parameters: {'band_width': 0.47220745499961636}. Best is trial 3 with value: 0.38740551471710205.[0m


{'test_loss': 54.44356918334961, 'test_mse': 0.15015056729316711, 'test_rmse': 0.38740551471710205, 'test_runtime': 68.4721, 'test_samples_per_second': 65.72, 'test_steps_per_second': 2.059}
--------------------
bw= 0.37551314244462536
--------------------


Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 1000/300720 [12

{'loss': 36.1354, 'learning_rate': 9.966746475126364e-06, 'epoch': 0.1}


  1%|          | 2000/300720 [25:32<63:58:54,  1.30it/s]

{'loss': 31.5653, 'learning_rate': 9.933492950252728e-06, 'epoch': 0.2}


  1%|          | 3000/300720 [38:17<64:14:14,  1.29it/s]

{'loss': 30.7509, 'learning_rate': 9.900239425379091e-06, 'epoch': 0.3}


  1%|▏         | 4000/300720 [51:07<62:23:15,  1.32it/s]

{'loss': 28.3988, 'learning_rate': 9.866985900505454e-06, 'epoch': 0.4}


  2%|▏         | 5000/300720 [1:03:57<62:22:31,  1.32it/s]

{'loss': 28.1374, 'learning_rate': 9.833732375631819e-06, 'epoch': 0.5}


  2%|▏         | 6000/300720 [1:16:46<61:36:11,  1.33it/s]

{'loss': 27.2669, 'learning_rate': 9.800478850758181e-06, 'epoch': 0.6}


  2%|▏         | 7000/300720 [1:29:37<63:34:36,  1.28it/s]

{'loss': 27.9284, 'learning_rate': 9.767225325884544e-06, 'epoch': 0.7}


  3%|▎         | 8000/300720 [1:42:30<63:26:09,  1.28it/s]

{'loss': 27.2646, 'learning_rate': 9.733971801010907e-06, 'epoch': 0.8}


  3%|▎         | 9000/300720 [1:55:26<61:11:23,  1.32it/s]

{'loss': 26.3113, 'learning_rate': 9.70071827613727e-06, 'epoch': 0.9}


  3%|▎         | 10000/300720 [2:08:20<62:49:20,  1.29it/s]

{'loss': 26.7823, 'learning_rate': 9.667464751263635e-06, 'epoch': 1.0}


  3%|▎         | 10024/300720 [2:08:38<53:32:10,  1.51it/s]
  3%|▎         | 10024/300720 [2:09:49<53:32:10,  1.51it/s]

{'eval_loss': 52.45946502685547, 'eval_mse': 0.1538759171962738, 'eval_rmse': 0.3921625316143036, 'eval_runtime': 70.7735, 'eval_samples_per_second': 63.583, 'eval_steps_per_second': 1.992, 'epoch': 1.0}


  4%|▎         | 11000/300720 [2:22:14<60:50:36,  1.32it/s]  

{'loss': 25.0152, 'learning_rate': 9.634211226389998e-06, 'epoch': 1.1}


  4%|▍         | 12000/300720 [2:34:56<61:53:03,  1.30it/s]

{'loss': 24.673, 'learning_rate': 9.600957701516362e-06, 'epoch': 1.2}


  4%|▍         | 13000/300720 [2:47:41<60:42:54,  1.32it/s]

{'loss': 24.2018, 'learning_rate': 9.567704176642725e-06, 'epoch': 1.3}


  5%|▍         | 14000/300720 [3:00:29<60:32:15,  1.32it/s]

{'loss': 23.9385, 'learning_rate': 9.534450651769088e-06, 'epoch': 1.4}


  5%|▍         | 15000/300720 [3:13:15<62:30:21,  1.27it/s]

{'loss': 24.343, 'learning_rate': 9.501197126895452e-06, 'epoch': 1.5}


  5%|▌         | 16000/300720 [3:26:03<59:16:51,  1.33it/s]

{'loss': 24.2845, 'learning_rate': 9.467943602021815e-06, 'epoch': 1.6}


  6%|▌         | 17000/300720 [3:38:52<59:26:01,  1.33it/s]

{'loss': 23.8306, 'learning_rate': 9.434690077148178e-06, 'epoch': 1.7}


  6%|▌         | 18000/300720 [3:51:45<60:49:19,  1.29it/s]

{'loss': 23.533, 'learning_rate': 9.401436552274543e-06, 'epoch': 1.8}


  6%|▋         | 19000/300720 [4:04:40<61:47:41,  1.27it/s]

{'loss': 24.6327, 'learning_rate': 9.368183027400906e-06, 'epoch': 1.9}


  7%|▋         | 20000/300720 [4:17:39<58:38:51,  1.33it/s]

{'loss': 24.0855, 'learning_rate': 9.334929502527269e-06, 'epoch': 2.0}


  7%|▋         | 20048/300720 [4:18:15<50:44:36,  1.54it/s]
  7%|▋         | 20048/300720 [4:19:27<50:44:36,  1.54it/s]

{'eval_loss': 50.398651123046875, 'eval_mse': 0.14951924979686737, 'eval_rmse': 0.38663142919540405, 'eval_runtime': 71.7492, 'eval_samples_per_second': 62.719, 'eval_steps_per_second': 1.965, 'epoch': 2.0}


  7%|▋         | 21000/300720 [4:31:34<57:44:29,  1.35it/s]  

{'loss': 20.7773, 'learning_rate': 9.301675977653633e-06, 'epoch': 2.09}


  7%|▋         | 22000/300720 [4:44:20<58:01:02,  1.33it/s]

{'loss': 21.072, 'learning_rate': 9.268422452779996e-06, 'epoch': 2.19}


  8%|▊         | 23000/300720 [4:57:05<58:55:39,  1.31it/s]

{'loss': 20.475, 'learning_rate': 9.235168927906359e-06, 'epoch': 2.29}


  8%|▊         | 24000/300720 [5:09:54<56:42:17,  1.36it/s]

{'loss': 20.7085, 'learning_rate': 9.201915403032722e-06, 'epoch': 2.39}


  8%|▊         | 25000/300720 [5:22:42<59:01:47,  1.30it/s]

{'loss': 21.9524, 'learning_rate': 9.168661878159085e-06, 'epoch': 2.49}


  9%|▊         | 26000/300720 [5:35:28<58:00:24,  1.32it/s]

{'loss': 20.2433, 'learning_rate': 9.13540835328545e-06, 'epoch': 2.59}


  9%|▉         | 27000/300720 [5:48:18<58:02:20,  1.31it/s]

{'loss': 21.489, 'learning_rate': 9.102154828411812e-06, 'epoch': 2.69}


  9%|▉         | 28000/300720 [6:01:09<57:59:31,  1.31it/s]

{'loss': 21.1006, 'learning_rate': 9.068901303538175e-06, 'epoch': 2.79}


 10%|▉         | 29000/300720 [6:14:04<60:34:43,  1.25it/s]

{'loss': 21.3038, 'learning_rate': 9.03564777866454e-06, 'epoch': 2.89}


 10%|▉         | 30000/300720 [6:26:59<58:23:50,  1.29it/s]

{'loss': 21.6254, 'learning_rate': 9.002394253790902e-06, 'epoch': 2.99}


 10%|█         | 30072/300720 [6:27:55<51:12:30,  1.47it/s]
 10%|█         | 30072/300720 [6:29:08<51:12:30,  1.47it/s]

{'eval_loss': 53.15262985229492, 'eval_mse': 0.16055849194526672, 'eval_rmse': 0.40069255232810974, 'eval_runtime': 72.2526, 'eval_samples_per_second': 62.282, 'eval_steps_per_second': 1.951, 'epoch': 3.0}


 10%|█         | 31000/300720 [6:41:01<58:03:07,  1.29it/s]  

{'loss': 18.0961, 'learning_rate': 8.969140728917265e-06, 'epoch': 3.09}


 11%|█         | 32000/300720 [6:53:46<59:27:27,  1.26it/s]

{'loss': 17.1502, 'learning_rate': 8.93588720404363e-06, 'epoch': 3.19}


 11%|█         | 33000/300720 [7:06:35<56:22:22,  1.32it/s]

{'loss': 17.1606, 'learning_rate': 8.902633679169993e-06, 'epoch': 3.29}


 11%|█▏        | 34000/300720 [7:19:21<59:44:06,  1.24it/s]

{'loss': 17.7568, 'learning_rate': 8.869380154296357e-06, 'epoch': 3.39}


 12%|█▏        | 35000/300720 [7:32:07<56:53:25,  1.30it/s]

{'loss': 17.5214, 'learning_rate': 8.83612662942272e-06, 'epoch': 3.49}


 12%|█▏        | 36000/300720 [7:44:54<56:23:57,  1.30it/s]

{'loss': 17.5084, 'learning_rate': 8.802873104549083e-06, 'epoch': 3.59}


 12%|█▏        | 37000/300720 [7:57:44<55:54:02,  1.31it/s]

{'loss': 17.9502, 'learning_rate': 8.769619579675448e-06, 'epoch': 3.69}


 13%|█▎        | 38000/300720 [8:10:36<56:36:58,  1.29it/s]

{'loss': 17.511, 'learning_rate': 8.736366054801809e-06, 'epoch': 3.79}


 13%|█▎        | 39000/300720 [8:23:29<55:59:38,  1.30it/s]

{'loss': 17.5422, 'learning_rate': 8.703112529928173e-06, 'epoch': 3.89}


 13%|█▎        | 40000/300720 [8:36:25<54:34:58,  1.33it/s]

{'loss': 17.303, 'learning_rate': 8.669859005054536e-06, 'epoch': 3.99}


 13%|█▎        | 40096/300720 [8:37:40<48:59:22,  1.48it/s]
 13%|█▎        | 40096/300720 [8:38:52<48:59:22,  1.48it/s]

{'eval_loss': 54.10569763183594, 'eval_mse': 0.16177116334438324, 'eval_rmse': 0.40215736627578735, 'eval_runtime': 72.0306, 'eval_samples_per_second': 62.473, 'eval_steps_per_second': 1.958, 'epoch': 4.0}


 13%|█▎        | 40096/300720 [8:38:53<56:12:50,  1.29it/s]


{'train_runtime': 31133.9738, 'train_samples_per_second': 309.057, 'train_steps_per_second': 9.659, 'train_loss': 22.96648795454386, 'epoch': 4.0}


100%|██████████| 141/141 [01:08<00:00,  2.07it/s]
[32m[I 2023-05-05 13:15:48,486][0m Trial 4 finished with value: 0.38663142919540405 and parameters: {'band_width': 0.37551314244462536}. Best is trial 4 with value: 0.38663142919540405.[0m


{'test_loss': 50.398651123046875, 'test_mse': 0.14951924979686737, 'test_rmse': 0.38663142919540405, 'test_runtime': 68.7411, 'test_samples_per_second': 65.463, 'test_steps_per_second': 2.051}
--------------------
bw= 0.32983068353578726
--------------------


Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 1000/300720 [12

{'loss': 36.147, 'learning_rate': 9.966746475126364e-06, 'epoch': 0.1}


  1%|          | 2000/300720 [25:33<63:30:49,  1.31it/s]

{'loss': 31.0497, 'learning_rate': 9.933492950252728e-06, 'epoch': 0.2}


  1%|          | 3000/300720 [38:18<64:20:21,  1.29it/s]

{'loss': 29.9545, 'learning_rate': 9.900239425379091e-06, 'epoch': 0.3}


  1%|▏         | 4000/300720 [51:05<62:40:47,  1.31it/s]

{'loss': 27.8407, 'learning_rate': 9.866985900505454e-06, 'epoch': 0.4}


  2%|▏         | 5000/300720 [1:03:52<62:40:30,  1.31it/s]

{'loss': 27.6366, 'learning_rate': 9.833732375631819e-06, 'epoch': 0.5}


  2%|▏         | 6000/300720 [1:16:39<61:21:08,  1.33it/s]

{'loss': 26.4542, 'learning_rate': 9.800478850758181e-06, 'epoch': 0.6}


  2%|▏         | 7000/300720 [1:29:31<63:33:35,  1.28it/s]

{'loss': 27.219, 'learning_rate': 9.767225325884544e-06, 'epoch': 0.7}


  3%|▎         | 8000/300720 [1:42:23<63:07:14,  1.29it/s]

{'loss': 26.5602, 'learning_rate': 9.733971801010907e-06, 'epoch': 0.8}


  3%|▎         | 9000/300720 [1:55:18<61:02:44,  1.33it/s]

{'loss': 25.6004, 'learning_rate': 9.70071827613727e-06, 'epoch': 0.9}


  3%|▎         | 10000/300720 [2:08:08<63:08:02,  1.28it/s]

{'loss': 26.1689, 'learning_rate': 9.667464751263635e-06, 'epoch': 1.0}


  3%|▎         | 10024/300720 [2:08:26<53:18:39,  1.51it/s]
  3%|▎         | 10024/300720 [2:09:38<53:18:39,  1.51it/s]

{'eval_loss': 50.326995849609375, 'eval_mse': 0.15170706808567047, 'eval_rmse': 0.3893607258796692, 'eval_runtime': 71.2908, 'eval_samples_per_second': 63.122, 'eval_steps_per_second': 1.978, 'epoch': 1.0}


  4%|▎         | 11000/300720 [2:22:02<60:15:42,  1.34it/s]  

{'loss': 24.4964, 'learning_rate': 9.634211226389998e-06, 'epoch': 1.1}


  4%|▍         | 12000/300720 [2:34:41<62:09:28,  1.29it/s]

{'loss': 24.0436, 'learning_rate': 9.600957701516362e-06, 'epoch': 1.2}


  4%|▍         | 13000/300720 [2:47:23<61:32:42,  1.30it/s]

{'loss': 23.7315, 'learning_rate': 9.567704176642725e-06, 'epoch': 1.3}


  5%|▍         | 14000/300720 [3:00:09<61:16:54,  1.30it/s]

{'loss': 23.3916, 'learning_rate': 9.534450651769088e-06, 'epoch': 1.4}


  5%|▍         | 15000/300720 [3:12:55<62:27:36,  1.27it/s]

{'loss': 23.8008, 'learning_rate': 9.501197126895452e-06, 'epoch': 1.5}


  5%|▌         | 16000/300720 [3:25:43<59:25:55,  1.33it/s]

{'loss': 23.7385, 'learning_rate': 9.467943602021815e-06, 'epoch': 1.6}


  6%|▌         | 17000/300720 [3:38:33<59:34:58,  1.32it/s]

{'loss': 23.3469, 'learning_rate': 9.434690077148178e-06, 'epoch': 1.7}


  6%|▌         | 18000/300720 [3:51:26<60:52:42,  1.29it/s]

{'loss': 23.1397, 'learning_rate': 9.401436552274543e-06, 'epoch': 1.8}


  6%|▋         | 19000/300720 [4:04:21<62:09:12,  1.26it/s]

{'loss': 24.1237, 'learning_rate': 9.368183027400906e-06, 'epoch': 1.9}


  7%|▋         | 20000/300720 [4:17:20<58:13:16,  1.34it/s]

{'loss': 23.549, 'learning_rate': 9.334929502527269e-06, 'epoch': 2.0}


  7%|▋         | 20048/300720 [4:17:57<50:36:58,  1.54it/s]
  7%|▋         | 20048/300720 [4:19:09<50:36:58,  1.54it/s]

{'eval_loss': 49.47114944458008, 'eval_mse': 0.1507994532585144, 'eval_rmse': 0.3882948160171509, 'eval_runtime': 72.1987, 'eval_samples_per_second': 62.328, 'eval_steps_per_second': 1.953, 'epoch': 2.0}


  7%|▋         | 21000/300720 [4:31:20<58:05:28,  1.34it/s]  

{'loss': 20.5057, 'learning_rate': 9.301675977653633e-06, 'epoch': 2.09}


  7%|▋         | 22000/300720 [4:44:09<58:20:33,  1.33it/s]

{'loss': 20.7353, 'learning_rate': 9.268422452779996e-06, 'epoch': 2.19}


  8%|▊         | 23000/300720 [4:56:55<58:15:14,  1.32it/s]

{'loss': 20.2588, 'learning_rate': 9.235168927906359e-06, 'epoch': 2.29}


  8%|▊         | 24000/300720 [5:09:44<56:12:11,  1.37it/s]

{'loss': 20.2812, 'learning_rate': 9.201915403032722e-06, 'epoch': 2.39}


  8%|▊         | 25000/300720 [5:22:32<59:22:14,  1.29it/s]

{'loss': 21.6344, 'learning_rate': 9.168661878159085e-06, 'epoch': 2.49}


  9%|▊         | 26000/300720 [5:35:17<58:37:59,  1.30it/s]

{'loss': 19.9988, 'learning_rate': 9.13540835328545e-06, 'epoch': 2.59}


  9%|▉         | 27000/300720 [5:48:06<57:47:25,  1.32it/s]

{'loss': 20.9452, 'learning_rate': 9.102154828411812e-06, 'epoch': 2.69}


  9%|▉         | 28000/300720 [6:00:55<57:22:12,  1.32it/s]

{'loss': 20.6948, 'learning_rate': 9.068901303538175e-06, 'epoch': 2.79}


 10%|▉         | 29000/300720 [6:13:48<61:28:07,  1.23it/s]

{'loss': 20.6874, 'learning_rate': 9.03564777866454e-06, 'epoch': 2.89}


 10%|▉         | 30000/300720 [6:26:42<58:30:31,  1.29it/s]

{'loss': 21.0536, 'learning_rate': 9.002394253790902e-06, 'epoch': 2.99}


 10%|█         | 30072/300720 [6:27:38<50:50:45,  1.48it/s]
 10%|█         | 30072/300720 [6:28:50<50:50:45,  1.48it/s]

{'eval_loss': 52.29836654663086, 'eval_mse': 0.16196683049201965, 'eval_rmse': 0.4024507999420166, 'eval_runtime': 71.7347, 'eval_samples_per_second': 62.731, 'eval_steps_per_second': 1.966, 'epoch': 3.0}


 10%|█         | 31000/300720 [6:40:42<57:46:22,  1.30it/s]  

{'loss': 17.7945, 'learning_rate': 8.969140728917265e-06, 'epoch': 3.09}


 11%|█         | 32000/300720 [6:53:27<58:45:09,  1.27it/s]

{'loss': 16.7912, 'learning_rate': 8.93588720404363e-06, 'epoch': 3.19}


 11%|█         | 33000/300720 [7:06:15<57:03:09,  1.30it/s]

{'loss': 16.9561, 'learning_rate': 8.902633679169993e-06, 'epoch': 3.29}


 11%|█▏        | 34000/300720 [7:19:00<58:54:17,  1.26it/s]

{'loss': 17.5818, 'learning_rate': 8.869380154296357e-06, 'epoch': 3.39}


 12%|█▏        | 35000/300720 [7:31:45<56:25:49,  1.31it/s]

{'loss': 17.1342, 'learning_rate': 8.83612662942272e-06, 'epoch': 3.49}


 12%|█▏        | 36000/300720 [7:44:32<55:32:16,  1.32it/s]

{'loss': 17.304, 'learning_rate': 8.802873104549083e-06, 'epoch': 3.59}


 12%|█▏        | 37000/300720 [7:57:21<56:05:35,  1.31it/s]

{'loss': 17.4121, 'learning_rate': 8.769619579675448e-06, 'epoch': 3.69}


 13%|█▎        | 38000/300720 [8:10:11<56:17:42,  1.30it/s]

{'loss': 17.3031, 'learning_rate': 8.736366054801809e-06, 'epoch': 3.79}


 13%|█▎        | 39000/300720 [8:23:03<55:57:36,  1.30it/s]

{'loss': 17.3201, 'learning_rate': 8.703112529928173e-06, 'epoch': 3.89}


 13%|█▎        | 40000/300720 [8:35:58<53:39:36,  1.35it/s]

{'loss': 16.9698, 'learning_rate': 8.669859005054536e-06, 'epoch': 3.99}


 13%|█▎        | 40096/300720 [8:37:13<48:21:27,  1.50it/s]
 13%|█▎        | 40096/300720 [8:38:24<48:21:27,  1.50it/s]

{'eval_loss': 52.60780715942383, 'eval_mse': 0.1613507866859436, 'eval_rmse': 0.4016602635383606, 'eval_runtime': 71.6229, 'eval_samples_per_second': 62.829, 'eval_steps_per_second': 1.969, 'epoch': 4.0}


 13%|█▎        | 40096/300720 [8:38:26<56:09:51,  1.29it/s]


{'train_runtime': 31106.3828, 'train_samples_per_second': 309.331, 'train_steps_per_second': 9.667, 'train_loss': 22.519065838666886, 'epoch': 4.0}


100%|██████████| 141/141 [01:08<00:00,  2.07it/s]
[32m[I 2023-05-05 21:55:25,807][0m Trial 5 finished with value: 0.3882948160171509 and parameters: {'band_width': 0.32983068353578726}. Best is trial 4 with value: 0.38663142919540405.[0m


{'test_loss': 49.47114944458008, 'test_mse': 0.1507994532585144, 'test_rmse': 0.3882948160171509, 'test_runtime': 68.5619, 'test_samples_per_second': 65.634, 'test_steps_per_second': 2.057}
--------------------
bw= 0.22700654050603677
--------------------


Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 1000/300720 [12

{'loss': 33.9315, 'learning_rate': 9.966746475126364e-06, 'epoch': 0.1}


  1%|          | 2000/300720 [25:31<63:41:27,  1.30it/s]

{'loss': 29.4912, 'learning_rate': 9.933492950252728e-06, 'epoch': 0.2}


  1%|          | 3000/300720 [38:17<64:23:52,  1.28it/s]

{'loss': 28.3862, 'learning_rate': 9.900239425379091e-06, 'epoch': 0.3}


  1%|▏         | 4000/300720 [51:04<63:26:28,  1.30it/s]

{'loss': 26.2495, 'learning_rate': 9.866985900505454e-06, 'epoch': 0.4}


  2%|▏         | 5000/300720 [1:03:50<61:25:53,  1.34it/s]

{'loss': 26.0533, 'learning_rate': 9.833732375631819e-06, 'epoch': 0.5}


  2%|▏         | 6000/300720 [1:16:37<60:49:50,  1.35it/s]

{'loss': 25.4212, 'learning_rate': 9.800478850758181e-06, 'epoch': 0.6}


  2%|▏         | 7000/300720 [1:29:26<63:10:47,  1.29it/s]

{'loss': 25.5984, 'learning_rate': 9.767225325884544e-06, 'epoch': 0.7}


  3%|▎         | 8000/300720 [1:42:18<62:42:35,  1.30it/s]

{'loss': 25.0871, 'learning_rate': 9.733971801010907e-06, 'epoch': 0.8}


  3%|▎         | 9000/300720 [1:55:13<61:21:15,  1.32it/s]

{'loss': 24.0797, 'learning_rate': 9.70071827613727e-06, 'epoch': 0.9}


  3%|▎         | 10000/300720 [2:08:06<63:27:27,  1.27it/s]

{'loss': 24.7401, 'learning_rate': 9.667464751263635e-06, 'epoch': 1.0}


  3%|▎         | 10024/300720 [2:08:24<53:20:17,  1.51it/s]
  3%|▎         | 10024/300720 [2:09:34<53:20:17,  1.51it/s]

{'eval_loss': 47.58884048461914, 'eval_mse': 0.1540203094482422, 'eval_rmse': 0.39235055446624756, 'eval_runtime': 69.3151, 'eval_samples_per_second': 64.921, 'eval_steps_per_second': 2.034, 'epoch': 1.0}


  4%|▎         | 11000/300720 [2:21:59<60:57:28,  1.32it/s]  

{'loss': 23.3403, 'learning_rate': 9.634211226389998e-06, 'epoch': 1.1}


  4%|▍         | 12000/300720 [2:34:41<61:56:12,  1.29it/s]

{'loss': 22.7672, 'learning_rate': 9.600957701516362e-06, 'epoch': 1.2}


  4%|▍         | 13000/300720 [2:47:25<60:52:10,  1.31it/s]

{'loss': 22.4449, 'learning_rate': 9.567704176642725e-06, 'epoch': 1.3}


  5%|▍         | 14000/300720 [3:00:10<60:50:21,  1.31it/s]

{'loss': 22.173, 'learning_rate': 9.534450651769088e-06, 'epoch': 1.4}


  5%|▍         | 15000/300720 [3:12:55<62:14:28,  1.28it/s]

{'loss': 22.2939, 'learning_rate': 9.501197126895452e-06, 'epoch': 1.5}


  5%|▌         | 16000/300720 [3:25:42<59:01:15,  1.34it/s]

{'loss': 22.171, 'learning_rate': 9.467943602021815e-06, 'epoch': 1.6}


  6%|▌         | 17000/300720 [3:38:31<60:03:36,  1.31it/s]

{'loss': 22.0727, 'learning_rate': 9.434690077148178e-06, 'epoch': 1.7}


  6%|▌         | 18000/300720 [3:51:23<60:35:16,  1.30it/s]

{'loss': 21.7779, 'learning_rate': 9.401436552274543e-06, 'epoch': 1.8}


  6%|▋         | 19000/300720 [4:04:17<61:31:26,  1.27it/s]

{'loss': 22.8633, 'learning_rate': 9.368183027400906e-06, 'epoch': 1.9}


  7%|▋         | 20000/300720 [4:17:15<58:43:47,  1.33it/s]

{'loss': 21.9821, 'learning_rate': 9.334929502527269e-06, 'epoch': 2.0}


  7%|▋         | 20048/300720 [4:17:52<50:31:15,  1.54it/s]
  7%|▋         | 20048/300720 [4:19:03<50:31:15,  1.54it/s]

{'eval_loss': 44.97845458984375, 'eval_mse': 0.14627110958099365, 'eval_rmse': 0.3824400305747986, 'eval_runtime': 70.7683, 'eval_samples_per_second': 63.588, 'eval_steps_per_second': 1.992, 'epoch': 2.0}


  7%|▋         | 21000/300720 [4:31:12<58:45:03,  1.32it/s]  

{'loss': 19.1421, 'learning_rate': 9.301675977653633e-06, 'epoch': 2.09}


  7%|▋         | 22000/300720 [4:44:00<58:33:01,  1.32it/s]

{'loss': 19.4243, 'learning_rate': 9.268422452779996e-06, 'epoch': 2.19}


  8%|▊         | 23000/300720 [4:56:47<58:07:17,  1.33it/s]

{'loss': 19.1345, 'learning_rate': 9.235168927906359e-06, 'epoch': 2.29}


  8%|▊         | 24000/300720 [5:09:35<56:38:51,  1.36it/s]

{'loss': 18.9779, 'learning_rate': 9.201915403032722e-06, 'epoch': 2.39}


  8%|▊         | 25000/300720 [5:22:22<59:33:53,  1.29it/s]

{'loss': 20.2362, 'learning_rate': 9.168661878159085e-06, 'epoch': 2.49}


  9%|▊         | 26000/300720 [5:35:07<57:40:14,  1.32it/s]

{'loss': 18.9157, 'learning_rate': 9.13540835328545e-06, 'epoch': 2.59}


  9%|▉         | 27000/300720 [5:47:57<57:36:44,  1.32it/s]

{'loss': 19.6822, 'learning_rate': 9.102154828411812e-06, 'epoch': 2.69}


  9%|▉         | 28000/300720 [6:00:46<57:47:40,  1.31it/s]

{'loss': 19.5579, 'learning_rate': 9.068901303538175e-06, 'epoch': 2.79}


 10%|▉         | 29000/300720 [6:13:40<61:03:21,  1.24it/s]

{'loss': 19.446, 'learning_rate': 9.03564777866454e-06, 'epoch': 2.89}


 10%|▉         | 30000/300720 [6:26:34<58:44:06,  1.28it/s]

{'loss': 19.7999, 'learning_rate': 9.002394253790902e-06, 'epoch': 2.99}


 10%|█         | 30072/300720 [6:27:30<50:39:22,  1.48it/s]
 10%|█         | 30072/300720 [6:28:41<50:39:22,  1.48it/s]

{'eval_loss': 48.46246337890625, 'eval_mse': 0.15966512262821198, 'eval_rmse': 0.39958104491233826, 'eval_runtime': 71.8795, 'eval_samples_per_second': 62.605, 'eval_steps_per_second': 1.962, 'epoch': 3.0}


 10%|█         | 31000/300720 [6:40:35<58:13:28,  1.29it/s]  

{'loss': 16.6021, 'learning_rate': 8.969140728917265e-06, 'epoch': 3.09}


 11%|█         | 32000/300720 [6:53:18<58:33:18,  1.27it/s]

{'loss': 15.805, 'learning_rate': 8.93588720404363e-06, 'epoch': 3.19}


 11%|█         | 33000/300720 [7:06:04<56:47:12,  1.31it/s]

{'loss': 15.698, 'learning_rate': 8.902633679169993e-06, 'epoch': 3.29}


 11%|█▏        | 34000/300720 [7:18:49<58:46:22,  1.26it/s]

{'loss': 16.4146, 'learning_rate': 8.869380154296357e-06, 'epoch': 3.39}


 12%|█▏        | 35000/300720 [7:31:36<56:00:25,  1.32it/s]

{'loss': 16.032, 'learning_rate': 8.83612662942272e-06, 'epoch': 3.49}


 12%|█▏        | 36000/300720 [7:44:24<56:01:47,  1.31it/s]

{'loss': 16.2688, 'learning_rate': 8.802873104549083e-06, 'epoch': 3.59}


 12%|█▏        | 37000/300720 [7:57:15<56:11:29,  1.30it/s]

{'loss': 16.3548, 'learning_rate': 8.769619579675448e-06, 'epoch': 3.69}


 13%|█▎        | 38000/300720 [8:10:08<56:14:39,  1.30it/s]

{'loss': 16.1182, 'learning_rate': 8.736366054801809e-06, 'epoch': 3.79}


 13%|█▎        | 39000/300720 [8:23:01<56:33:28,  1.29it/s]

{'loss': 16.0143, 'learning_rate': 8.703112529928173e-06, 'epoch': 3.89}


 13%|█▎        | 40000/300720 [8:35:56<53:56:24,  1.34it/s]

{'loss': 15.8268, 'learning_rate': 8.669859005054536e-06, 'epoch': 3.99}


 13%|█▎        | 40096/300720 [8:37:11<48:52:33,  1.48it/s]
 13%|█▎        | 40096/300720 [8:38:22<48:52:33,  1.48it/s]

{'eval_loss': 49.45500183105469, 'eval_mse': 0.164457768201828, 'eval_rmse': 0.4055195450782776, 'eval_runtime': 71.7113, 'eval_samples_per_second': 62.752, 'eval_steps_per_second': 1.966, 'epoch': 4.0}


 13%|█▎        | 40096/300720 [8:38:24<56:09:38,  1.29it/s]


{'train_runtime': 31104.3773, 'train_samples_per_second': 309.351, 'train_steps_per_second': 9.668, 'train_loss': 21.19523612994341, 'epoch': 4.0}


100%|██████████| 141/141 [01:08<00:00,  2.07it/s]
[32m[I 2023-05-06 06:35:00,627][0m Trial 6 finished with value: 0.3824400305747986 and parameters: {'band_width': 0.22700654050603677}. Best is trial 6 with value: 0.3824400305747986.[0m


{'test_loss': 44.97845458984375, 'test_mse': 0.14627110958099365, 'test_rmse': 0.3824400305747986, 'test_runtime': 68.5355, 'test_samples_per_second': 65.659, 'test_steps_per_second': 2.057}
--------------------
bw= 0.23319315914880775
--------------------


Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 1000/300720 [12

{'loss': 34.0276, 'learning_rate': 9.966746475126364e-06, 'epoch': 0.1}


  1%|          | 2000/300720 [25:34<63:51:08,  1.30it/s]

{'loss': 29.6061, 'learning_rate': 9.933492950252728e-06, 'epoch': 0.2}


  1%|          | 3000/300720 [38:22<64:12:07,  1.29it/s]

{'loss': 28.4061, 'learning_rate': 9.900239425379091e-06, 'epoch': 0.3}


  1%|▏         | 4000/300720 [51:11<63:12:08,  1.30it/s]

{'loss': 26.3467, 'learning_rate': 9.866985900505454e-06, 'epoch': 0.4}


  2%|▏         | 5000/300720 [1:03:59<61:29:06,  1.34it/s]

{'loss': 26.1767, 'learning_rate': 9.833732375631819e-06, 'epoch': 0.5}


  2%|▏         | 6000/300720 [1:16:46<61:22:32,  1.33it/s]

{'loss': 25.4156, 'learning_rate': 9.800478850758181e-06, 'epoch': 0.6}


  2%|▏         | 7000/300720 [1:29:37<63:32:12,  1.28it/s]

{'loss': 25.7995, 'learning_rate': 9.767225325884544e-06, 'epoch': 0.7}


  3%|▎         | 8000/300720 [1:42:30<62:42:30,  1.30it/s]

{'loss': 25.1837, 'learning_rate': 9.733971801010907e-06, 'epoch': 0.8}


  3%|▎         | 9000/300720 [1:55:25<61:06:02,  1.33it/s]

{'loss': 24.2458, 'learning_rate': 9.70071827613727e-06, 'epoch': 0.9}


  3%|▎         | 10000/300720 [2:08:18<63:37:36,  1.27it/s]

{'loss': 24.8655, 'learning_rate': 9.667464751263635e-06, 'epoch': 1.0}


  3%|▎         | 10024/300720 [2:08:36<53:08:50,  1.52it/s]
  3%|▎         | 10024/300720 [2:09:46<53:08:50,  1.52it/s]

{'eval_loss': 47.650901794433594, 'eval_mse': 0.15323695540428162, 'eval_rmse': 0.3913269340991974, 'eval_runtime': 69.4094, 'eval_samples_per_second': 64.833, 'eval_steps_per_second': 2.031, 'epoch': 1.0}


  4%|▎         | 11000/300720 [2:22:13<61:11:57,  1.32it/s]  

{'loss': 23.4771, 'learning_rate': 9.634211226389998e-06, 'epoch': 1.1}


  4%|▍         | 12000/300720 [2:34:56<61:54:26,  1.30it/s]

{'loss': 22.8869, 'learning_rate': 9.600957701516362e-06, 'epoch': 1.2}


  4%|▍         | 13000/300720 [2:47:43<60:48:17,  1.31it/s]

{'loss': 22.4874, 'learning_rate': 9.567704176642725e-06, 'epoch': 1.3}


  5%|▍         | 14000/300720 [3:00:30<60:30:01,  1.32it/s]

{'loss': 22.2563, 'learning_rate': 9.534450651769088e-06, 'epoch': 1.4}


  5%|▍         | 15000/300720 [3:13:16<62:24:59,  1.27it/s]

{'loss': 22.4036, 'learning_rate': 9.501197126895452e-06, 'epoch': 1.5}


  5%|▌         | 16000/300720 [3:26:05<58:31:11,  1.35it/s]

{'loss': 22.2408, 'learning_rate': 9.467943602021815e-06, 'epoch': 1.6}


  6%|▌         | 17000/300720 [3:38:55<59:33:04,  1.32it/s]

{'loss': 22.2045, 'learning_rate': 9.434690077148178e-06, 'epoch': 1.7}


  6%|▌         | 18000/300720 [3:51:47<60:45:46,  1.29it/s]

{'loss': 21.9627, 'learning_rate': 9.401436552274543e-06, 'epoch': 1.8}


  6%|▋         | 19000/300720 [4:04:41<62:02:14,  1.26it/s]

{'loss': 23.0159, 'learning_rate': 9.368183027400906e-06, 'epoch': 1.9}


  7%|▋         | 20000/300720 [4:17:39<58:34:10,  1.33it/s]

{'loss': 22.1473, 'learning_rate': 9.334929502527269e-06, 'epoch': 2.0}


  7%|▋         | 20048/300720 [4:18:16<50:49:29,  1.53it/s]
  7%|▋         | 20048/300720 [4:19:26<50:49:29,  1.53it/s]

{'eval_loss': 46.284400939941406, 'eval_mse': 0.15031331777572632, 'eval_rmse': 0.3876585364341736, 'eval_runtime': 70.5783, 'eval_samples_per_second': 63.759, 'eval_steps_per_second': 1.998, 'epoch': 2.0}


  7%|▋         | 21000/300720 [4:31:36<57:58:35,  1.34it/s]  

{'loss': 19.2482, 'learning_rate': 9.301675977653633e-06, 'epoch': 2.09}


  7%|▋         | 22000/300720 [4:44:24<58:59:41,  1.31it/s]

{'loss': 19.4016, 'learning_rate': 9.268422452779996e-06, 'epoch': 2.19}


  8%|▊         | 23000/300720 [4:57:10<59:09:50,  1.30it/s]

{'loss': 19.3475, 'learning_rate': 9.235168927906359e-06, 'epoch': 2.29}


  8%|▊         | 24000/300720 [5:09:58<56:32:47,  1.36it/s]

{'loss': 19.0539, 'learning_rate': 9.201915403032722e-06, 'epoch': 2.39}


  8%|▊         | 25000/300720 [5:22:47<59:33:34,  1.29it/s]

{'loss': 20.4674, 'learning_rate': 9.168661878159085e-06, 'epoch': 2.49}


  9%|▊         | 26000/300720 [5:35:34<58:09:47,  1.31it/s]

{'loss': 18.9509, 'learning_rate': 9.13540835328545e-06, 'epoch': 2.59}


  9%|▉         | 27000/300720 [5:48:24<58:03:26,  1.31it/s]

{'loss': 19.8174, 'learning_rate': 9.102154828411812e-06, 'epoch': 2.69}


  9%|▉         | 28000/300720 [6:01:14<58:29:25,  1.30it/s]

{'loss': 19.4496, 'learning_rate': 9.068901303538175e-06, 'epoch': 2.79}


 10%|▉         | 29000/300720 [6:14:08<60:35:14,  1.25it/s]

{'loss': 19.4709, 'learning_rate': 9.03564777866454e-06, 'epoch': 2.89}


 10%|▉         | 30000/300720 [6:27:02<58:07:33,  1.29it/s]

{'loss': 19.995, 'learning_rate': 9.002394253790902e-06, 'epoch': 2.99}


 10%|█         | 30072/300720 [6:27:58<50:40:22,  1.48it/s]
 10%|█         | 30072/300720 [6:29:09<50:40:22,  1.48it/s]

{'eval_loss': 48.82172775268555, 'eval_mse': 0.16033190488815308, 'eval_rmse': 0.4004131555557251, 'eval_runtime': 71.7518, 'eval_samples_per_second': 62.716, 'eval_steps_per_second': 1.965, 'epoch': 3.0}


 10%|█         | 31000/300720 [6:41:02<58:28:09,  1.28it/s]  

{'loss': 16.6808, 'learning_rate': 8.969140728917265e-06, 'epoch': 3.09}


 11%|█         | 32000/300720 [6:53:45<58:52:22,  1.27it/s]

{'loss': 15.7764, 'learning_rate': 8.93588720404363e-06, 'epoch': 3.19}


 11%|█         | 33000/300720 [7:06:33<56:42:17,  1.31it/s]

{'loss': 15.7645, 'learning_rate': 8.902633679169993e-06, 'epoch': 3.29}


 11%|█▏        | 34000/300720 [7:19:19<59:33:33,  1.24it/s]

{'loss': 16.6317, 'learning_rate': 8.869380154296357e-06, 'epoch': 3.39}


 12%|█▏        | 35000/300720 [7:32:07<56:29:59,  1.31it/s]

{'loss': 16.1174, 'learning_rate': 8.83612662942272e-06, 'epoch': 3.49}


 12%|█▏        | 36000/300720 [7:44:55<56:16:39,  1.31it/s]

{'loss': 16.368, 'learning_rate': 8.802873104549083e-06, 'epoch': 3.59}


 12%|█▏        | 37000/300720 [7:57:46<55:56:25,  1.31it/s]

{'loss': 16.4573, 'learning_rate': 8.769619579675448e-06, 'epoch': 3.69}


 13%|█▎        | 38000/300720 [8:10:39<55:59:55,  1.30it/s]

{'loss': 16.1335, 'learning_rate': 8.736366054801809e-06, 'epoch': 3.79}


 13%|█▎        | 39000/300720 [8:23:33<56:24:51,  1.29it/s]

{'loss': 16.0491, 'learning_rate': 8.703112529928173e-06, 'epoch': 3.89}


 13%|█▎        | 40000/300720 [8:36:29<54:05:56,  1.34it/s]

{'loss': 15.9236, 'learning_rate': 8.669859005054536e-06, 'epoch': 3.99}


 13%|█▎        | 40096/300720 [8:37:43<48:33:23,  1.49it/s]
 13%|█▎        | 40096/300720 [8:38:55<48:33:23,  1.49it/s]

{'eval_loss': 49.737667083740234, 'eval_mse': 0.1636926382780075, 'eval_rmse': 0.4045749008655548, 'eval_runtime': 71.6043, 'eval_samples_per_second': 62.845, 'eval_steps_per_second': 1.969, 'epoch': 4.0}


 13%|█▎        | 40096/300720 [8:38:57<56:13:11,  1.29it/s]


{'train_runtime': 31137.2008, 'train_samples_per_second': 309.025, 'train_steps_per_second': 9.658, 'train_loss': 21.29247240704341, 'epoch': 4.0}


100%|██████████| 141/141 [01:08<00:00,  2.07it/s]
[32m[I 2023-05-06 15:15:08,774][0m Trial 7 finished with value: 0.3876585364341736 and parameters: {'band_width': 0.23319315914880775}. Best is trial 6 with value: 0.3824400305747986.[0m


{'test_loss': 46.284400939941406, 'test_mse': 0.15031331777572632, 'test_rmse': 0.3876585364341736, 'test_runtime': 68.5358, 'test_samples_per_second': 65.659, 'test_steps_per_second': 2.057}
--------------------
bw= 0.14333165606190812
--------------------


Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 1000/300720 [12

{'loss': 32.5273, 'learning_rate': 9.966746475126364e-06, 'epoch': 0.1}


  1%|          | 2000/300720 [25:33<64:19:39,  1.29it/s]

{'loss': 28.2868, 'learning_rate': 9.933492950252728e-06, 'epoch': 0.2}


  1%|          | 3000/300720 [38:19<64:08:10,  1.29it/s]

{'loss': 27.4013, 'learning_rate': 9.900239425379091e-06, 'epoch': 0.3}


  1%|▏         | 4000/300720 [51:07<62:29:57,  1.32it/s]

{'loss': 25.2494, 'learning_rate': 9.866985900505454e-06, 'epoch': 0.4}


  2%|▏         | 5000/300720 [1:03:55<61:45:10,  1.33it/s]

{'loss': 25.1116, 'learning_rate': 9.833732375631819e-06, 'epoch': 0.5}


  2%|▏         | 6000/300720 [1:16:42<61:26:31,  1.33it/s]

{'loss': 24.6087, 'learning_rate': 9.800478850758181e-06, 'epoch': 0.6}


  2%|▏         | 7000/300720 [1:29:34<63:12:58,  1.29it/s]

{'loss': 24.3778, 'learning_rate': 9.767225325884544e-06, 'epoch': 0.7}


  3%|▎         | 8000/300720 [1:42:28<63:06:25,  1.29it/s]

{'loss': 24.2293, 'learning_rate': 9.733971801010907e-06, 'epoch': 0.8}


  3%|▎         | 9000/300720 [1:55:23<61:00:34,  1.33it/s]

{'loss': 23.2243, 'learning_rate': 9.70071827613727e-06, 'epoch': 0.9}


  3%|▎         | 10000/300720 [2:08:17<63:30:30,  1.27it/s]

{'loss': 23.9782, 'learning_rate': 9.667464751263635e-06, 'epoch': 1.0}


  3%|▎         | 10024/300720 [2:08:35<52:37:39,  1.53it/s]
  3%|▎         | 10024/300720 [2:09:44<52:37:39,  1.53it/s]

{'eval_loss': 45.144527435302734, 'eval_mse': 0.15793555974960327, 'eval_rmse': 0.3973144590854645, 'eval_runtime': 68.5178, 'eval_samples_per_second': 65.676, 'eval_steps_per_second': 2.058, 'epoch': 1.0}


  4%|▎         | 11000/300720 [2:22:11<61:53:02,  1.30it/s]  

{'loss': 22.4183, 'learning_rate': 9.634211226389998e-06, 'epoch': 1.1}


  4%|▍         | 12000/300720 [2:34:55<61:54:33,  1.30it/s]

{'loss': 21.8374, 'learning_rate': 9.600957701516362e-06, 'epoch': 1.2}


  4%|▍         | 13000/300720 [2:47:41<60:20:03,  1.32it/s]

{'loss': 21.4679, 'learning_rate': 9.567704176642725e-06, 'epoch': 1.3}


  5%|▍         | 14000/300720 [3:00:28<61:03:59,  1.30it/s]

{'loss': 21.3712, 'learning_rate': 9.534450651769088e-06, 'epoch': 1.4}


  5%|▍         | 15000/300720 [3:13:13<62:41:53,  1.27it/s]

{'loss': 21.25, 'learning_rate': 9.501197126895452e-06, 'epoch': 1.5}


  5%|▌         | 16000/300720 [3:25:59<59:16:15,  1.33it/s]

{'loss': 21.0727, 'learning_rate': 9.467943602021815e-06, 'epoch': 1.6}


  6%|▌         | 17000/300720 [3:38:49<59:41:13,  1.32it/s]

{'loss': 21.3523, 'learning_rate': 9.434690077148178e-06, 'epoch': 1.7}


  6%|▌         | 18000/300720 [3:51:42<61:32:32,  1.28it/s]

{'loss': 20.8115, 'learning_rate': 9.401436552274543e-06, 'epoch': 1.8}


  6%|▋         | 19000/300720 [4:04:36<61:27:46,  1.27it/s]

{'loss': 22.0204, 'learning_rate': 9.368183027400906e-06, 'epoch': 1.9}


  7%|▋         | 20000/300720 [4:17:34<58:50:41,  1.33it/s]

{'loss': 21.0752, 'learning_rate': 9.334929502527269e-06, 'epoch': 2.0}


  7%|▋         | 20048/300720 [4:18:11<50:36:33,  1.54it/s]
  7%|▋         | 20048/300720 [4:19:21<50:36:33,  1.54it/s]

{'eval_loss': 41.928951263427734, 'eval_mse': 0.1460915505886078, 'eval_rmse': 0.38221901655197144, 'eval_runtime': 70.3041, 'eval_samples_per_second': 64.008, 'eval_steps_per_second': 2.006, 'epoch': 2.0}


  7%|▋         | 21000/300720 [4:31:29<58:07:44,  1.34it/s]  

{'loss': 18.2257, 'learning_rate': 9.301675977653633e-06, 'epoch': 2.09}


  7%|▋         | 22000/300720 [4:44:15<58:14:20,  1.33it/s]

{'loss': 18.3002, 'learning_rate': 9.268422452779996e-06, 'epoch': 2.19}


  8%|▊         | 23000/300720 [4:57:00<58:27:10,  1.32it/s]

{'loss': 18.4428, 'learning_rate': 9.235168927906359e-06, 'epoch': 2.29}


  8%|▊         | 24000/300720 [5:09:46<56:17:23,  1.37it/s]

{'loss': 18.1678, 'learning_rate': 9.201915403032722e-06, 'epoch': 2.39}


  8%|▊         | 25000/300720 [5:22:33<59:13:25,  1.29it/s]

{'loss': 19.2747, 'learning_rate': 9.168661878159085e-06, 'epoch': 2.49}


  9%|▊         | 26000/300720 [5:35:19<57:59:12,  1.32it/s]

{'loss': 18.2072, 'learning_rate': 9.13540835328545e-06, 'epoch': 2.59}


  9%|▉         | 27000/300720 [5:48:09<57:50:03,  1.31it/s]

{'loss': 18.7057, 'learning_rate': 9.102154828411812e-06, 'epoch': 2.69}


  9%|▉         | 28000/300720 [6:00:59<57:55:21,  1.31it/s]

{'loss': 18.6577, 'learning_rate': 9.068901303538175e-06, 'epoch': 2.79}


 10%|▉         | 29000/300720 [6:13:53<61:09:56,  1.23it/s]

{'loss': 18.3902, 'learning_rate': 9.03564777866454e-06, 'epoch': 2.89}


 10%|▉         | 30000/300720 [6:26:48<58:49:07,  1.28it/s]

{'loss': 19.0116, 'learning_rate': 9.002394253790902e-06, 'epoch': 2.99}


 10%|█         | 30072/300720 [6:27:44<51:07:07,  1.47it/s]
 10%|█         | 30072/300720 [6:28:56<51:07:07,  1.47it/s]

{'eval_loss': 45.67074203491211, 'eval_mse': 0.1617259532213211, 'eval_rmse': 0.4021416902542114, 'eval_runtime': 71.9906, 'eval_samples_per_second': 62.508, 'eval_steps_per_second': 1.959, 'epoch': 3.0}


 10%|█         | 31000/300720 [6:40:49<58:15:45,  1.29it/s]  

{'loss': 15.727, 'learning_rate': 8.969140728917265e-06, 'epoch': 3.09}


 11%|█         | 32000/300720 [6:53:35<59:25:29,  1.26it/s]

{'loss': 15.0817, 'learning_rate': 8.93588720404363e-06, 'epoch': 3.19}


 11%|█         | 33000/300720 [7:06:23<56:14:04,  1.32it/s]

{'loss': 14.9, 'learning_rate': 8.902633679169993e-06, 'epoch': 3.29}


 11%|█▏        | 34000/300720 [7:19:10<58:59:45,  1.26it/s]

{'loss': 15.586, 'learning_rate': 8.869380154296357e-06, 'epoch': 3.39}


 12%|█▏        | 35000/300720 [7:31:57<56:06:30,  1.32it/s]

{'loss': 15.2622, 'learning_rate': 8.83612662942272e-06, 'epoch': 3.49}


 12%|█▏        | 36000/300720 [7:44:45<56:37:17,  1.30it/s]

{'loss': 15.3952, 'learning_rate': 8.802873104549083e-06, 'epoch': 3.59}


 12%|█▏        | 37000/300720 [7:57:35<56:13:56,  1.30it/s]

{'loss': 15.4139, 'learning_rate': 8.769619579675448e-06, 'epoch': 3.69}


 13%|█▎        | 38000/300720 [8:10:26<55:52:54,  1.31it/s]

{'loss': 15.2104, 'learning_rate': 8.736366054801809e-06, 'epoch': 3.79}


 13%|█▎        | 39000/300720 [8:23:19<56:17:40,  1.29it/s]

{'loss': 15.2518, 'learning_rate': 8.703112529928173e-06, 'epoch': 3.89}


 13%|█▎        | 40000/300720 [8:36:15<54:01:00,  1.34it/s]

{'loss': 15.0907, 'learning_rate': 8.669859005054536e-06, 'epoch': 3.99}


 13%|█▎        | 40096/300720 [8:37:29<48:55:40,  1.48it/s]
 13%|█▎        | 40096/300720 [8:38:41<48:55:40,  1.48it/s]

{'eval_loss': 45.229793548583984, 'eval_mse': 0.1601008176803589, 'eval_rmse': 0.4000428318977356, 'eval_runtime': 71.8228, 'eval_samples_per_second': 62.654, 'eval_steps_per_second': 1.963, 'epoch': 4.0}


 13%|█▎        | 40096/300720 [8:38:42<56:11:38,  1.29it/s]


{'train_runtime': 31122.9243, 'train_samples_per_second': 309.167, 'train_steps_per_second': 9.662, 'train_loss': 20.28556605443323, 'epoch': 4.0}


100%|██████████| 141/141 [01:08<00:00,  2.06it/s]
[32m[I 2023-05-06 23:55:02,915][0m Trial 8 finished with value: 0.38221901655197144 and parameters: {'band_width': 0.14333165606190812}. Best is trial 8 with value: 0.38221901655197144.[0m


{'test_loss': 41.928951263427734, 'test_mse': 0.1460915505886078, 'test_rmse': 0.38221901655197144, 'test_runtime': 68.8, 'test_samples_per_second': 65.407, 'test_steps_per_second': 2.049}
--------------------
bw= 0.32142504978306385
--------------------


Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 1000/300720 [12

{'loss': 35.9487, 'learning_rate': 9.966746475126364e-06, 'epoch': 0.1}


  1%|          | 2000/300720 [25:35<63:21:02,  1.31it/s]

{'loss': 30.8871, 'learning_rate': 9.933492950252728e-06, 'epoch': 0.2}


  1%|          | 3000/300720 [38:23<64:06:19,  1.29it/s]

{'loss': 29.809, 'learning_rate': 9.900239425379091e-06, 'epoch': 0.3}


  1%|▏         | 4000/300720 [51:11<62:51:04,  1.31it/s]

{'loss': 27.6653, 'learning_rate': 9.866985900505454e-06, 'epoch': 0.4}


  2%|▏         | 5000/300720 [1:04:00<62:22:23,  1.32it/s]

{'loss': 27.5143, 'learning_rate': 9.833732375631819e-06, 'epoch': 0.5}


  2%|▏         | 6000/300720 [1:16:48<61:43:13,  1.33it/s]

{'loss': 26.4191, 'learning_rate': 9.800478850758181e-06, 'epoch': 0.6}


  2%|▏         | 7000/300720 [1:29:39<63:18:43,  1.29it/s]

{'loss': 27.0422, 'learning_rate': 9.767225325884544e-06, 'epoch': 0.7}


  3%|▎         | 8000/300720 [1:42:33<62:48:35,  1.29it/s]

{'loss': 26.4311, 'learning_rate': 9.733971801010907e-06, 'epoch': 0.8}


  3%|▎         | 9000/300720 [1:55:28<61:17:38,  1.32it/s]

{'loss': 25.4178, 'learning_rate': 9.70071827613727e-06, 'epoch': 0.9}


  3%|▎         | 10000/300720 [2:08:22<62:57:47,  1.28it/s]

{'loss': 26.0005, 'learning_rate': 9.667464751263635e-06, 'epoch': 1.0}


  3%|▎         | 10024/300720 [2:08:40<53:29:52,  1.51it/s]
  3%|▎         | 10024/300720 [2:09:49<53:29:52,  1.51it/s]

{'eval_loss': 50.0517692565918, 'eval_mse': 0.15135803818702698, 'eval_rmse': 0.3889555335044861, 'eval_runtime': 68.5961, 'eval_samples_per_second': 65.601, 'eval_steps_per_second': 2.056, 'epoch': 1.0}


  4%|▎         | 11000/300720 [2:22:16<61:02:51,  1.32it/s]  

{'loss': 24.439, 'learning_rate': 9.634211226389998e-06, 'epoch': 1.1}


  4%|▍         | 12000/300720 [2:35:01<62:11:41,  1.29it/s]

{'loss': 23.9412, 'learning_rate': 9.600957701516362e-06, 'epoch': 1.2}


  4%|▍         | 13000/300720 [2:47:46<60:37:11,  1.32it/s]

{'loss': 23.5808, 'learning_rate': 9.567704176642725e-06, 'epoch': 1.3}


  5%|▍         | 14000/300720 [3:00:31<60:18:43,  1.32it/s]

{'loss': 23.2597, 'learning_rate': 9.534450651769088e-06, 'epoch': 1.4}


  5%|▍         | 15000/300720 [3:13:17<62:45:10,  1.26it/s]

{'loss': 23.5821, 'learning_rate': 9.501197126895452e-06, 'epoch': 1.5}


  5%|▌         | 16000/300720 [3:26:05<59:09:37,  1.34it/s]

{'loss': 23.5169, 'learning_rate': 9.467943602021815e-06, 'epoch': 1.6}


  6%|▌         | 17000/300720 [3:38:55<59:38:36,  1.32it/s]

{'loss': 23.204, 'learning_rate': 9.434690077148178e-06, 'epoch': 1.7}


  6%|▌         | 18000/300720 [3:51:49<61:34:17,  1.28it/s]

{'loss': 23.0102, 'learning_rate': 9.401436552274543e-06, 'epoch': 1.8}


  6%|▋         | 19000/300720 [4:04:43<61:48:42,  1.27it/s]

{'loss': 24.0123, 'learning_rate': 9.368183027400906e-06, 'epoch': 1.9}


  7%|▋         | 20000/300720 [4:17:41<58:20:08,  1.34it/s]

{'loss': 23.4043, 'learning_rate': 9.334929502527269e-06, 'epoch': 2.0}


  7%|▋         | 20048/300720 [4:18:18<50:25:15,  1.55it/s]
  7%|▋         | 20048/300720 [4:19:28<50:25:15,  1.55it/s]

{'eval_loss': 49.61272048950195, 'eval_mse': 0.15236149728298187, 'eval_rmse': 0.39029252529144287, 'eval_runtime': 69.6788, 'eval_samples_per_second': 64.582, 'eval_steps_per_second': 2.024, 'epoch': 2.0}


  7%|▋         | 21000/300720 [4:31:38<58:31:50,  1.33it/s]  

{'loss': 20.3114, 'learning_rate': 9.301675977653633e-06, 'epoch': 2.09}


  7%|▋         | 22000/300720 [4:44:26<58:36:52,  1.32it/s]

{'loss': 20.5302, 'learning_rate': 9.268422452779996e-06, 'epoch': 2.19}


  8%|▊         | 23000/300720 [4:57:13<59:06:51,  1.31it/s]

{'loss': 20.108, 'learning_rate': 9.235168927906359e-06, 'epoch': 2.29}


  8%|▊         | 24000/300720 [5:10:02<56:21:18,  1.36it/s]

{'loss': 20.12, 'learning_rate': 9.201915403032722e-06, 'epoch': 2.39}


  8%|▊         | 25000/300720 [5:22:51<59:09:32,  1.29it/s]

{'loss': 21.4336, 'learning_rate': 9.168661878159085e-06, 'epoch': 2.49}


  9%|▊         | 26000/300720 [5:35:40<58:22:13,  1.31it/s]

{'loss': 19.8246, 'learning_rate': 9.13540835328545e-06, 'epoch': 2.59}


  9%|▉         | 27000/300720 [5:48:31<57:54:18,  1.31it/s]

{'loss': 20.9188, 'learning_rate': 9.102154828411812e-06, 'epoch': 2.69}


  9%|▉         | 28000/300720 [6:01:21<58:19:50,  1.30it/s]

{'loss': 20.4928, 'learning_rate': 9.068901303538175e-06, 'epoch': 2.79}


 10%|▉         | 29000/300720 [6:14:16<60:27:37,  1.25it/s]

{'loss': 20.576, 'learning_rate': 9.03564777866454e-06, 'epoch': 2.89}


 10%|▉         | 30000/300720 [6:27:11<59:26:23,  1.27it/s]

{'loss': 20.9234, 'learning_rate': 9.002394253790902e-06, 'epoch': 2.99}


 10%|█         | 30072/300720 [6:28:07<51:01:06,  1.47it/s]
 10%|█         | 30072/300720 [6:29:19<51:01:06,  1.47it/s]

{'eval_loss': 52.12964630126953, 'eval_mse': 0.16255101561546326, 'eval_rmse': 0.4031730890274048, 'eval_runtime': 72.0343, 'eval_samples_per_second': 62.47, 'eval_steps_per_second': 1.957, 'epoch': 3.0}


 10%|█         | 31000/300720 [6:41:13<58:16:03,  1.29it/s]  

{'loss': 17.6252, 'learning_rate': 8.969140728917265e-06, 'epoch': 3.09}


 11%|█         | 32000/300720 [6:53:59<58:22:58,  1.28it/s]

{'loss': 16.6024, 'learning_rate': 8.93588720404363e-06, 'epoch': 3.19}


 11%|█         | 33000/300720 [7:06:49<56:30:10,  1.32it/s]

{'loss': 16.908, 'learning_rate': 8.902633679169993e-06, 'epoch': 3.29}


 11%|█▏        | 34000/300720 [7:19:36<59:15:47,  1.25it/s]

{'loss': 17.4226, 'learning_rate': 8.869380154296357e-06, 'epoch': 3.39}


 12%|█▏        | 35000/300720 [7:32:25<56:02:12,  1.32it/s]

{'loss': 16.9641, 'learning_rate': 8.83612662942272e-06, 'epoch': 3.49}


 12%|█▏        | 36000/300720 [7:45:13<56:30:04,  1.30it/s]

{'loss': 17.2431, 'learning_rate': 8.802873104549083e-06, 'epoch': 3.59}


 12%|█▏        | 37000/300720 [7:58:05<56:14:53,  1.30it/s]

{'loss': 17.4078, 'learning_rate': 8.769619579675448e-06, 'epoch': 3.69}


 13%|█▎        | 38000/300720 [8:10:58<56:07:52,  1.30it/s]

{'loss': 17.0988, 'learning_rate': 8.736366054801809e-06, 'epoch': 3.79}


 13%|█▎        | 39000/300720 [8:23:52<56:18:04,  1.29it/s]

{'loss': 17.0858, 'learning_rate': 8.703112529928173e-06, 'epoch': 3.89}


 13%|█▎        | 40000/300720 [8:36:48<53:57:32,  1.34it/s]

{'loss': 16.8279, 'learning_rate': 8.669859005054536e-06, 'epoch': 3.99}


 13%|█▎        | 40096/300720 [8:38:03<49:00:30,  1.48it/s]
 13%|█▎        | 40096/300720 [8:39:15<49:00:30,  1.48it/s]

{'eval_loss': 52.15571975708008, 'eval_mse': 0.16155248880386353, 'eval_rmse': 0.40188953280448914, 'eval_runtime': 71.9404, 'eval_samples_per_second': 62.552, 'eval_steps_per_second': 1.96, 'epoch': 4.0}


 13%|█▎        | 40096/300720 [8:39:16<56:15:18,  1.29it/s]


{'train_runtime': 31156.7562, 'train_samples_per_second': 308.831, 'train_steps_per_second': 9.652, 'train_loss': 22.372688893213905, 'epoch': 4.0}


100%|██████████| 141/141 [01:08<00:00,  2.06it/s]
[32m[I 2023-05-07 08:35:30,895][0m Trial 9 finished with value: 0.39029252529144287 and parameters: {'band_width': 0.32142504978306385}. Best is trial 8 with value: 0.38221901655197144.[0m


{'test_loss': 49.61272048950195, 'test_mse': 0.15236149728298187, 'test_rmse': 0.39029252529144287, 'test_runtime': 68.77, 'test_samples_per_second': 65.436, 'test_steps_per_second': 2.05}


In [17]:
study.best_params

{'band_width': 0.14333165606190812}